diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index aadf36180b1..4ac0920c729 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -30,13 +30,6 @@ subprojects: - "optimize-unit-test-PR-test" - "Genreate-OptimizeUT-Report" - - id: "Neural Speed Unit Test workflow" - paths: - - .github/workflows/unit-test-neuralspeed.yml - - ".github/workflows/script/unitTest/run_unit_test_neuraspeed.sh" - checks: - - "neural-speed-unit-test" - - id: "NeuralChat Unit Test" paths: - ".github/workflows/unit-test-neuralchat.yml" diff --git a/.github/workflows/Scaner_BDBA.yaml b/.github/workflows/Scaner_BDBA.yaml index 1a84a54bf81..53fce282267 100644 --- a/.github/workflows/Scaner_BDBA.yaml +++ b/.github/workflows/Scaner_BDBA.yaml @@ -3,7 +3,6 @@ name: Scanner BDBA on: workflow_dispatch: -permissions: write-all jobs: bdba_job: name: BDBA Scan diff --git a/.github/workflows/Scaner_Coverity.yaml b/.github/workflows/Scaner_Coverity.yaml index 8a89ebbc808..a2ee1363a75 100644 --- a/.github/workflows/Scaner_Coverity.yaml +++ b/.github/workflows/Scaner_Coverity.yaml @@ -3,7 +3,6 @@ name: Scanner Coverity PYTHON on: workflow_dispatch: -permissions: write-all jobs: coverity_job: name: Coverity diff --git a/.github/workflows/Scaner_Trivy.yaml b/.github/workflows/Scaner_Trivy.yaml index 70dbb788016..cebddadb656 100644 --- a/.github/workflows/Scaner_Trivy.yaml +++ b/.github/workflows/Scaner_Trivy.yaml @@ -2,7 +2,6 @@ name: Trivy Scan for Containers on: workflow_dispatch: -permissions: write-all jobs: trivy_container_job: uses: "intel-innersource/frameworks.ai.infrastructure.code-scan-tools/.github/workflows/Scanner_Trivy.yml@one-ci-cd" diff --git a/.github/workflows/build-container.yaml b/.github/workflows/build-container.yaml index 332484c2bc3..4285231e3be 100644 --- a/.github/workflows/build-container.yaml +++ b/.github/workflows/build-container.yaml @@ -3,7 +3,9 @@ on: workflow_dispatch: # Can be manually executed schedule: # 1/week Sunday at 07:00AM - cron: "5 7 * * 
0" -permissions: write-all +permissions: + contents: read + jobs: build: container: # MLOps Dev container for Compose Automation diff --git a/.github/workflows/chatbot-finetune-mpt-7b-chat-hpu.yml b/.github/workflows/chatbot-finetune-mpt-7b-chat-hpu.yml index 3ffac7f3131..0eeb04e6fe7 100644 --- a/.github/workflows/chatbot-finetune-mpt-7b-chat-hpu.yml +++ b/.github/workflows/chatbot-finetune-mpt-7b-chat-hpu.yml @@ -6,7 +6,9 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-ft-mpt-7b-hpu cancel-in-progress: true -permissions: write-all +permissions: + contents: read + jobs: finetuning: name: finetuning test diff --git a/.github/workflows/chatbot-finetune-mpt-7b-chat.yml b/.github/workflows/chatbot-finetune-mpt-7b-chat.yml index 398411d47a1..b7a0bb28dc7 100644 --- a/.github/workflows/chatbot-finetune-mpt-7b-chat.yml +++ b/.github/workflows/chatbot-finetune-mpt-7b-chat.yml @@ -6,7 +6,9 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-ft-mpt-7b cancel-in-progress: true -permissions: write-all +permissions: + contents: read + jobs: finetuning: name: finetuning test diff --git a/.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml b/.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml index b606c5ee138..61f1773cd4d 100644 --- a/.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml +++ b/.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml @@ -6,7 +6,9 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-lla-7b cancel-in-progress: true -permissions: write-all +permissions: + contents: read + jobs: inference: name: inference test diff --git a/.github/workflows/chatbot-inference-llama-2-7b_70b-chat-hf-hpu.yml b/.github/workflows/chatbot-inference-llama-2-7b_70b-chat-hf-hpu.yml index 1627cb9bd74..0ddb3cc3b89 100644 --- a/.github/workflows/chatbot-inference-llama-2-7b_70b-chat-hf-hpu.yml +++ 
b/.github/workflows/chatbot-inference-llama-2-7b_70b-chat-hf-hpu.yml @@ -6,7 +6,9 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-lla-7b-hpu cancel-in-progress: true -permissions: write-all +permissions: + contents: read + jobs: inference: name: inference test diff --git a/.github/workflows/chatbot-inference-mpt-7b-chat-hpu.yml b/.github/workflows/chatbot-inference-mpt-7b-chat-hpu.yml index b7974c5db9f..02c7eb328f9 100644 --- a/.github/workflows/chatbot-inference-mpt-7b-chat-hpu.yml +++ b/.github/workflows/chatbot-inference-mpt-7b-chat-hpu.yml @@ -6,7 +6,9 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-mpt-7b-hpu cancel-in-progress: true -permissions: write-all +permissions: + contents: read + jobs: inference: name: inference test diff --git a/.github/workflows/chatbot-inference-mpt-7b-chat.yml b/.github/workflows/chatbot-inference-mpt-7b-chat.yml index 96135ec5b33..bf0601615f9 100644 --- a/.github/workflows/chatbot-inference-mpt-7b-chat.yml +++ b/.github/workflows/chatbot-inference-mpt-7b-chat.yml @@ -6,7 +6,9 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-mpt-7b cancel-in-progress: true -permissions: write-all +permissions: + contents: read + jobs: inference: name: inference test diff --git a/.github/workflows/chatbot-test.yml b/.github/workflows/chatbot-test.yml index cd4a8e8dd32..64e16a9cf41 100644 --- a/.github/workflows/chatbot-test.yml +++ b/.github/workflows/chatbot-test.yml @@ -27,7 +27,6 @@ on: - '!intel_extension_for_transformers/neural_chat/README.md' workflow_dispatch: -permissions: write-all # If there is a new commit, the previous jobs will be canceled concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/chatbot_finetuning.yml b/.github/workflows/chatbot_finetuning.yml index 
ead32180106..e93355232f1 100644 --- a/.github/workflows/chatbot_finetuning.yml +++ b/.github/workflows/chatbot_finetuning.yml @@ -7,7 +7,6 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -permissions: write-all jobs: call-finetune-mpt-7b-chat: uses: ./.github/workflows/chatbot-finetune-mpt-7b-chat.yml diff --git a/.github/workflows/deploy-test.yml b/.github/workflows/deploy-test.yml index d7d33c6b9cb..b4625842967 100644 --- a/.github/workflows/deploy-test.yml +++ b/.github/workflows/deploy-test.yml @@ -7,7 +7,6 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -permissions: write-all env: OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/script/models SCRIPT_PATH: /intel-extension-for-transformers/.github/workflows/script @@ -19,8 +18,15 @@ env: EXTRA_CONTAINER_NAME: "utTest" EXTRA_CONTAINER_NAME2: "codeScan" +permissions: + contents: read + jobs: Deploy-Workflow: + permissions: + actions: read # for dawidd6/action-download-artifact to query and download artifacts + contents: read # for actions/checkout to fetch code + pull-requests: read # for dawidd6/action-download-artifact to query commit hash runs-on: itrex-node strategy: matrix: @@ -128,6 +134,10 @@ jobs: retention-days: 60 # 1 <= retention-days <= 90 Genreate-Report: + permissions: + actions: read # for dawidd6/action-download-artifact to query and download artifacts + contents: read # for actions/checkout to fetch code + pull-requests: read # for dawidd6/action-download-artifact to query commit hash runs-on: itrex-node-spell needs: [Deploy-Workflow] steps: diff --git a/.github/workflows/format_scan.yml b/.github/workflows/format_scan.yml index 01209afd5e3..db3f280c33d 100644 --- a/.github/workflows/format_scan.yml +++ b/.github/workflows/format_scan.yml @@ -11,7 +11,6 @@ on: - .github/workflows/format_scan.yml - 
.github/workflows/script/formatScan/** workflow_dispatch: -permissions: write-all # If there is a new commit, the previous jobs will be canceled concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -24,6 +23,9 @@ env: DOCKER_FILE_NAME: "codeScan" CONTAINER_NAME: "codeScan" +permissions: + contents: read + jobs: format-scan: runs-on: itrex-node-spell diff --git a/.github/workflows/llm-test.yml b/.github/workflows/llm-test.yml index 02b8e484b5f..cdf4374f1e9 100644 --- a/.github/workflows/llm-test.yml +++ b/.github/workflows/llm-test.yml @@ -13,7 +13,6 @@ on: - "!intel_extension_for_transformers/transformers/runtime/third_party/**" - "!intel_extension_for_transformers/transformers/runtime/docs/**" workflow_dispatch: -permissions: write-all # If there is a new commit, the previous jobs will be canceled concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -26,6 +25,9 @@ env: EXTRA_CONTAINER_NAME: "codeScan" +permissions: + contents: read + jobs: LLM-Workflow: runs-on: spr @@ -50,6 +52,7 @@ jobs: - name: Binary build run: | cd ${{ github.workspace }} + source ~/.bashrc conda activate llm-test || source activate llm-test compiler_version=11.1.0 conda install --update-deps -c conda-forge gxx==${compiler_version} gcc==${compiler_version} gxx_linux-64==${compiler_version} libstdcxx-ng sysroot_linux-64 libxcrypt -y @@ -83,6 +86,10 @@ jobs: retention-days: 60 # 1 <= retention-days <= 90 Generate-LLM-Report: + permissions: + actions: read # for dawidd6/action-download-artifact to query and download artifacts + contents: read # for actions/checkout to fetch code + pull-requests: read # for dawidd6/action-download-artifact to query commit hash runs-on: itrex-node-spell needs: [LLM-Workflow] steps: diff --git a/.github/workflows/optimize-test.yml b/.github/workflows/optimize-test.yml index b0bf5922146..527c4a1a53f 100644 --- a/.github/workflows/optimize-test.yml +++ 
b/.github/workflows/optimize-test.yml @@ -7,7 +7,6 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -permissions: write-all env: OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/script/models SCRIPT_PATH: /intel-extension-for-transformers/.github/workflows/script @@ -20,8 +19,15 @@ env: EXTRA_CONTAINER_NAME2: "codeScan" +permissions: + contents: read + jobs: Optimize-Workflow: + permissions: + actions: read # for dawidd6/action-download-artifact to query and download artifacts + contents: read # for actions/checkout to fetch code + pull-requests: read # for dawidd6/action-download-artifact to query commit hash runs-on: itrex-node strategy: matrix: @@ -133,6 +139,10 @@ jobs: retention-days: 60 # 1 <= retention-days <= 90 Genreate-Report: + permissions: + actions: read # for dawidd6/action-download-artifact to query and download artifacts + contents: read # for actions/checkout to fetch code + pull-requests: read # for dawidd6/action-download-artifact to query commit hash runs-on: itrex-node-spell needs: [Optimize-Workflow] steps: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 2d5427a01bd..34a01a10a36 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -5,10 +5,14 @@ on: branches: - main workflow_dispatch: -permissions: write-all +permissions: + contents: read + jobs: build: + permissions: + contents: write # for peaceiris/actions-gh-pages to push pages branch runs-on: ubuntu-latest steps: diff --git a/.github/workflows/script/launch_llm.sh b/.github/workflows/script/launch_llm.sh index 7e8dc3aebd2..74afd6738c3 100644 --- a/.github/workflows/script/launch_llm.sh +++ b/.github/workflows/script/launch_llm.sh @@ -23,7 +23,7 @@ function main() { fi # init conda - #. 
$(dirname ${CONDA_EXE})/../etc/profile.d/conda.sh + source ~/.bashrc conda activate $conda_env || source activate $conda_env # env diff --git a/.github/workflows/script/models/run_llm.sh b/.github/workflows/script/models/run_llm.sh index 30369b9543e..4ef62af935e 100644 --- a/.github/workflows/script/models/run_llm.sh +++ b/.github/workflows/script/models/run_llm.sh @@ -36,9 +36,9 @@ main() { } function prepare() { - [[ -d ${HOME}/anaconda3/bin ]] && export PATH=${HOME}/anaconda3/bin/:$PATH - [[ -d ${HOME}/miniconda3/bin ]] && export PATH=${HOME}/miniconda3/bin/:$PATH - export LD_LIBRARY_PATH=/lib64/libcrypto.so.1.1:${HOME}/miniconda3/envs/${conda_env_name}/lib/:$LD_LIBRARY_PATH + source ~/.bashrc + source activate ${conda_env_name} || conda activate ${conda_env_name} + export LD_LIBRARY_PATH=/lib64/libcrypto.so.1.1:${CONDA_PREFIX}/lib/:$LD_LIBRARY_PATH if [[ ${precision} == "fp8" ]]; then export NE_WEIGHT_FP8_4E3M=1 fi @@ -46,7 +46,6 @@ function prepare() { working_dir="${WORKING_DIR}/examples/huggingface/pytorch/text-generation/deployment" fi $BOLD_YELLOW && echo "Running ---- ${framework}, ${model}----Prepare" - source activate ${conda_env_name} || conda activate ${conda_env_name} if [[ ${cpu} == *"spr"* ]] || [[ ${cpu} == *"SPR"* ]] || [[ ${cpu} == *"Spr"* ]]; then export CC=/opt/rh/gcc-toolset-11/root/usr/bin/gcc export CXX=/opt/rh/gcc-toolset-11/root/usr/bin/g++ @@ -56,8 +55,7 @@ function prepare() { echo "Working in ${working_dir}" echo -e "\nInstalling model requirements..." 
export PATH=/lib64/libcrypto.so.1.1:$PATH - cp /lib64/libcrypto.so.1.1 ${HOME}/miniconda3/envs/${conda_env_name}/lib/libcrypto.so.1.1 - cp /lib64/libcrypto.so.1.1 ${HOME}/miniconda3/lib/libcrypto.so.1.1 + cp /lib64/libcrypto.so.1.1 ${CONDA_PREFIX}/lib/libcrypto.so.1.1 if [ -f "requirements.txt" ]; then sed -i '/^transformers/d' requirements.txt n=0 diff --git a/.github/workflows/script/prepare_env_with_conda.sh b/.github/workflows/script/prepare_env_with_conda.sh index b6622a08837..7f5f6784a51 100644 --- a/.github/workflows/script/prepare_env_with_conda.sh +++ b/.github/workflows/script/prepare_env_with_conda.sh @@ -6,13 +6,6 @@ if [[ -z "${conda_env_name}" ]] || [[ -z "${python_version}" ]]; then exit 1 fi +source ~/.bashrc conda create -n ${conda_env_name} python=${python_version} -y source activate ${conda_env_name} || conda activate ${conda_env_name} -#pip install -U pip -# -#if [ -f "requirements.txt" ]; then -# python -m pip install --default-timeout=100 -r requirements.txt -# pip list -#else -# echo "Not found requirements.txt file." 
-#fi diff --git a/.github/workflows/sparse_lib_CI.yml b/.github/workflows/sparse_lib_CI.yml index d908f994518..4a66ad63d62 100644 --- a/.github/workflows/sparse_lib_CI.yml +++ b/.github/workflows/sparse_lib_CI.yml @@ -8,7 +8,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -permissions: write-all env: DOCKER_CONFIG_NAME: "commonDockerConfig" @@ -17,6 +16,9 @@ env: DOCKER_FILE_NAME: "unitTest" CONTAINER_NAME: "utTest" +permissions: + contents: read + jobs: sparselib: runs-on: itrex-node diff --git a/.github/workflows/trellix.yaml b/.github/workflows/trellix.yaml index ffed7d15e24..0a4a7a3ff71 100644 --- a/.github/workflows/trellix.yaml +++ b/.github/workflows/trellix.yaml @@ -3,7 +3,9 @@ name: Trellix Command Line Scanner on: workflow_dispatch: -permissions: write-all +permissions: + contents: read + jobs: Trellix: runs-on: inner-source diff --git a/.github/workflows/unit-test-engine.yml b/.github/workflows/unit-test-engine.yml index fef1f600514..d0045bac863 100644 --- a/.github/workflows/unit-test-engine.yml +++ b/.github/workflows/unit-test-engine.yml @@ -14,7 +14,6 @@ on: - "!intel_extension_for_transformers/transformers/runtime/third_party/**" - "!intel_extension_for_transformers/transformers/runtime/docs/**" workflow_dispatch: -permissions: write-all # If there is a new commit, the previous jobs will be canceled concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -29,6 +28,9 @@ env: EXTRA_CONTAINER_NAME: "modelTest" CONTAINER_SCAN: "codeScan" +permissions: + contents: read + jobs: engine-unit-test: runs-on: [self-hosted, linux, X64, itrex-node] diff --git a/.github/workflows/unit-test-kernel.yml b/.github/workflows/unit-test-kernel.yml index 75ba18a16c2..31829d87057 100644 --- a/.github/workflows/unit-test-kernel.yml +++ b/.github/workflows/unit-test-kernel.yml @@ -14,7 +14,6 @@ on: concurrency: group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }} cancel-in-progress: true -permissions: write-all env: DOCKER_CONFIG_NAME: "commonDockerConfig" REPO_NAME: "intel-extension-for-transformers" @@ -23,6 +22,9 @@ env: CONTAINER_NAME: "utTest" EXTRA_CONTAINER_NAME: "modelTest" +permissions: + contents: read + jobs: unit-test: runs-on: [self-hosted, linux, X64, itrex-node] diff --git a/.github/workflows/unit-test-neuralchat.yml b/.github/workflows/unit-test-neuralchat.yml index b5a13f9686b..f2a6e15b2ac 100644 --- a/.github/workflows/unit-test-neuralchat.yml +++ b/.github/workflows/unit-test-neuralchat.yml @@ -25,7 +25,6 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -permissions: write-all env: DOCKER_CONFIG_NAME: "commonDockerConfig" REPO_NAME: "intel-extension-for-transformers" @@ -36,6 +35,9 @@ env: CONTAINER_SCAN: "codeScan" GOOGLE_API_KEY: ${{ vars.GOOGLE_API_KEY }} +permissions: + contents: read + jobs: neuralchat-unit-test: runs-on: [self-hosted, Linux, X64, itrex-node] diff --git a/.github/workflows/unit-test-neuralspeed.yml b/.github/workflows/unit-test-neuralspeed.yml index f140c9601e9..78d41a4dc70 100644 --- a/.github/workflows/unit-test-neuralspeed.yml +++ b/.github/workflows/unit-test-neuralspeed.yml @@ -12,7 +12,6 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -permissions: write-all env: DOCKER_CONFIG_NAME: "commonDockerConfig" REPO_NAME: "intel-extension-for-transformers" @@ -21,6 +20,9 @@ env: CONTAINER_NAME: "utTest" EXTRA_CONTAINER_NAME: "modelTest" +permissions: + contents: read + jobs: neural-speed-unit-test: runs-on: [self-hosted, linux, X64, llmruntime-node] diff --git a/.github/workflows/unit-test-optimize.yml b/.github/workflows/unit-test-optimize.yml index d210c6a26c3..4d11947d92c 100644 --- a/.github/workflows/unit-test-optimize.yml +++ b/.github/workflows/unit-test-optimize.yml @@ 
-23,7 +23,6 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -permissions: write-all env: DOCKER_CONFIG_NAME: "commonDockerConfig" REPO_NAME: "intel-extension-for-transformers" @@ -33,6 +32,9 @@ env: EXTRA_CONTAINER_NAME: "modelTest" CONTAINER_SCAN: "codeScan" +permissions: + contents: read + jobs: optimize-unit-test: runs-on: [self-hosted, Linux, X64, itrex-node] diff --git a/.github/workflows/windows-test.yml b/.github/workflows/windows-test.yml index 1905f5bb57a..f08d1059d47 100644 --- a/.github/workflows/windows-test.yml +++ b/.github/workflows/windows-test.yml @@ -14,7 +14,6 @@ on: - "!intel_extension_for_transformers/transformers/runtime/test/**" - "!intel_extension_for_transformers/qbits/qbits_ut/**" workflow_dispatch: -permissions: write-all # If there is a new commit, the previous jobs will be canceled concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -24,6 +23,9 @@ env: SCRIPT_PATH: ${{ github.workspace }}\.github\workflows\script WORKING_DIR: ${{ github.workspace }} +permissions: + contents: read + jobs: Windows-Binary-Test: runs-on: 'Windows' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 24ce3d4fe3f..da91235b092 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,8 +57,7 @@ repos: examples/huggingface/pytorch/text-generation/inference/prompt.json| intel_extension_for_transformers/transformers/runtime/oneDNN-THIRD-PARTY-PROGRAMS| docker/intel-tensorflow-third-party-programs.txt| - .github/workflows/sample_data/alpaca_data_sample_45.json| - intel_extension_for_transformers/neural_chat/assets/docs/4th\ Generation\ Intel®\ Xeon®\ Scalable\ Processors\ Product\ Specifications.html + .github/workflows/sample_data/alpaca_data_sample_45.json )$ - repo: https://github.com/Lucas-C/pre-commit-hooks diff --git a/SECURITY.md b/SECURITY.md index 71a71eff1b6..eb546f7f75a 100644 --- a/SECURITY.md 
+++ b/SECURITY.md @@ -1,3 +1,8 @@ +OpenSSF Badge +=============== + +## [OpenSSF Badge](https://www.bestpractices.dev/en/projects/9128) + Security Policy =============== diff --git a/docs/api_doc/optimization/optimizer.rst b/docs/api_doc/optimization/optimizer.rst deleted file mode 100644 index f4b31c471b9..00000000000 --- a/docs/api_doc/optimization/optimizer.rst +++ /dev/null @@ -1,7 +0,0 @@ -PyTorch Optimizer -============== - -.. autoapisummary:: - - intel_extension_for_transformers.transformers.optimizer.NoTrainerOptimizer - intel_extension_for_transformers.transformers.optimizer.Orchestrate_optimizer diff --git a/docs/contributors.md b/docs/contributors.md index 4b92fd91bd0..b4771da78c8 100644 --- a/docs/contributors.md +++ b/docs/contributors.md @@ -116,9 +116,20 @@ Liangliang-Ma + + Lucas Guimarães + + Siddhi Velankar + + Sergey Nesterov - + + + Srikanth Ramakrishna + + igeni +
diff --git a/docs/devcatalog.md b/docs/devcatalog.md index ab826d913d1..30f371489b2 100644 --- a/docs/devcatalog.md +++ b/docs/devcatalog.md @@ -99,7 +99,8 @@ raw_datasets = raw_datasets.map(lambda e: tokenizer(e['sentence'], truncation=Tr Documentation for API usage can be found [here](https://github.com/intel/intel-extension-for-transformers/tree/main/docs) ```python -from intel_extension_for_transformers.transformers import QuantizationConfig, metrics, objectives +from intel_extension_for_transformers.transformers import metrics, objectives +from neural_compressor.config import PostTrainingQuantConfig from intel_extension_for_transformers.transformers.trainer import NLPTrainer # load config, model and metric config = AutoConfig.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english",num_labels=2) @@ -120,7 +121,9 @@ trainer = NLPTrainer(model=model, tokenizer=tokenizer ) # model quantization using trainer -q_config = QuantizationConfig(metrics=[metrics.Metric(name="eval_accuracy")]) +tune_metric = metrics.Metric(name="eval_accuracy") +trainer.metrics = tune_metric +q_config = PostTrainingQuantConfig() model = trainer.quantize(quant_config=q_config) # test sentiment analysis with quantization diff --git a/docs/distillation.md b/docs/distillation.md index 871bcc73b47..b74c5349f15 100644 --- a/docs/distillation.md +++ b/docs/distillation.md @@ -49,39 +49,20 @@ Where $D$ is a distance measurement as before, $F_t^{n_i}$ the output feature of ## usage ### Pytorch Script: ```python -from intel_extension_for_transformers.transformers import metric, objectives, DistillationConfig, Criterion + from intel_extension_for_transformers.transformers.trainer import NLPTrainer +from neural_compressor.config import DistillationConfig # Replace transformers.Trainer with NLPTrainer # trainer = transformers.Trainer(......) trainer = NLPTrainer(......) 
metric = metrics.Metric(name="eval_accuracy") -d_conf = DistillationConfig(metrics=tune_metric) -model = trainer.distill( - distillation_config=d_conf, teacher_model=teacher_model -) +trainer.metrics = metric +d_conf = DistillationConfig(teacher_model=teacher_model, criterion=criterion) +model = trainer.distill(distillation_config=d_conf) ``` Please refer to [example](../examples/huggingface/pytorch/text-classification/distillation/run_glue.py) for the details. -### Tensorflow Script: -```python -from intel_extension_for_transformers.transformers import (DistillationConfig, metrics) -from intel_extension_for_transformers.transformers.distillation import Criterion - -optimizer = TFOptimization(...) -metric_ = metrics.Metric(name="eval_accuracy") -criterion = Criterion(name='KnowledgeLoss', - layer_mappings=[['classifier', 'classifier']], - loss_types=['CE', 'CE'], - loss_weight_ratio=[0.5, 0.5], - add_origin_loss=False) -distillation_conf = DistillationConfig(metrics=metric_, - criterion=criterion) -distilled_model = optimizer.distill( - distillation_config=distillation_conf, - teacher_model=teacher_model) -``` -Please refer to [example](../examples/huggingface/tensorflow/text-classification/distillation/run_glue.py) for the details. ### Create an Instance of Metric The Metric defines which metric will be used to measure the performance of tuned models. - example: @@ -94,19 +75,23 @@ The Metric defines which metric will be used to measure the performance of tuned ### Create an Instance of Criterion(Optional) The criterion used in training phase. 
-- arguments: +- KnowledgeDistillationLossConfig arguments: |Argument |Type |Description |Default value | |:----------|:----------|:-----------------------------------------------|:----------------| - |name |String|Name of criterion, like:"KnowledgeLoss", "IntermediateLayersLoss" |"KnowledgeLoss"| |temperature|Float |parameter for KnowledgeDistillationLoss |1.0 | |loss_types|List of string|Type of loss |['CE', 'CE'] | |loss_weight_ratio|List of float|weight ratio of loss |[0.5, 0.5] | + +- IntermediateLayersKnowledgeDistillationLossConfig arguments: + |Argument |Type |Description |Default value | + |:----------|:----------|:-----------------------------------------------|:----------------| + |loss_types|List of string|Type of loss |['CE', 'CE'] | + |loss_weight_ratio|List of float|weight ratio of loss |[0.5, 0.5] | |layer_mappings|List|parameter for IntermediateLayersLoss |[] | |add_origin_loss|bool|parameter for IntermediateLayersLoss |False | - - example: ```python - criterion = Criterion(name='KnowledgeLoss') + criterion = KnowledgeDistillationLossConfig() ``` ### Create an Instance of DistillationConfig @@ -115,20 +100,18 @@ The DistillationConfig contains all the information related to the model distill - arguments: |Argument |Type |Description |Default value | |:----------|:----------|:-----------------------------------------------|:----------------| - |framework |string |which framework you used |"pytorch" | - |criterion|Criterion |criterion of training |"KnowledgeLoss"| - |metrics |Metric |Used to evaluate accuracy of tuning model, no need for NoTrainerOptimizer|None | + |teacher_model |torch.nn.Module | teacher model object |None | + |criterion|Criterion |criterion of training |KnowledgeLoss object| + - example: ```python - d_conf = DistillationConfig(metrics=metric, criterion=criterion) + d_conf = DistillationConfig(teacher_model=teacher_model, criterion=criterion) ``` ### Distill with Trainer - Distill with Trainer NLPTrainer inherits from 
transformers.Trainer, so you can create a trainer as in examples of Transformers. Then you can distill model with trainer.distill function. ```python - model = trainer.distill( - distillation_config=d_conf, teacher_model=teacher_model - ) + model = trainer.distill(distillation_config=d_conf) ``` diff --git a/docs/examples.md b/docs/examples.md index b6fe8b0e6af..5e833e9bbca 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -37,8 +37,8 @@ Intel Extension for Transformers is a powerful toolkit with multiple model optim Model Task Dataset - PostTrainingDynamic - PostTrainingStatic + dynamic + static @@ -177,7 +177,7 @@ Intel Extension for Transformers is a powerful toolkit with multiple model optim Model Task Dataset - QuantizationAwareTraining + qat No Trainer quantization @@ -206,7 +206,7 @@ Intel Extension for Transformers is a powerful toolkit with multiple model optim Model Task Dataset - PostTrainingStatic + static @@ -232,7 +232,7 @@ Intel Extension for Transformers is a powerful toolkit with multiple model optim Model Task Dataset - PostTrainingStatic + static diff --git a/docs/export.md b/docs/export.md index c0fda81a54a..619402e3b14 100644 --- a/docs/export.md +++ b/docs/export.md @@ -22,9 +22,9 @@ We support exporting PyTorch models into ONNX models with our well-designed API | Input Model | Export FP32 | Export BF16 | Export INT8 | | --- | --- | --- | --- | | FP32 PyTorch Model | ✔ | ✔ | / | -| INT8 PyTorch Model
(PostTrainingDynamic) | / | / | ✔ | -| INT8 PyTorch Model
(PostTrainingStatic) | / | / | ✔ | -| INT8 PyTorch Model
(QuantizationAwareTraining) | / | / | ✔ | +| INT8 PyTorch Model
(dynamic) | / | / | ✔ | +| INT8 PyTorch Model
(static) | / | / | ✔ | +| INT8 PyTorch Model
(qat) | / | / | ✔ | ## Examples diff --git a/docs/get_started.md b/docs/get_started.md index 492b505eeb8..ea807226f03 100644 --- a/docs/get_started.md +++ b/docs/get_started.md @@ -13,7 +13,7 @@ ## Quantization ```python -from intel_extension_for_transformers.transformers import QuantizationConfig, metrics, objectives +from neural_compressor.config import PostTrainingQuantConfig from intel_extension_for_transformers.transformers.trainer import NLPTrainer config = AutoConfig.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english",num_labels=2) @@ -27,7 +27,9 @@ trainer = NLPTrainer(model=model, eval_dataset=raw_datasets["validation"], tokenizer=tokenizer ) -q_config = QuantizationConfig(metrics=[metrics.Metric(name="eval_loss", greater_is_better=False)]) +q_config = PostTrainingQuantConfig( + approach="static", +) model = trainer.quantize(quant_config=q_config) input = tokenizer("I like Intel Extension for Transformers", return_tensors="pt") @@ -73,17 +75,17 @@ model = trainer.distill(distillation_config=d_conf, teacher_model=teacher_model) ## Quantized Length Adaptive Transformer Quantized Length Adaptive Transformer leverages sequence-length reduction and low-bit representation techniques to further enhance model inference performance, enabling adaptive sequence-length sizes to accommodate different computational budget requirements with an optimal accuracy efficiency tradeoff. ```python -from intel_extension_for_transformers.transformers import QuantizationConfig, DynamicLengthConfig, metric, objectives +from intel_extension_for_transformers.transformers import DynamicLengthConfig, metrics, objectives +from neural_compressor.config import PostTrainingQuantConfig from intel_extension_for_transformers.transformers.trainer import NLPTrainer # Replace transformers.Trainer with NLPTrainer # trainer = transformers.Trainer(...) trainer = NLPTrainer(...) 
metric = metrics.Metric(name="eval_f1", is_relative=True, criterion=0.01) -q_config = QuantizationConfig( - approach="PostTrainingStatic", - metrics=[metric], - objectives=[objectives.performance] +trainer.metrics = metric +q_config = PostTrainingQuantConfig( + approach="static" ) # Apply the length config dynamic_length_config = DynamicLengthConfig(length_config=length_config) diff --git a/docs/pruning.md b/docs/pruning.md index 3e5b46ae136..f5909bb2008 100644 --- a/docs/pruning.md +++ b/docs/pruning.md @@ -7,32 +7,23 @@ Pruning ## Introduction Pruning is the process of removing redundant parameters of a network. The idea bears similarity to the ["optimal brain damage"](http://yann.lecun.com/exdb/publis/pdf/lecun-90b.pdf) hypothesis by Yann LeCun. There are two types of pruning: Unstructured and Structured. Unstructured pruning means finding and removing the less salient connection in the model, the place could be anywhere in the matrix. Structured pruning means deleting entire blocks, filters, or channels. -## Pruning types - -There are three pruning types in Intel® Extension for Transformers: - -- Magnitude (Unstructured) - - The algorithm prunes the weight by the lowest absolute value at each layer with a given sparsity target. - -- Group Lasso (Structured) - - The algorithm uses Group lasso regularization to prune entire rows, columns, or blocks of parameters that result in a smaller dense network. - -- Pattern Lock (Unstructured & Structured) - - The algorithm locks the sparsity pattern in fine tune phase by freezing those zero values of the weight tensor during the weight update of training. 
- ## Usage ### Script: ```python -from intel_extension_for_transformers.transformers import metrics, objectives, PrunerConfig, PruningConfig, +from intel_extension_for_transformers.transformers import metrics +from neural_compressor.config import WeightPruningConfig from intel_extension_for_transformers.transformers.trainer import NLPTrainer # Replace transformers.Trainer with NLPTrainer # trainer = transformers.Trainer(......) trainer = NLPTrainer(......) metric = metrics.Metric(name="eval_accuracy") -pruner_config = PrunerConfig(prune_type='BasicMagnitude', target_sparsity_ratio=0.9) -p_conf = PruningConfig(pruner_config=[pruner_config], metrics=metric) -model = trainer.prune(pruning_config=p_conf) +trainer.metrics = metric +pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=0.9, + pruning_scope="local", + pruning_type="magnitude") +model = trainer.prune(pruning_config=pruning_conf) ``` Please refer to [example](../examples/huggingface/pytorch/text-classification/pruning) for the details. @@ -45,41 +36,27 @@ The Metric defines which metric will be used to measure the performance of tuned Please refer to [metrics document](metrics.md) for the details. -### Create list of an instance of PrunerConfig(Optional) -PrunerConfig defines which pruning algorithm to use and how to apply it during the training process. Intel® Extension for Transformers supports pruning types "BasicMagnitude", "PatternLock", and "GroupLasso". You can create different pruners for different layers. +### Create an instance of WeightPruningConfig +[WeightPruningConfig](https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py) defines which pruning algorithm to use and how to apply it during the training process. Intel® Extension for Transformers supports pruning types "magnitude", "pattern_lock", and "GroupLasso". You can create different pruners for different layers. 
- arguments: |Argument |Type |Description |Default value | |:----------|:----------|:-----------------------------------------------|:----------------| - |epoch_range|list of integer|Which epochs to pruning |[0, 4] | - |initial_sparsity_ratio|float |Initial sparsity goal |0.0 | - |target_sparsity_ratio|float |Target sparsity goal |0.97 | + |pruning_configs |list of dicts|Per-pruner settings, e.g. the start_step and end_step of pruning |[{}] | + |target_sparsity |float |Target sparsity goal |0.90 | |update_frequency|integer|Frequency to updating sparsity |1 | - |prune_type|string|Pruning algorithm |'BasicMagnitude' | - |method|string|Pruning method |'per_tensor' | - |names|list of string|List of weight name to be pruned. If no weight is specified, all weights of the model will be pruned|[]| - |parameters|dict of string|The hyper-parameters for pruning, refer to [the link](https://github.com/intel/neural-compressor/blob/master/docs/source/pruning.md)|None| + |pruning_type |string|Pruning algorithm |'snip_momentum' | + -- example: - ```python - pruner_config = PrunerConfig(prune_type='BasicMagnitude', target_sparsity_ratio=0.9) - ``` - -### Create an instance of PruningConfig -The PruningConfig contains all the information related to the model pruning behavior. If you have created Metric and PrunerConfig instance, then you can create an instance of PruningConfig. Metric and pruner are optional. 
- -- arguments: - |Argument |Type |Description |Default value | - |:----------|:----------|:-----------------------------------------------|:----------------| - |framework |string |Which framework you used |"pytorch" | - |initial_sparsity_ratio|float |Initial sparsity goal, if pruner_config argument is defined, it didn't need |0.0| - |target_sparsity_ratio|float |Target sparsity goal, if pruner argument is defined, it didn't need |0.97| - |metrics |Metric |Used to evaluate accuracy of tuning model, no need for NoTrainerOptimizer|None | - |pruner_config |PrunerConfig |Defined pruning behavior, if it is None, then NLP will create a default a pruner with 'BasicMagnitude' pruning type |None | +The WeightPruningConfig contains all the information related to the model pruning behavior. If you have created a Metric instance, assign it to `trainer.metrics`, then create an instance of WeightPruningConfig. The Metric is optional. - example: ```python - pruning_conf = PruningConfig(pruner_config=[pruner_config], metrics=tune_metric) + from neural_compressor.config import WeightPruningConfig + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=0.9, + pruning_scope="local", + pruning_type="magnitude") ``` ### Prune with Trainer diff --git a/docs/quantization.md b/docs/quantization.md index 7d4ff061503..93e621a0db9 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -134,33 +134,18 @@ Quantization methods include the following three types: ## Get Started ### Script: ```python -from intel_extension_for_transformers.transformers import metric, objectives, QuantizationConfig +from neural_compressor.config import PostTrainingQuantConfig from intel_extension_for_transformers.transformers.trainer import NLPTrainer # Replace transformers.Trainer with NLPTrainer # trainer = transformers.Trainer(......) trainer = NLPTrainer(......) 
-metric = metrics.Metric( - name="eval_f1", is_relative=True, criterion=0.01 -) -objective = objectives.performance -q_config = QuantizationConfig( - approach="PostTrainingStatic", - metrics=[metric], - objectives=[objective] +q_config = PostTrainingQuantConfig( + approach="static" ) model = trainer.quantize(quant_config=q_config) ``` Please refer to [quantization example](../examples/huggingface/pytorch/text-classification/quantization/run_glue.py) for the details. -### Create an Instance of Metric -The Metric defines which metric will be used to measure the performance of tuned models. -- example: - ```python - metric = metrics.Metric(name="eval_f1", greater_is_better=True, is_relative=True, criterion=0.01, weight_ratio=None) - ``` - - Please refer to [metrics document](metrics.md) for the details. - ### Create an Instance of Objective(Optional) In terms of evaluating the status of a specific model during tuning, we should have general objectives to measure the status of different models. @@ -172,25 +157,28 @@ In terms of evaluating the status of a specific model during tuning, we should h Please refer to [objective document](objectives.md) for the details. ### Create an Instance of QuantizationConfig -The QuantizationConfig contains all the information related to the model quantization behavior. If you have created Metric and Objective instance(default Objective is "performance"), then you can create an instance of QuantizationConfig. - -- arguments: +The QuantizationConfig contains all the information related to the model quantization behavior. If you have created Metric and Objective instance(default Objective is "performance"), then you can create an instance of PostTrainingQuantConfig or QuantizationAwareTrainingConfig. 
-|Argument |Type |Description |Default value | -|:----------|:----------|:-----------------------------------------------|:----------------| -|framework |string |Which framework you used |"pytorch" | -|approach |string |Which quantization approach you used |"PostTrainingStatic"| -|timeout |integer |Tuning timeout(seconds), 0 means early stop; combine with max_trials field to decide when to exit|0 | -|max_trials |integer |Max tune times |100 | -|metrics |list of Metric|Used to evaluate accuracy of tuning model, no need for NoTrainerOptimizer|None | -|objectives |list of Objective|Objective with accuracy constraint guaranteed|performance| - example: ```python - q_config = QuantizationConfig( - approach="PostTrainingDynamic", - metrics=[metric], - objectives=[objective] + from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion + ) + + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative", # optional. Available values are "relative" and "absolute". + tolerable_loss=0.01, # optional. 
+ ) + q_config = PostTrainingQuantConfig( + approach="dynamic", + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion ) ``` diff --git a/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb b/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb index c25aff405e5..f87803721a2 100644 --- a/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb +++ b/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb @@ -80,7 +80,8 @@ "from dataclasses import dataclass, field\n", "from datasets import load_dataset, load_metric\n", "from itertools import chain\n", - "from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig\n", + "from intel_extension_for_transformers.transformers import metrics, OptimizedModel\n", + "from neural_compressor.config import PostTrainingQuantConfig\n", "from intel_extension_for_transformers.transformers.trainer import NLPTrainer\n", "from transformers import (\n", " CONFIG_MAPPING,\n", @@ -373,9 +374,10 @@ " criterion=\"0.25\", # Performance tolerance when optimizing the model.\n", " greater_is_better=False \n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingStatic\",\n", - " metrics=[tune_metric],\n", + "\n", + "trainer_ptq_static.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"static\"\n", ")\n", "\n", "# run quantization\n", @@ -473,9 +475,9 @@ " criterion=\"0.25\", # why performance tolerance\n", " greater_is_better=False\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingDynamic\",\n", - " metrics=[tune_metric],\n", + "trainer_ptq_dynamic.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"dynamic\",\n", ")\n", "\n", "# run quantization\n", diff --git a/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb 
b/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb index 9f554cbd1a0..07cd55ec7f9 100644 --- a/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb +++ b/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb @@ -108,7 +108,8 @@ "from dataclasses import dataclass, field\n", "from datasets import load_dataset\n", "from itertools import chain\n", - "from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig\n", + "from intel_extension_for_transformers.transformers import metrics, OptimizedModel\n", + "from neural_compressor.config import PostTrainingQuantConfig\n", "from intel_extension_for_transformers.transformers.trainer import NLPTrainer\n", "from transformers import (\n", " AutoConfig,\n", @@ -199,9 +200,9 @@ " metadata={\"help\": \"Whether or not to apply quantization.\"},\n", " )\n", " quantization_approach: Optional[str] = field(\n", - " default=\"POSTTRAININGSTATIC\",\n", - " metadata={\"help\": \"Quantization approach. Supported approach are POSTTRAININGSTATIC, \"\n", - " \"POSTTRAININGDYNAMIC and QUANTIZATIONAWARETRAINING.\"},\n", + " default=\"static\",\n", + " metadata={\"help\": \"Quantization approach. 
Supported approach are static, \"\n", + " \"dynamic and qat.\"},\n", " )" ] }, @@ -235,7 +236,7 @@ ")\n", "optim_args = OptimizationArguments(\n", " tune=True,\n", - " quantization_approach=\"PostTrainingStatic\"\n", + " quantization_approach=\"static\"\n", ")" ] }, @@ -438,9 +439,9 @@ " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", " criterion=\"0.25\", # Performance tolerance when optimizing the model.\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingStatic\",\n", - " metrics=[tune_metric],\n", + "trainer_static.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"static\",\n", ")\n", "\n", "# run quantization\n", @@ -547,9 +548,9 @@ " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", " criterion=\"0.25\", # Performance tolerance when optimizing the model.\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingDynamic\",\n", - " metrics=[tune_metric],\n", + "trainer_dynamic.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"dynamic\",\n", ")\n", "\n", "# run quantization\n", diff --git a/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb b/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb index 4f59134773f..26ddaa817c8 100644 --- a/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb +++ b/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb @@ -86,7 +86,8 @@ "from dataclasses import dataclass, field\n", "from datasets import load_dataset, load_metric\n", "from itertools import chain\n", - "from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig, DynamicLengthConfig\n", + "from intel_extension_for_transformers.transformers import metrics, OptimizedModel, DynamicLengthConfig\n", + "from 
neural_compressor.config import PostTrainingQuantConfig\n", "from intel_extension_for_transformers.transformers.trainer import NLPTrainer\n", "from transformers.trainer_utils import get_last_checkpoint\n", "from transformers.utils.versions import require_version\n", @@ -2162,10 +2163,9 @@ " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", " criterion=\"0.01\", # Performance tolerance when optimizing the model.\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingStatic\",\n", - " max_trials=200,\n", - " metrics=[tune_metric],\n", + "quant_dynamic_trainer.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"static\",\n", ")\n", "\n", "# lc = \"(269, 253, 252, 202, 104, 34)\" # configure model with best length config\n", diff --git a/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb b/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb index c2902561cee..6ea589616ba 100644 --- a/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb +++ b/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb @@ -116,7 +116,8 @@ "import transformers\n", "from dataclasses import dataclass, field\n", "from datasets import load_dataset, load_metric\n", - "from intel_extension_for_transformers.transformers import metrics , QuantizationConfig\n", + "from intel_extension_for_transformers.transformers import metrics\n", + "from neural_compressor.config import PostTrainingQuantConfig\n", "from transformers import (\n", " AutoConfig,\n", " AutoModelForQuestionAnswering,\n", @@ -645,7 +646,7 @@ ")\n", "optim_args = OptimizationArguments(\n", " tune=True,\n", - " quantization_approach=\"PostTrainingStatic\"\n", + " quantization_approach=\"static\"\n", ")\n", "log_level = training_args.get_process_log_level()" ] @@ -999,10 +1000,9 @@ " is_relative=True, # Metric 
tolerance mode, True is for relative, otherwise for absolute.\n", " criterion=0.25, # Performance tolerance when optimizing the model.\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingStatic\",\n", - " max_trials=200,\n", - " metrics=[tune_metric],\n", + "trainer_static.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"static\",\n", ")\n", "\n", "# run quantization\n", @@ -1113,10 +1113,9 @@ " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", " criterion=0.25, # Performance tolerance when optimizing the model.\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingDynamic\",\n", - " max_trials=200,\n", - " metrics=[tune_metric],\n", + "trainer_dynamic.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"dynamic\",\n", ")\n", "\n", "# run quantization\n", diff --git a/docs/tutorials/pytorch/question-answering/bert-large-uncased-whole-word-masking-finetuned-squad.ipynb b/docs/tutorials/pytorch/question-answering/bert-large-uncased-whole-word-masking-finetuned-squad.ipynb index 9b893e6a340..3914a427e34 100644 --- a/docs/tutorials/pytorch/question-answering/bert-large-uncased-whole-word-masking-finetuned-squad.ipynb +++ b/docs/tutorials/pytorch/question-answering/bert-large-uncased-whole-word-masking-finetuned-squad.ipynb @@ -116,7 +116,8 @@ "import transformers\n", "from dataclasses import dataclass, field\n", "from datasets import load_dataset, load_metric\n", - "from intel_extension_for_transformers.transformers import metrics , QuantizationConfig\n", + "from intel_extension_for_transformers.transformers import metrics\n", + "from neural_compressor.config import PostTrainingQuantConfig\n", "from transformers import (\n", " AutoConfig,\n", " AutoModelForQuestionAnswering,\n", @@ -645,7 +646,7 @@ ")\n", "optim_args = OptimizationArguments(\n", " tune=True,\n", - " 
quantization_approach=\"PostTrainingStatic\"\n", + " quantization_approach=\"static\"\n", ")\n", "log_level = training_args.get_process_log_level()" ] @@ -999,10 +1000,9 @@ " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", " criterion=0.25, # Performance tolerance when optimizing the model.\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingStatic\",\n", - " max_trials=200,\n", - " metrics=[tune_metric],\n", + "trainer_static.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"static\"\n", ")\n", "\n", "# run quantization\n", @@ -1092,10 +1092,9 @@ " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", " criterion=0.25, # Performance tolerance when optimizing the model.\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingDynamic\",\n", - " max_trials=200,\n", - " metrics=[tune_metric],\n", + "trainer_dynamic.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"dynamic\",\n", ")\n", "\n", "# run quantization\n", diff --git a/docs/tutorials/pytorch/question-answering/orchestrate_optimizations.ipynb b/docs/tutorials/pytorch/question-answering/orchestrate_optimizations.ipynb index accba939b30..a70dd5a1ab5 100644 --- a/docs/tutorials/pytorch/question-answering/orchestrate_optimizations.ipynb +++ b/docs/tutorials/pytorch/question-answering/orchestrate_optimizations.ipynb @@ -74,13 +74,15 @@ "import transformers\n", "from intel_extension_for_transformers.transformers import (\n", " metrics,\n", - " PrunerConfig,\n", - " PruningConfig,\n", - " DistillationConfig,\n", - " QuantizationConfig,\n", " OptimizedModel,\n", " objectives\n", ")\n", + "from neural_compressor.config import (\n", + " WeightPruningConfig,\n", + " DistillationConfig,\n", + " KnowledgeDistillationLossConfig,\n", + " QuantizationAwareTrainingConfig,\n", + ")\n", "from 
torch.utils.data import DataLoader\n", "from tqdm import tqdm\n", "from trainer_qa import QuestionAnsweringTrainer\n", @@ -214,7 +216,7 @@ " metadata={\"help\": \"Whether or not to apply prune.\"},\n", " )\n", " pruning_approach: Optional[str] = field(\n", - " default=\"BasicMagnitude\",\n", + " default=\"magnitude\",\n", " metadata={\"help\": \"Pruning approach. Supported approach is basic_magnite.\"},\n", " )\n", " target_sparsity_ratio: Optional[float] = field(\n", @@ -234,9 +236,9 @@ " metadata={\"help\": \"Whether or not to apply quantization.\"},\n", " )\n", " quantization_approach: Optional[str] = field(\n", - " default=\"PostTrainingStatic\",\n", - " metadata={\"help\": \"Quantization approach. Supported approach are PostTrainingStatic, \"\n", - " \"PostTrainingDynamic and QuantizationAwareTraining.\"},\n", + " default=\"static\",\n", + " metadata={\"help\": \"Quantization approach. Supported approach are static, \"\n", + " \"dynamic and qat.\"},\n", " )\n", " metric_name: Optional[str] = field(\n", " default=None,\n", @@ -300,7 +302,7 @@ ")\n", "optim_args = OptimizationArguments(\n", " tune=True,\n", - " quantization_approach=\"PostTrainingStatic\"\n", + " quantization_approach=\"static\"\n", ")\n", "log_level = training_args.get_process_log_level()" ] @@ -730,9 +732,7 @@ "logger.info(\"***** Number of student model parameters: {:.2f}M *****\".format(\\\n", " para_counter(model)/10**6))\n", "\n", - "# Trace model\n", - "from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace\n", - "model = symbolic_trace(model, optim_args.quantization_approach==\"QuantizationAwareTraining\")" + "# Trace model\n" ] }, { @@ -779,21 +779,18 @@ " tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol\n", " )\n", - " prune_type = 'PatternLock' \\\n", + " prune_type = 'pattern_lock' \\\n", " if optim_args.pruning_approach else optim_args.pruning_approach\n", " target_sparsity_ratio = 
optim_args.target_sparsity_ratio \\\n", " if optim_args.target_sparsity_ratio else None\n", - " pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio)\n", - " pruning_conf = PruningConfig(framework=\"pytorch_fx\",pruner_config=[pruner_config], metrics=tune_metric)\n", - " distillation_conf = DistillationConfig(framework=\"pytorch_fx\", metrics=tune_metric)\n", - "\n", - " objective = objectives.performance\n", - " quantization_conf = QuantizationConfig(\n", - " approach=optim_args.quantization_approach,\n", - " max_trials=600,\n", - " metrics=[tune_metric],\n", - " objectives=[objective]\n", - " )\n", + " trainer.metrics = tune_metric\n", + " pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n", + " target_sparsity=target_sparsity_ratio,\n", + " pruning_scope=\"local\",\n", + " pruning_type=prune_type)\n", + " distillation_criterion = KnowledgeDistillationLossConfig(loss_types=[\"CE\", \"KL\"])\n", + " distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)\n", + " quantization_conf = QuantizationAwareTrainingConfig()\n", " conf_list = [pruning_conf, distillation_conf, quantization_conf]\n", " model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model)" ] diff --git a/docs/tutorials/pytorch/question-answering/orchestrate_optimizations_bert_mini.ipynb b/docs/tutorials/pytorch/question-answering/orchestrate_optimizations_bert_mini.ipynb index b3a983f8d35..78b1258d580 100644 --- a/docs/tutorials/pytorch/question-answering/orchestrate_optimizations_bert_mini.ipynb +++ b/docs/tutorials/pytorch/question-answering/orchestrate_optimizations_bert_mini.ipynb @@ -78,6 +78,12 @@ " DataCollatorWithPadding,\n", " EvalPrediction,\n", ")\n", + "from neural_compressor.config import (\n", + " WeightPruningConfig,\n", + " DistillationConfig,\n", + " KnowledgeDistillationLossConfig,\n", + " QuantizationAwareTrainingConfig,\n", + ")\n", 
"from transformers.utils import check_min_version\n", "from transformers.utils.versions import require_version\n", "from typing import Optional\n", @@ -430,18 +436,14 @@ " name=metric_name, is_relative=True, criterion=0.01\n", ")\n", "\n", - "target_sparsity_ratio = None\n", - "pruner_config = PrunerConfig(prune_type='PatternLock', target_sparsity_ratio=None)\n", - "pruning_conf = PruningConfig(framework=\"pytorch_fx\",pruner_config=[pruner_config], metrics=tune_metric)\n", - "distillation_conf = DistillationConfig(framework=\"pytorch_fx\", metrics=tune_metric)\n", - "\n", - "objective = objectives.performance\n", - "quantization_conf = QuantizationConfig(\n", - " approach=\"QuantizationAwareTraining\",\n", - " max_trials=600,\n", - " metrics=[tune_metric],\n", - " objectives=[objective]\n", - ")\n", + "trainer.metrics = tune_metric\n", + "pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n", + " target_sparsity=0.64,\n", + " pruning_scope=\"local\",\n", + " pruning_type=\"pattern_lock\")\n", + "distillation_criterion = KnowledgeDistillationLossConfig(loss_types=[\"CE\", \"KL\"])\n", + "distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)\n", + "quantization_conf = QuantizationAwareTrainingConfig()\n", "conf_list = [pruning_conf, distillation_conf, quantization_conf]\n", "model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model)" ] diff --git a/docs/tutorials/pytorch/question-answering/pruning.ipynb b/docs/tutorials/pytorch/question-answering/pruning.ipynb index 61dde668c8b..b49b7e11e57 100644 --- a/docs/tutorials/pytorch/question-answering/pruning.ipynb +++ b/docs/tutorials/pytorch/question-answering/pruning.ipynb @@ -66,7 +66,8 @@ "import transformers\n", "from dataclasses import dataclass, field\n", "from datasets import load_dataset, load_metric\n", - "from intel_extension_for_transformers.transformers import metrics, OptimizedModel, PrunerConfig, 
PruningConfig, PruningMode\n", + "from intel_extension_for_transformers.transformers import metrics, OptimizedModel\n", + "from neural_compressor.config import WeightPruningConfig\n", "from trainer_qa import QuestionAnsweringTrainer\n", "from transformers import (\n", " AutoConfig,\n", @@ -225,7 +226,7 @@ " metadata={\"help\": \"Whether or not to apply prune.\"},\n", " )\n", " pruning_approach: Optional[str] = field(\n", - " default=\"BasicMagnitude\",\n", + " default=\"magnitude\",\n", " metadata={\"help\": \"Pruning approach. Supported approach is basic_magnite.\"},\n", " )\n", " target_sparsity_ratio: Optional[float] = field(\n", @@ -278,7 +279,7 @@ ")\n", "optim_args = OptimizationArguments(\n", " tune=True,\n", - " quantization_approach=\"PostTrainingStatic\"\n", + " quantization_approach=\"static\"\n", ")\n", "log_level = training_args.get_process_log_level()" ] @@ -625,11 +626,14 @@ " raise ValueError(\"do_train must be set to True for pruning.\")\n", "\n", " tune_metric = metrics.Metric(name=metric_name)\n", - " prune_type = 'BasicMagnitude' if optim_args.pruning_approach else optim_args.pruning_approach\n", + " prune_type = 'magnitude' if optim_args.pruning_approach else optim_args.pruning_approach\n", " target_sparsity_ratio = optim_args.target_sparsity_ratio \\\n", " if optim_args.target_sparsity_ratio else None\n", - " pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio)\n", - " pruning_conf = PruningConfig(pruner_config=pruner_config, metrics=tune_metric)\n", + " trainer.metrics = tune_metric\n", + " pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n", + " target_sparsity=target_sparsity_ratio,\n", + " pruning_scope=\"local\",\n", + " pruning_type=prune_type)\n", "\n", " model = trainer.prune(pruning_config=pruning_conf)\n", " trainer.save_model(training_args.output_dir)" diff --git a/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb 
b/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb index 4c66e32752a..ae599745a0d 100644 --- a/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb +++ b/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb @@ -110,7 +110,8 @@ "from datasets import load_dataset, load_metric\n", "\n", "from filelock import FileLock\n", - "from intel_extension_for_transformers.transformers import OptimizedModel, QuantizationConfig\n", + "from intel_extension_for_transformers.transformers import OptimizedModel\n", + "from neural_compressor.config import PostTrainingQuantConfig\n", "from intel_extension_for_transformers.transformers import metrics as nlp_metrics\n", "from intel_extension_for_transformers.transformers.trainer import NLPSeq2SeqTrainer\n", "from transformers import (\n", @@ -277,9 +278,9 @@ " metadata={\"help\": \"Whether or not to apply quantization.\"},\n", " )\n", " quantization_approach: Optional[str] = field(\n", - " default=\"PostTrainingStatic\",\n", - " metadata={\"help\": \"Quantization approach. Supported approach are PostTrainingStatic, \"\n", - " \"PostTrainingDynamic and QuantizationAwareTraining.\"},\n", + " default=\"static\",\n", + " metadata={\"help\": \"Quantization approach. 
Supported approach are static, \"\n", + " \"dynamic and qat.\"},\n", " )\n" ] }, @@ -631,10 +632,9 @@ "tune_metric = nlp_metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingDynamic\",\n", - " max_trials=200,\n", - " metrics=[tune_metric],\n", + "trainer.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"dynamic\",\n", ")\n", "trainer.max_length = max_length\n", "trainer.num_beams = num_beams\n", diff --git a/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb b/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb index 4855c46ccec..a4f13fad850 100644 --- a/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb +++ b/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb @@ -113,7 +113,8 @@ "import transformers\n", "from dataclasses import dataclass, field\n", "from datasets import load_dataset, load_metric\n", - "from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig\n", + "from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel\n", + "from neural_compressor.config import PostTrainingQuantConfig\n", "from intel_extension_for_transformers.transformers.trainer import NLPTrainer\n", "from transformers import (\n", " AutoConfig,\n", @@ -247,9 +248,9 @@ " metadata={\"help\": \"Whether or not to apply quantization.\"},\n", " )\n", " quantization_approach: Optional[str] = field(\n", - " default=\"PostTrainingStatic\",\n", - " metadata={\"help\": \"Quantization approach. Supported approach are PostTrainingStatic, \"\n", - " \"PostTrainingDynamic and QuantizationAwareTraining.\"},\n", + " default=\"static\",\n", + " metadata={\"help\": \"Quantization approach. 
Supported approach are static, \"\n", + " \"dynamic and qat.\"},\n", " )\n", " is_relative: Optional[bool] = field(\n", " default=True,\n", @@ -296,7 +297,7 @@ ")\n", "optim_args = OptimizationArguments(\n", " tune=True,\n", - " quantization_approach=\"PostTrainingStatic\"\n", + " quantization_approach=\"static\"\n", ")\n", "log_level = training_args.get_process_log_level()\n", "logger.setLevel(log_level)" @@ -532,11 +533,9 @@ " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", "objective = objectives.performance\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingStatic\",\n", - " max_trials=600,\n", - " metrics=[tune_metric],\n", - " objectives=[objective]\n", + "trainer_static.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"static\",\n", ")\n", "trainer_static.quantize(quant_config=quantization_config)" ] @@ -642,11 +641,9 @@ " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", "objective = objectives.performance\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingDynamic\",\n", - " max_trials=600,\n", - " metrics=[tune_metric],\n", - " objectives=[objective]\n", + "trainer_dynamic.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"dynamic\",\n", ")\n", "trainer_dynamic.quantize(quant_config=quantization_config)" ] diff --git a/docs/tutorials/pytorch/text-classification/orchestrate_optimizations.ipynb b/docs/tutorials/pytorch/text-classification/orchestrate_optimizations.ipynb index fd8c926085f..6fa3f293695 100644 --- a/docs/tutorials/pytorch/text-classification/orchestrate_optimizations.ipynb +++ b/docs/tutorials/pytorch/text-classification/orchestrate_optimizations.ipynb @@ -71,10 +71,6 @@ "from datasets import load_dataset, load_metric\n", "from intel_extension_for_transformers.transformers import (\n", " metrics,\n", - " PrunerConfig,\n", - " PruningConfig,\n", - " DistillationConfig,\n", - " 
QuantizationConfig,\n", " OptimizedModel,\n", " objectives\n", ")\n", @@ -93,9 +89,14 @@ " default_data_collator,\n", " set_seed,\n", ")\n", + "from neural_compressor.config import (\n", + " WeightPruningConfig,\n", + " DistillationConfig,\n", + " KnowledgeDistillationLossConfig,\n", + " QuantizationAwareTrainingConfig,\n", + ")\n", "from transformers.trainer_utils import get_last_checkpoint\n", "from transformers.utils import check_min_version\n", - "from transformers.utils.fx import symbolic_trace\n", "from typing import Optional\n", "\n", "\n", @@ -251,7 +252,7 @@ " metadata={\"help\": \"Whether or not to apply prune.\"},\n", " )\n", " pruning_approach: Optional[str] = field(\n", - " default=\"BasicMagnitude\",\n", + " default=\"magnitude\",\n", " metadata={\"help\": \"Pruning approach. Supported approach is basic_magnite.\"},\n", " )\n", " target_sparsity_ratio: Optional[float] = field(\n", @@ -271,9 +272,9 @@ " metadata={\"help\": \"Whether or not to apply quantization.\"},\n", " )\n", " quantization_approach: Optional[str] = field(\n", - " default=\"QuantizationAwareTraining\",\n", - " metadata={\"help\": \"Quantization approach. Supported approach are PostTrainingStatic, \"\n", - " \"PostTrainingDynamic and QuantizationAwareTraining.\"},\n", + " default=\"qat\",\n", + " metadata={\"help\": \"Quantization approach. 
Supported approach are static, \"\n", + " \"dynamic and qat.\"},\n", " )\n", " metric_name: Optional[str] = field(\n", " default=\"eval_f1\",\n", @@ -341,7 +342,7 @@ ")\n", "optim_args = OptimizationArguments(\n", " tune=True,\n", - " quantization_approach=\"PostTrainingStatic\"\n", + " quantization_approach=\"static\"\n", ")\n", "log_level = training_args.get_process_log_level()\n", "logger.setLevel(log_level)" @@ -618,7 +619,7 @@ "\n", "# Trace model\n", "from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace\n", - "model = symbolic_trace(model, optim_args.quantization_approach==\"QuantizationAwareTraining\")" + "model = symbolic_trace(model, optim_args.quantization_approach==\"qat\")" ] }, { @@ -671,23 +672,20 @@ " tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol\n", " )\n", - " prune_type = 'PatternLock' \\\n", + " prune_type = 'pattern_lock' \\\n", " if optim_args.pruning_approach else optim_args.pruning_approach\n", " target_sparsity_ratio = optim_args.target_sparsity_ratio \\\n", " if optim_args.target_sparsity_ratio else None\n", - " pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio)\n", - " pruning_conf = PruningConfig(framework=\"pytorch_fx\",pruner_config=[pruner_config], metrics=tune_metric)\n", - " distillation_conf = DistillationConfig(framework=\"pytorch_fx\", metrics=tune_metric)\n", - " \n", - " objective = objectives.performance\n", - " quantization_conf = QuantizationConfig(\n", - " approach=optim_args.quantization_approach,\n", - " max_trials=600,\n", - " metrics=[tune_metric],\n", - " objectives=[objective]\n", - " )\n", + " trainer.metrics = tune_metric\n", + " pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n", + " target_sparsity=target_sparsity_ratio,\n", + " pruning_scope=\"local\",\n", + " pruning_type=prune_type)\n", + " distillation_criterion = 
KnowledgeDistillationLossConfig(loss_types=[\"CE\", \"KL\"])\n", + " distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)\n", + " quantization_conf = QuantizationAwareTrainingConfig()\n", " conf_list = [pruning_conf, distillation_conf, quantization_conf]\n", - " model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model)" + " model = trainer.orchestrate_optimizations(config_list=conf_list)" ] }, { diff --git a/docs/tutorials/pytorch/text-classification/orchestrate_optimizations_bert_mini.ipynb b/docs/tutorials/pytorch/text-classification/orchestrate_optimizations_bert_mini.ipynb index e533ab555f9..ffbd067af30 100644 --- a/docs/tutorials/pytorch/text-classification/orchestrate_optimizations_bert_mini.ipynb +++ b/docs/tutorials/pytorch/text-classification/orchestrate_optimizations_bert_mini.ipynb @@ -70,12 +70,14 @@ "from datasets import load_dataset, load_metric\n", "from intel_extension_for_transformers.transformers import (\n", " metrics,\n", - " PrunerConfig,\n", - " PruningConfig,\n", - " DistillationConfig,\n", - " QuantizationConfig,\n", " objectives\n", ")\n", + "from neural_compressor.config import (\n", + " WeightPruningConfig,\n", + " DistillationConfig,\n", + " KnowledgeDistillationLossConfig,\n", + " QuantizationAwareTrainingConfig,\n", + ")\n", "from intel_extension_for_transformers.transformers.trainer import NLPTrainer\n", "from transformers import (\n", " AutoConfig,\n", @@ -343,18 +345,14 @@ " name=metric_name, is_relative=True, criterion=0.01\n", ")\n", "\n", - "target_sparsity_ratio = None\n", - "pruner_config = PrunerConfig(prune_type='PatternLock', target_sparsity_ratio=None)\n", - "pruning_conf = PruningConfig(framework=\"pytorch_fx\",pruner_config=[pruner_config], metrics=tune_metric)\n", - "distillation_conf = DistillationConfig(framework=\"pytorch_fx\", metrics=tune_metric)\n", - "\n", - "objective = objectives.performance\n", - "quantization_conf = 
QuantizationConfig(\n", - " approach=\"QuantizationAwareTraining\",\n", - " max_trials=600,\n", - " metrics=[tune_metric],\n", - " objectives=[objective]\n", - ")\n", + "trainer.metrics = tune_metric\n", + "pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n", + " target_sparsity=0.64,\n", + " pruning_scope=\"local\",\n", + " pruning_type=\"pattern_lock\")\n", + "distillation_criterion = KnowledgeDistillationLossConfig(loss_types=[\"CE\", \"KL\"])\n", + "distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion)\n", + "quantization_conf = QuantizationAwareTrainingConfig()\n", "conf_list = [pruning_conf, distillation_conf, quantization_conf]\n", "model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model)" ] diff --git a/docs/tutorials/pytorch/text-classification/pruning.ipynb b/docs/tutorials/pytorch/text-classification/pruning.ipynb index 723de2b8f46..53d56051dc5 100644 --- a/docs/tutorials/pytorch/text-classification/pruning.ipynb +++ b/docs/tutorials/pytorch/text-classification/pruning.ipynb @@ -69,10 +69,9 @@ "from datasets import load_dataset, load_metric\n", "from intel_extension_for_transformers.transformers import (\n", " metrics,\n", - " OptimizedModel,\n", - " PrunerConfig,\n", - " PruningConfig,\n", + " OptimizedModel\n", ")\n", + "from neural_compressor.config import WeightPruningConfig\n", "from intel_extension_for_transformers.transformers.trainer import NLPTrainer\n", "from transformers import (\n", " AutoConfig,\n", @@ -283,7 +282,7 @@ ")\n", "optim_args = OptimizationArguments(\n", " tune=True,\n", - " quantization_approach=\"PostTrainingStatic\"\n", + " quantization_approach=\"static\"\n", ")\n", "log_level = training_args.get_process_log_level()\n", "logger.setLevel(log_level)" @@ -465,12 +464,14 @@ ")\n", "\n", "tune_metric = metrics.Metric(name=metric_name)\n", - "prune_type = 'BasicMagnitude' \\\n", - " if optim_args.pruning_approach else 
optim_args.pruning_approach\n", + "prune_type = 'magnitude' if optim_args.pruning_approach else optim_args.pruning_approach\n", "target_sparsity_ratio = optim_args.target_sparsity_ratio \\\n", " if optim_args.target_sparsity_ratio else None\n", - "pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio)\n", - "pruning_conf = PruningConfig(pruner_config=pruner_config, metrics=tune_metric)\n", + "trainer.metrics = tune_metric\n", + "pruning_conf = WeightPruningConfig([{\"start_step\": 0, \"end_step\": 2}],\n", + " target_sparsity=target_sparsity_ratio,\n", + " pruning_scope=\"local\",\n", + " pruning_type=prune_type)\n", "\n", "model = trainer.prune(pruning_config=pruning_conf)\n", "trainer.save_model(training_args.output_dir)" diff --git a/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb b/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb index 82b6ecbe4ce..8d43eb95e30 100644 --- a/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb +++ b/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb @@ -108,8 +108,8 @@ "from intel_extension_for_transformers.transformers import(\n", " metrics,\n", " OptimizedModel,\n", - " QuantizationConfig,\n", ")\n", + "from neural_compressor.config import PostTrainingQuantConfig\n", "from intel_extension_for_transformers.transformers.trainer import NLPTrainer\n", "from transformers import (\n", " AutoConfig,\n", @@ -559,9 +559,9 @@ "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingStatic\",\n", - " metrics=[tune_metric],\n", + "trainer_static.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"static\",\n", ")\n", "trainer_static.quantize(quantization_config)" ] @@ -661,9 +661,9 @@ "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, 
criterion=0.25\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingDynamic\",\n", - " metrics=[tune_metric],\n", + "trainer_dynamic.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"dynamic\",\n", ")\n", "trainer_dynamic.quantize(quantization_config)" ] diff --git a/docs/tutorials/pytorch/translation/t5-small.ipynb b/docs/tutorials/pytorch/translation/t5-small.ipynb index b10ee380c09..9356c5214ec 100644 --- a/docs/tutorials/pytorch/translation/t5-small.ipynb +++ b/docs/tutorials/pytorch/translation/t5-small.ipynb @@ -108,7 +108,8 @@ "import numpy as np\n", "from datasets import load_dataset, load_metric\n", "\n", - "from intel_extension_for_transformers.transformers import OptimizedModel, QuantizationConfig\n", + "from intel_extension_for_transformers.transformers import OptimizedModel\n", + "from neural_compressor.config import PostTrainingQuantConfig\n", "from intel_extension_for_transformers.transformers import metrics as nlp_metrics\n", "from intel_extension_for_transformers.transformers.trainer import NLPSeq2SeqTrainer\n", "import transformers\n", @@ -510,10 +511,9 @@ "tune_metric = nlp_metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", - "quantization_config = QuantizationConfig(\n", - " approach=\"PostTrainingDynamic\",\n", - " max_trials=200,\n", - " metrics=[tune_metric],\n", + "trainer.metrics = tune_metric\n", + "quantization_config = PostTrainingQuantConfig(\n", + " approach=\"dynamic\",\n", ")\n", "trainer.max_length = max_length\n", "trainer.num_beams = num_beams\n", diff --git a/examples/huggingface/onnxruntime/optimization_README.md b/examples/huggingface/onnxruntime/optimization_README.md index 6f2da2cd552..3c801acfe8b 100644 --- a/examples/huggingface/onnxruntime/optimization_README.md +++ b/examples/huggingface/onnxruntime/optimization_README.md @@ -4,7 +4,7 @@ Welcome to ONNX Runtime Huggingface examples. 
The models are from [Huggingface]( ## Quantization approach -| Task | PostTrainingDynamic | PostTrainingStatic +| Task | dynamic | static |---|:---:|:---:| |**`speech-recognition`**| ✅ | ✅ | diff --git a/examples/huggingface/onnxruntime/speech-recognition/quantization/README.md b/examples/huggingface/onnxruntime/speech-recognition/quantization/README.md index 5638882e2ca..ae878cc4cb8 100644 --- a/examples/huggingface/onnxruntime/speech-recognition/quantization/README.md +++ b/examples/huggingface/onnxruntime/speech-recognition/quantization/README.md @@ -1,6 +1,6 @@ Step-by-Step​ ============ -The script `run_whisper.py` provides two quantization approaches (PostTrainingStatic and PostTrainingDynamic) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor) with [LibriSpeech test-clean](https://huggingface.co/datasets/librispeech_asr) dataset. +The script `run_whisper.py` provides two quantization approaches (static and dynamic) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor) with [LibriSpeech test-clean](https://huggingface.co/datasets/librispeech_asr) dataset. # Prerequisite​ ## 1. 
Create Environment​ @@ -96,7 +96,7 @@ Available INT4 models on huggingface: # Validated model list -|Topology|Pretrained model|PostTrainingDynamic|PostTrainingStatic|WeightOnly4Bit| +|Topology|Pretrained model|dynamic|static|WeightOnly4Bit| |---|------------------------------------|---|---|--- |whisper_tiny|openai/whisper-tiny| | | ✅| |whisper_base|openai/whisper-base| | | ✅| diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh index 524a5e8d46d..0a793301dbb 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/code-generation/quantization/run_tuning.sh @@ -16,7 +16,7 @@ function init_params { model_name_or_path="bigcode/starcoder" extra_cmd="" batch_size=8 - approach="PostTrainingStatic" + approach="static" alpha=0.5 script="run_generation.py" for var in "$@" diff --git a/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/model_quant_convert.py b/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/model_quant_convert.py index 9fdf4114e54..c73a4f6dcb7 100644 --- a/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/model_quant_convert.py +++ b/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/model_quant_convert.py @@ -49,7 +49,13 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer os.environ["WANDB_DISABLED"] = 
"true" @@ -171,9 +177,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_accuracy", @@ -431,28 +437,38 @@ def val_transforms(example_batch): raise ValueError("do_eval must be set to True for quantization.") trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - objective = objectives.performance - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective], - config_file='vit.yaml' - ) + trainer.metrics = tune_metric + if optim_args.quantization_approach != "qat": + op_name_dict = { + 'vit.embeddings.patch_embeddings.projection.module': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'vit.embeddings.dropout': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + } + tuning_criterion = TuningCriterion(max_trials=600) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # 
optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. + ) + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion, + op_name_dict=op_name_dict, + ) model = trainer.quantize(quant_config=quantization_config) diff --git a/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/run_vit.sh b/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/run_vit.sh index 73c1add48ca..a9350cde559 100644 --- a/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/run_vit.sh +++ b/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/run_vit.sh @@ -123,7 +123,7 @@ else echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========" mode_cmd="" if [[ ${PRECISION} = 'int8' ]]; then - mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic" + mode_cmd=$mode_cmd" --tune --quantization_approach static" elif [[ ${PRECISION} = 'bf16' ]]; then mode_cmd=$mode_cmd" --enable_bf16" fi diff --git a/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/vit.yaml b/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/vit.yaml deleted file mode 100644 index 66ac67fcc58..00000000000 --- a/examples/huggingface/pytorch/image-classification/deployment/imagenet/vit/vit.yaml +++ /dev/null @@ -1,28 +0,0 @@ -model: # mandatory. used to specify model specific information. - name: vit - framework: pytorch_fx # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops. - -quantization: # optional. tuning constraints on model-wise for advance user to reduce tuning space. 
- approach: post_training_static_quant - calibration: - sampling_size: [1000] - op_wise: { # optional. tuning constraints on op-wise for advance user to reduce tuning space. - 'vit.embeddings.patch_embeddings.projection.module': { # optional. set default qconfig to fp32 for FX model - 'activation': {'dtype': ['fp32']}, - 'weight': {'dtype': ['fp32']} - }, - 'vit.embeddings.dropout': { # optional. set default qconfig to fp32 for FX model - 'activation': {'dtype': ['fp32']}, - 'weight': {'dtype': ['fp32']} - }, - } - -tuning: - accuracy_criterion: - relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. - exit_policy: - timeout: 0 # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit. - max_trials: 500 - random_seed: 1978 # optional. random seed for deterministic tuning. - workspace: - path: nc_workspace/vit/ diff --git a/examples/huggingface/pytorch/image-classification/quantization/README.md b/examples/huggingface/pytorch/image-classification/quantization/README.md index 09d2eaa2e83..d9a9f8c1055 100644 --- a/examples/huggingface/pytorch/image-classification/quantization/README.md +++ b/examples/huggingface/pytorch/image-classification/quantization/README.md @@ -1,11 +1,11 @@ # Image classification -The script `run_image_classification.py` provides three quantization approaches (PostTrainingStatic, PostTrainingStatic and QuantizationAwareTraining) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). +The script `run_image_classification.py` provides three quantization approaches (dynamic, static and qat) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). Here is how to run the script: >**Note**: Please use transformers no higher than 4.34.1 -1. quantization with PostTrainingStatic +1. 
static quantization ``` sh run_tuning.sh @@ -26,6 +26,6 @@ run run_benchmark.sh ### Validated model list -|Dataset|Pretrained model|PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining +|Dataset|Pretrained model|dynamic | static | qat |---|------------------------------------|---|---|--- |imagenet-1k|google/vit-base-patch16-224| ✅| ✅| N/A| diff --git a/examples/huggingface/pytorch/image-classification/quantization/conf.yaml b/examples/huggingface/pytorch/image-classification/quantization/conf.yaml deleted file mode 100644 index a9c4b4879b6..00000000000 --- a/examples/huggingface/pytorch/image-classification/quantization/conf.yaml +++ /dev/null @@ -1,18 +0,0 @@ -model: # mandatory. used to specify model specific information. - name: vit - framework: pytorch_fx # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops. - -quantization: # optional. tuning constraints on model-wise for advance user to reduce tuning space. - approach: post_training_static_quant - calibration: - sampling_size: [1000] - -tuning: - accuracy_criterion: - relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. - exit_policy: - timeout: 0 # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit. - max_trials: 300 - random_seed: 1978 # optional. random seed for deterministic tuning. 
- workspace: - path: nc_workspace/vit/ \ No newline at end of file diff --git a/examples/huggingface/pytorch/image-classification/quantization/run_image_classification.py b/examples/huggingface/pytorch/image-classification/quantization/run_image_classification.py index 92d11d9c380..47cd548107c 100644 --- a/examples/huggingface/pytorch/image-classification/quantization/run_image_classification.py +++ b/examples/huggingface/pytorch/image-classification/quantization/run_image_classification.py @@ -49,7 +49,13 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer os.environ["WANDB_DISABLED"] = "true" @@ -171,9 +177,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. 
Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_accuracy", @@ -208,9 +214,6 @@ class OptimizationArguments: num_of_instance: int = field( default=-1, metadata={"help":"the number of instance for benchmark."}) - inc_config_file: Optional[str] = field( - default="vit_config.yaml", metadata={"help": "quantization configuration file"} - ) def collate_fn(examples): @@ -446,29 +449,54 @@ def val_transforms(example_batch): model.config.save_pretrained(training_args.output_dir) trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) model.config.save_pretrained(training_args.output_dir) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - objective = objectives.performance - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective], - config_file=optim_args.inc_config_file - ) + trainer.metrics = tune_metric + if optim_args.quantization_approach != "qat": + op_name_dict = { + 'vit.embeddings.patch_embeddings.projection.module': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + 'vit.embeddings.dropout': { + 'activation': {'dtype': ['fp32']}, + 'weight': {'dtype': ['fp32']} + }, + } + tuning_criterion = TuningCriterion(max_trials=600) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. 
+ criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. + ) + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion, + op_name_dict=op_name_dict, + ) + else: + tuning_criterion = TuningCriterion(max_trials=600) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. + ) + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = trainer.quantize(quant_config=quantization_config) diff --git a/examples/huggingface/pytorch/image-classification/quantization/run_tuning.sh b/examples/huggingface/pytorch/image-classification/quantization/run_tuning.sh index 2733f3e555c..e596d93e824 100644 --- a/examples/huggingface/pytorch/image-classification/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/image-classification/quantization/run_tuning.sh @@ -16,7 +16,7 @@ function init_params { model_name_or_path="google/vit-base-patch16-224" extra_cmd="" batch_size=8 - approach="PostTrainingStatic" + approach="static" for var in "$@" do case $var in @@ -45,8 +45,7 @@ function init_params { function run_tuning { if [ "${topology}" = "vit-base-patch16-224_static" ]; then model_name_or_path="/tf_dataset2/models/nlp_toolkit/vit-base" - approach="PostTrainingStatic" - inc_config_file="vit_config.yaml" + approach="static" fi python -u ./run_image_classification.py \ @@ -62,7 +61,6 @@ 
function run_tuning { --tune \ --overwrite_output_dir \ --quantization_approach ${approach} \ - --inc_config_file ${inc_config_file} \ ${extra_cmd} } diff --git a/examples/huggingface/pytorch/image-classification/quantization/vit_config.yaml b/examples/huggingface/pytorch/image-classification/quantization/vit_config.yaml deleted file mode 100644 index 5b9b73e5472..00000000000 --- a/examples/huggingface/pytorch/image-classification/quantization/vit_config.yaml +++ /dev/null @@ -1,30 +0,0 @@ -version: 2.0 - -model: # mandatory. used to specify model specific information. - name: vit - framework: pytorch_fx # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops. - -quantization: # optional. tuning constraints on model-wise for advance user to reduce tuning space. - approach: post_training_static_quant - calibration: - sampling_size: [1000] - op_wise: { # optional. tuning constraints on op-wise for advance user to reduce tuning space. - 'vit.embeddings.patch_embeddings.projection.module': { # optional. set default qconfig to fp32 for FX model - 'activation': {'dtype': ['fp32']}, - 'weight': {'dtype': ['fp32']} - }, - 'vit.embeddings.dropout': { # optional. set default qconfig to fp32 for FX model - 'activation': {'dtype': ['fp32']}, - 'weight': {'dtype': ['fp32']} - }, - } - -tuning: - accuracy_criterion: - relative: 0.01 # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%. - exit_policy: - timeout: 0 # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit. - max_trials: 300 - random_seed: 1978 # optional. random seed for deterministic tuning. 
- workspace: - path: nc_workspace/vit/ \ No newline at end of file diff --git a/examples/huggingface/pytorch/language-modeling/quantization/README.md b/examples/huggingface/pytorch/language-modeling/quantization/README.md index a053aef7617..e4c71ee8ba3 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/README.md +++ b/examples/huggingface/pytorch/language-modeling/quantization/README.md @@ -2,7 +2,7 @@ Step-by-Step ============ This document describes the step-by-step instructions to run large language models (LLMs) on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch. -The scripts `run_clm.py`, `run_mlm.py` and `run_plm.py` provide three quantization approaches respectively (PostTrainingDynamic, PostTrainingStatic, QuantAwareTraining) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor) and return last token prediction accuracy by `trainer`. +The scripts `run_clm.py`, `run_mlm.py` and `run_plm.py` provide three quantization approaches respectively (dynamic, static, qat) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor) and return last token prediction accuracy by `trainer`. The large language model quantization is moved to [text-generation](../../text-generation/quantization/) now. @@ -16,9 +16,7 @@ pip install -r requirements.txt pip install -v . 
cd examples/huggingface/pytorch/language-modeling/quantization pip install -r requirements.txt -pip install transformers==4.34.1 ``` ->**Note**: Please use transformers no higher than 4.34.1 # Run @@ -32,7 +30,7 @@ python run_clm.py \ --dataset_name wikitext \ --dataset_config_name wikitext-2-raw-v1 \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --do_train \ --do_eval \ --output_dir ./tmp/clm_output \ @@ -47,7 +45,7 @@ python run_mlm.py \ --dataset_name wikitext \ --dataset_config_name wikitext-2-raw-v1 \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --do_train \ --do_eval \ --output_dir ./tmp/mlm_output \ @@ -62,12 +60,9 @@ python run_mlm.py \ --dataset_name wikitext \ --dataset_config_name wikitext-2-raw-v1 \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --do_train \ --do_eval \ --output_dir ./tmp/plm_output \ --overwrite_output_dir ``` - -[1]. Elias, Frantar, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2023). -[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023). 
diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_clm.py b/examples/huggingface/pytorch/language-modeling/quantization/run_clm.py index d58abdeaa87..9a9637785ec 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_clm.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_clm.py @@ -28,7 +28,13 @@ from dataclasses import dataclass, field from datasets import load_dataset, load_metric from itertools import chain -from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( CONFIG_MAPPING, @@ -199,9 +205,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_loss", @@ -565,33 +571,39 @@ def compute_metrics(eval_preds): raise ValueError("do_eval must be set to True for quantization.") trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." 
) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol, greater_is_better=False ) - quantization_config = QuantizationConfig(approach=optim_args.quantization_approach, - metrics=[tune_metric], - sampling_size=optim_args.sampling_size - if optim_args.sampling_size is not None else len(train_dataset) // 100 * 5 , - recipes={ - "smooth_quant": True, - "smooth_quant_args": { - "alpha": optim_args.smooth_quant_alpha - } - } if optim_args.smooth_quant else None) + trainer.metrics = tune_metric + tuning_criterion = TuningCriterion(max_trials=600) + accuracy_criterion = AccuracyCriterion( + higher_is_better=False, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
+ ) + if optim_args.quantization_approach != "qat": + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = trainer.quantize(quant_config=quantization_config) if optim_args.benchmark_only: diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_mlm.py b/examples/huggingface/pytorch/language-modeling/quantization/run_mlm.py index ec01cacbeb4..ea78bdd0e08 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_mlm.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_mlm.py @@ -28,7 +28,13 @@ from dataclasses import dataclass, field from datasets import load_dataset, load_metric from itertools import chain -from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( CONFIG_MAPPING, @@ -207,9 +213,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. 
Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_loss", @@ -561,6 +567,7 @@ def compute_metrics(eval_preds): mlm_probability=data_args.mlm_probability, pad_to_multiple_of=8 if pad_to_multiple_of_8 else None, ) + metric_name = optim_args.metric_name training_args.metric_for_best_model = metric_name @@ -584,28 +591,39 @@ def compute_metrics(eval_preds): raise ValueError("do_eval must be set to True for quantization.") trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol, greater_is_better=False ) - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - metrics=[tune_metric], - sampling_size = len(train_dataset)//20 + trainer.metrics = tune_metric + tuning_criterion = TuningCriterion(max_trials=600) + accuracy_criterion = AccuracyCriterion( + higher_is_better=False, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
) + if optim_args.quantization_approach != "qat": + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = trainer.quantize(quant_config=quantization_config) if optim_args.benchmark_only: diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_plm.py b/examples/huggingface/pytorch/language-modeling/quantization/run_plm.py index 0c15cedb9c8..4550de43e69 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_plm.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_plm.py @@ -27,7 +27,13 @@ from datasets import load_dataset from itertools import chain -from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( AutoConfig, @@ -202,15 +208,15 @@ class OptimizationArguments: default=False, metadata={"help": "Whether or not to apply quantization."}, ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. 
Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) metric_name: Optional[str] = field( default="eval_loss", metadata={"help": "Metric used for the tuning strategy."}, ) + quantization_approach: Optional[str] = field( + default="static", + metadata={"help": "Quantization approach. Supported approach are static, " + "dynamic and qat."}, + ) is_relative: Optional[bool] = field( default=False, metadata={"help": "Metric tolerance mode, True for relative, otherwise for absolute."}, @@ -511,6 +517,7 @@ def group_texts(examples): plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) + metric_name = optim_args.metric_name training_args.metric_for_best_model = metric_name @@ -530,28 +537,39 @@ def group_texts(examples): raise ValueError("do_eval must be set to True for quantization.") trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol, greater_is_better=False ) - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - metrics=[tune_metric], - sampling_size = len(train_dataset)//20 + trainer.metrics = tune_metric + tuning_criterion = TuningCriterion(max_trials=600) + accuracy_criterion = AccuracyCriterion( + higher_is_better=False, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. 
Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. ) + if optim_args.quantization_approach != "qat": + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = trainer.quantize(quant_config=quantization_config) if optim_args.benchmark_only: diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh index eae534dac65..18a709a59e7 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh @@ -17,7 +17,7 @@ function init_params { extra_cmd="" batch_size=8 model_type="bert" - approach="PostTrainingStatic" + approach="static" alpha=0.5 for var in "$@" do @@ -60,7 +60,7 @@ function run_tuning { DATASET_CONFIG_NAME="wikitext-2-raw-v1" model_name_or_path="EleutherAI/gpt-neo-125m" task="clm" - approach="PostTrainingStatic" + approach="static" backend="" elif [ "${topology}" = "gpt_neo" ]; then if [ "${task}" = "clm" ]; then @@ -70,11 +70,11 @@ function run_tuning { DATASET_CONFIG_NAME="wikitext-2-raw-v1" model_name_or_path="EleutherAI/gpt-neo-125M" if [ "${approach}" = "dynamic" ]; then - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${approach}" = "static" ]; then - approach="PostTrainingStatic" + approach="static" elif [ "${approach}" = "qat" ]; then - approach="QuantizationAwareTraining" + approach="qat" extra_cmd=$extra_cmd" --learning_rate 1e-5 \ 
--num_train_epochs 6 \ --eval_steps 100 \ @@ -83,7 +83,8 @@ function run_tuning { --load_best_model_at_end True \ --evaluation_strategy steps \ --save_strategy steps \ - --save_total_limit 1" + --save_total_limit 1 \ + --save_safetensors False" fi elif [ "${topology}" = "gpt_j" ]; then if [ "${task}" = "clm" ]; then @@ -93,9 +94,9 @@ function run_tuning { DATASET_CONFIG_NAME="wikitext-2-raw-v1" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" if [ "${approach}" = "dynamic" ]; then - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${approach}" = "static" ]; then - approach="PostTrainingStatic" + approach="static" fi elif [ "${topology}" = "bert" ]; then if [ "${task}" = "mlm" ]; then @@ -105,11 +106,11 @@ function run_tuning { DATASET_CONFIG_NAME="wikitext-2-raw-v1" model_name_or_path="bert-base-uncased" if [ "${approach}" = "dynamic" ]; then - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${approach}" = "static" ]; then - approach="PostTrainingStatic" + approach="static" elif [ "${approach}" = "qat" ]; then - approach="QuantizationAwareTraining" + approach="qat" extra_cmd=$extra_cmd" --learning_rate 1e-5 \ --num_train_epochs 6 \ --eval_steps 100 \ @@ -119,7 +120,8 @@ function run_tuning { --evaluation_strategy steps \ --save_strategy steps \ --metric_for_best_model accuracy \ - --save_total_limit 1" + --save_total_limit 1 \ + --save_safetensors False" fi elif [ "${topology}" = "xlnet" ]; then if [ "${task}" = "plm" ]; then @@ -129,11 +131,11 @@ function run_tuning { DATASET_CONFIG_NAME="wikitext-2-raw-v1" model_name_or_path="xlnet-base-cased" if [ "${approach}" = "dynamic" ]; then - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${approach}" = "static" ]; then - approach="PostTrainingStatic" + approach="static" elif [ "${approach}" = "qat" ]; then - approach="QuantizationAwareTraining" + approach="qat" extra_cmd=$extra_cmd" --learning_rate 1e-5 \ --num_train_epochs 6 \ --eval_steps 100 \ @@ -143,7 +145,8 @@ function 
run_tuning { --evaluation_strategy steps \ --save_strategy steps \ --metric_for_best_model accuracy \ - --save_total_limit 1" + --save_total_limit 1 \ + --save_safetensors False" fi elif [ "${topology}" = "gpt_neox" ]; then if [ "${task}" = "clm" ]; then @@ -153,9 +156,9 @@ function run_tuning { DATASET_CONFIG_NAME="unshuffled_original_ast" model_name_or_path="abeja/gpt-neox-japanese-2.7b" if [ "${approach}" = "dynamic" ]; then - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${approach}" = "static" ]; then - approach="PostTrainingStatic" + approach="static" fi elif [ "${topology}" = "bloom" ]; then if [ "${task}" = "clm" ]; then @@ -164,7 +167,7 @@ function run_tuning { DATASET_NAME="lambada" model_name_or_path="bigscience/bloom-560m" if [ "${approach}" = "static" ]; then - approach="PostTrainingStatic" + approach="static" fi extra_cmd=$extra_cmd" --smooth_quant --sampling_size 400 --torchscript" fi diff --git a/examples/huggingface/pytorch/multiple-choice/quantization/README.md b/examples/huggingface/pytorch/multiple-choice/quantization/README.md index d966319be83..96caf84bc49 100644 --- a/examples/huggingface/pytorch/multiple-choice/quantization/README.md +++ b/examples/huggingface/pytorch/multiple-choice/quantization/README.md @@ -9,19 +9,17 @@ This example shows the model quantization for multiple choice task. A multiple c ``` pip install intel-extension-for-transformers pip install -r requirements.txt -pip install transformers==4.34.1 ``` ->**Note**: Please use transformers no higher than 4.34.1 # Run -The script `run_swag.py` provides three quantization approaches (PostTrainingStatic, PostTrainingStatic and QuantizationAwareTraining) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). +The script `run_swag.py` provides three quantization approaches (dynamic, static and qat) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). 
``` python run_swag.py \ --model_name_or_path ehdwns1516/bert-base-uncased_SWAG \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --do_train \ --do_eval \ --pad_to_max_length \ @@ -31,6 +29,6 @@ python run_swag.py \ # Validated model list -|DATASET|Pretrained model|PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining +|DATASET|Pretrained model|dynamic | static | qat |---|------------------------------------|---|---|--- |SWAG|ehdwns1516/bert-base-uncased_SWAG| ✅| ✅| ✅ diff --git a/examples/huggingface/pytorch/multiple-choice/quantization/run_swag.py b/examples/huggingface/pytorch/multiple-choice/quantization/run_swag.py index 511010f4da6..0304dd71c29 100644 --- a/examples/huggingface/pytorch/multiple-choice/quantization/run_swag.py +++ b/examples/huggingface/pytorch/multiple-choice/quantization/run_swag.py @@ -28,7 +28,13 @@ from dataclasses import dataclass, field from datasets import load_dataset from itertools import chain -from intel_extension_for_transformers.transformers import metrics, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( AutoConfig, @@ -215,9 +221,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="POSTTRAININGSTATIC", - metadata={"help": "Quantization approach. Supported approach are POSTTRAININGSTATIC, " - "POSTTRAININGDYNAMIC and QUANTIZATIONAWARETRAINING."}, + default="static", + metadata={"help": "Quantization approach. 
Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default=None, @@ -473,26 +479,44 @@ def compute_metrics(eval_predictions): model.config.save_pretrained(training_args.output_dir) trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "POSTTRAININGDYNAMIC": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) model.config.save_pretrained(training_args.output_dir) - if optim_args.quantization_approach == "QUANTIZATIONAWARETRAINING": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - metrics=[tune_metric], - sampling_size = len(train_dataset)//20 - ) + trainer.metrics = tune_metric + if optim_args.quantization_approach != "qat": + tuning_criterion = TuningCriterion(max_trials=600) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. + ) + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + tuning_criterion = TuningCriterion(max_trials=600) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
+ ) + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = trainer.quantize(quant_config=quantization_config) if optim_args.benchmark_only: diff --git a/examples/huggingface/pytorch/multiple-choice/quantization/run_tuning.sh b/examples/huggingface/pytorch/multiple-choice/quantization/run_tuning.sh index 3a718e34f0e..b8123802f38 100644 --- a/examples/huggingface/pytorch/multiple-choice/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/multiple-choice/quantization/run_tuning.sh @@ -11,7 +11,7 @@ function main { # init params function init_params { tuned_checkpoint="saved_results" - approach="PostTrainingStatic" + approach="static" batch_size=8 for var in "$@" do @@ -41,13 +41,13 @@ function init_params { function run_tuning { if [ "${topology}" = "bert_base_swag_static" ]; then model_name_or_path="ehdwns1516/bert-base-uncased_SWAG" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_base_swag_dynamic" ]; then model_name_or_path="ehdwns1516/bert-base-uncased_SWAG" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "bert_base_swag_qat" ]; then model_name_or_path="ehdwns1516/bert-base-uncased_SWAG" - approach="QuantizationAwareTraining" + approach="qat" extra_cmd=$extra_cmd" --learning_rate 1e-5 \ --num_train_epochs 6 \ --eval_steps 100 \ @@ -56,7 +56,8 @@ function run_tuning { --load_best_model_at_end True \ --evaluation_strategy steps \ --save_strategy steps \ - --save_total_limit 1" + --save_total_limit 1 \ + --save_safetensors False" fi python -u ./run_swag.py \ @@ -72,7 +73,8 @@ function run_tuning { --tune \ --pad_to_max_length \ --overwrite_cache \ - --overwrite_output_dir + --overwrite_output_dir \ + ${extra_cmd} } main 
"$@" diff --git a/examples/huggingface/pytorch/optimization_README.md b/examples/huggingface/pytorch/optimization_README.md index 2bafa814908..3d1909b01a6 100644 --- a/examples/huggingface/pytorch/optimization_README.md +++ b/examples/huggingface/pytorch/optimization_README.md @@ -4,7 +4,7 @@ Welcome to Pytorch Huggingface examples. The examples is following from [Hugging ## Quantization approach -| Task | PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining +| Task | dynamic | static | qat |---|:---:|:---:|:---:| |**`language-modeling`**| ✅ | ✅ | ✅ |**`multi-choice`**| ✅ | ✅ | ✅ diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/requirements.txt b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/requirements.txt index 1a42dc1dc28..1b6c3d214c6 100644 --- a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/requirements.txt +++ b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/bert_large/requirements.txt @@ -1,5 +1,5 @@ datasets >= 1.8.0 -torch == 2.0 +torch == 2.3.0 transformers intel-extension-for-pytorch==2.3.0 wandb diff --git a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/requirements.txt b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/requirements.txt index 1a42dc1dc28..1b6c3d214c6 100644 --- a/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/requirements.txt +++ b/examples/huggingface/pytorch/question-answering/deployment/squad/ipex/distilbert_base_uncased_sparse/requirements.txt @@ -1,5 +1,5 @@ datasets >= 1.8.0 -torch == 2.0 +torch == 2.3.0 transformers intel-extension-for-pytorch==2.3.0 wandb diff --git a/examples/huggingface/pytorch/question-answering/dynamic/README.md b/examples/huggingface/pytorch/question-answering/dynamic/README.md index 
8b79be87732..b1d1ab9e662 100644 --- a/examples/huggingface/pytorch/question-answering/dynamic/README.md +++ b/examples/huggingface/pytorch/question-answering/dynamic/README.md @@ -82,7 +82,7 @@ python run_qa.py \ python run_qa.py \ --model_name_or_path "sguskin/dynamic-minilmv2-L6-H384-squad1.1" \ --dataset_name squad \ ---quantization_approach PostTrainingStatic \ +--quantization_approach static \ --do_eval \ --do_train \ --tune \ diff --git a/examples/huggingface/pytorch/question-answering/dynamic/run_qa.py b/examples/huggingface/pytorch/question-answering/dynamic/run_qa.py index 348b3a56977..d6c32905700 100644 --- a/examples/huggingface/pytorch/question-answering/dynamic/run_qa.py +++ b/examples/huggingface/pytorch/question-answering/dynamic/run_qa.py @@ -29,7 +29,12 @@ import transformers from dataclasses import dataclass, field from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import metrics , OptimizedModel, QuantizationConfig, DynamicLengthConfig +from intel_extension_for_transformers.transformers import metrics, DynamicLengthConfig +from neural_compressor.config import ( + PostTrainingQuantConfig, + TuningCriterion, + AccuracyCriterion +) from trainer_qa import QuestionAnsweringTrainer from intel_extension_for_transformers.transformers.modeling.modeling_roberta_dynamic import RobertaForQuestionAnswering @@ -221,9 +226,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. 
Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_f1", @@ -771,26 +776,29 @@ def compute_metrics(p: EvalPrediction): trainer.save_model(training_args.output_dir) trainer.calib_dataloader = trainer.get_eval_dataloader() - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=200, - metrics=[tune_metric], - ) - quantization_config.framework = "pytorch_ipex" + trainer.metrics = tune_metric + if optim_args.quantization_approach != "qat": + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
+ ) + quantization_config = PostTrainingQuantConfig( + backend = "ipex", + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + model = trainer.quantize(quant_config=quantization_config) if optim_args.benchmark or optim_args.accuracy_only: diff --git a/examples/huggingface/pytorch/question-answering/orchestrate_optimizations/run_qa.py b/examples/huggingface/pytorch/question-answering/orchestrate_optimizations/run_qa.py index c07e38affc1..b9e416d41b9 100644 --- a/examples/huggingface/pytorch/question-answering/orchestrate_optimizations/run_qa.py +++ b/examples/huggingface/pytorch/question-answering/orchestrate_optimizations/run_qa.py @@ -33,13 +33,15 @@ import transformers from intel_extension_for_transformers.transformers import ( metrics, - PrunerConfig, - PruningConfig, - DistillationConfig, - QuantizationConfig, OptimizedModel, objectives ) +from neural_compressor.config import ( + WeightPruningConfig, + DistillationConfig, + KnowledgeDistillationLossConfig, + QuantizationAwareTrainingConfig, +) from torch.utils.data import DataLoader from tqdm import tqdm from trainer_qa import QuestionAnsweringTrainer @@ -225,7 +227,7 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply prune."}, ) pruning_approach: Optional[str] = field( - default="BasicMagnitude", + default="magnitude", metadata={"help": "Pruning approach. Supported approach is basic_magnite."}, ) target_sparsity_ratio: Optional[float] = field( @@ -245,9 +247,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="QuantizationAwareTraining", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="qat", + metadata={"help": "Quantization approach. 
Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_f1", @@ -789,7 +791,7 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset): # Trace model from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace - model = symbolic_trace(model, optim_args.quantization_approach=="QuantizationAwareTraining") + model = symbolic_trace(model, optim_args.quantization_approach=="qat") # Initialize our Trainer trainer = QuestionAnsweringTrainer( @@ -814,23 +816,20 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset): tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - prune_type = 'PatternLock' \ + prune_type = 'pattern_lock' \ if optim_args.pruning_approach else optim_args.pruning_approach target_sparsity_ratio = optim_args.target_sparsity_ratio \ if optim_args.target_sparsity_ratio else None - pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig(framework="pytorch_fx",pruner_config=[pruner_config], metrics=tune_metric) - distillation_conf = DistillationConfig(framework="pytorch_fx", metrics=tune_metric) - - objective = objectives.performance - quantization_conf = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective] - ) + trainer.metrics = tune_metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=target_sparsity_ratio, + pruning_scope="local", + pruning_type=prune_type) + distillation_criterion = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"]) + distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion) + quantization_conf = QuantizationAwareTrainingConfig() conf_list = [pruning_conf, distillation_conf, quantization_conf] - model = 
trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model) + model = trainer.orchestrate_optimizations(config_list=conf_list) if optim_args.benchmark or optim_args.accuracy_only: start_time = timeit.default_timer() @@ -839,7 +838,7 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset): max_eval_samples = data_args.max_eval_samples \ if data_args.max_eval_samples is not None else len(eval_dataset) eval_samples = min(max_eval_samples, len(eval_dataset)) - samples = eval_samples - (eval_samples % batch_size) \ + samples = eval_samples - (eval_samples % optim_args.batch_size) \ if training_args.dataloader_drop_last else eval_samples logger.info("metrics keys: {}".format(results.keys())) bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation', diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/README.md b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/README.md deleted file mode 100644 index f7054bf95fa..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/README.md +++ /dev/null @@ -1,68 +0,0 @@ -Step-by-Step -============ - -This document is used to list steps of reproducing PyTorch BERT pruning result. - -# Prerequisite - -## 1. Environment - -Recommend python 3.7 or higher version. - -### Install [intel-extension-for-transformers]() -``` -pip install intel-extension-for-transformers -``` - -### Install PyTorch - -Install pytorch-gpu, visit [pytorch.org](https://pytorch.org/). 
-```bash -# Install pytorch -pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html -``` - -### Install BERT dependency - -```bash -cd examples/pytorch/huggingface/question-answering/pruning/group_lasso -pip3 install -r requirements.txt --ignore-installed PyYAML -``` -```bash -git clone https://github.com/NVIDIA/apex -cd apex -pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ -``` -> **Note** -> -> If no CUDA runtime is found, please export CUDA_HOME='/usr/local/cuda'. - -## 2. Prepare Dataset - -* For SQuAD task, you should download SQuAD dataset from [SQuAD dataset link](https://rajpurkar.github.io/SQuAD-explorer/). -## 3. Prepare Model -* Please download BERT large pretrained model from [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/models/bert_pyt_ckpt_large_pretraining_amp_lamb/files?version=20.03.0). -```bash -# wget cmd -wget https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_large_pretraining_amp_lamb/versions/20.03.0/files/bert_large_pretrained_amp.pt - -# curl cmd -curl -LO https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_large_pretraining_amp_lamb/versions/20.03.0/files/bert_large_pretrained_amp.pt -``` -# Run -Enter your created conda env, then run the script. 
-```bash -bash scripts/run_squad_sparse.sh /path/to/model.pt 2.0 16 5e-5 tf32 /path/to/data /path/to/outdir prune_bert.yaml -``` -The default parameters are as follows: -```shell -init_checkpoint=${1:-"/path/to/ckpt_8601.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"4"} -learning_rate=${4:-"3e-5"} -precision=${5:-"tf32"} -BERT_PREP_WORKING_DIR=${6:-'/path/to/bert_data'} -OUT_DIR=${7:-"./results/SQuAD"} -prune_config=${8:-"prune_bert.yaml"} -``` - >**Note**: For original BERT readme, please refer [BERT README](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/README.md) diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/bert_config.json b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/bert_config.json deleted file mode 100644 index a7efa973d74..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/bert_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 1024, - "initializer_range": 0.02, - "intermediate_size": 4096, - "max_position_embeddings": 512, - "num_attention_heads": 16, - "num_hidden_layers": 24, - "type_vocab_size": 2, - "vocab_size": 30522 -} diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/extract_features.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/extract_features.py deleted file mode 100644 index dd206f52221..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/extract_features.py +++ /dev/null @@ -1,298 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Extract pre-computed feature vectors from a PyTorch BERT model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import collections -import logging -import json -import re - -import torch -from torch.utils.data import TensorDataset, DataLoader, SequentialSampler -from torch.utils.data.distributed import DistributedSampler - -from tokenization import BertTokenizer -from modeling import BertModel - -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) -logger = logging.getLogger(__name__) - - -class InputExample(object): - - def __init__(self, unique_id, text_a, text_b): - self.unique_id = unique_id - self.text_a = text_a - self.text_b = text_b - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): - self.unique_id = unique_id - self.tokens = tokens - self.input_ids = input_ids - self.input_mask = input_mask - self.input_type_ids = input_type_ids - - -def convert_examples_to_features(examples, seq_length, tokenizer): - """Loads a data file into a list of `InputBatch`s.""" - - features = [] - for (ex_index, example) in enumerate(examples): - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than 
the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > seq_length - 2: - tokens_a = tokens_a[0:(seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - input_type_ids = [] - tokens.append("[CLS]") - input_type_ids.append(0) - for token in tokens_a: - tokens.append(token) - input_type_ids.append(0) - tokens.append("[SEP]") - input_type_ids.append(0) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - input_type_ids.append(1) - tokens.append("[SEP]") - input_type_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
- while len(input_ids) < seq_length: - input_ids.append(0) - input_mask.append(0) - input_type_ids.append(0) - - assert len(input_ids) == seq_length - assert len(input_mask) == seq_length - assert len(input_type_ids) == seq_length - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("unique_id: %s" % (example.unique_id)) - logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) - - features.append( - InputFeatures( - unique_id=example.unique_id, - tokens=tokens, - input_ids=input_ids, - input_mask=input_mask, - input_type_ids=input_type_ids)) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def read_examples(input_file): - """Read a list of `InputExample`s from an input file.""" - examples = [] - unique_id = 0 - with open(input_file, "r", encoding='utf-8') as reader: - while True: - line = reader.readline() - if not line: - break - line = line.strip() - text_a = None - text_b = None - m = re.match(r"^(.*) \|\|\| (.*)$", line) - if m is None: - text_a = line - else: - text_a = m.group(1) - text_b = m.group(2) - examples.append( - InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) - unique_id += 1 - return examples - - -def main(): - parser = argparse.ArgumentParser() - - ## Required parameters - parser.add_argument("--input_file", default=None, type=str, required=True) - parser.add_argument("--output_file", default=None, type=str, required=True) - parser.add_argument("--bert_model", default=None, type=str, required=True, - help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") - - ## Other parameters - parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") - parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after WordPiece tokenization. 
Sequences longer " - "than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") - parser.add_argument("--local_rank", - type=int, - default=-1, - help = "local_rank for distributed training on gpus") - parser.add_argument("--no_cuda", - action='store_true', - help="Whether not to use CUDA when available") - - args = parser.parse_args() - - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() - else: - device = torch.device("cuda", args.local_rank) - n_gpu = 1 - # Initializes the distributed backend which will take care of synchronizing nodes/GPUs - torch.distributed.init_process_group(backend='nccl') - logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1))) - - layer_indexes = [int(x) for x in args.layers.split(",")] - - tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - - examples = read_examples(args.input_file) - - features = convert_examples_to_features( - examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) - - unique_id_to_feature = {} - for feature in features: - unique_id_to_feature[feature.unique_id] = feature - - model = BertModel.from_pretrained(args.bert_model) - model.to(device) - - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - - eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) - 
if args.local_rank == -1: - eval_sampler = SequentialSampler(eval_data) - else: - eval_sampler = DistributedSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) - - model.eval() - with open(args.output_file, "w", encoding='utf-8') as writer: - for input_ids, input_mask, example_indices in eval_dataloader: - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - - all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) - all_encoder_layers = all_encoder_layers - - for b, example_index in enumerate(example_indices): - feature = features[example_index.item()] - unique_id = int(feature.unique_id) - # feature = unique_id_to_feature[unique_id] - output_json = collections.OrderedDict() - output_json["linex_index"] = unique_id - all_out_features = [] - for (i, token) in enumerate(feature.tokens): - all_layers = [] - for (j, layer_index) in enumerate(layer_indexes): - layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy() - layer_output = layer_output[b] - layers = collections.OrderedDict() - layers["index"] = layer_index - layers["values"] = [ - round(x.item(), 6) for x in layer_output[i] - ] - all_layers.append(layers) - out_features = collections.OrderedDict() - out_features["token"] = token - out_features["layers"] = all_layers - all_out_features.append(out_features) - output_json["features"] = all_out_features - writer.write(json.dumps(output_json) + "\n") - - -if __name__ == "__main__": - main() diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/file_utils.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/file_utils.py deleted file mode 100644 index cdefb125839..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/file_utils.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Utilities for working with the local dataset cache. -This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp -Copyright by the AllenNLP authors. -""" - -from __future__ import (absolute_import, division, print_function, unicode_literals) - -import json -import logging -import os -import shutil -import tempfile -from functools import wraps -from hashlib import sha256 -import sys -from io import open - -import boto3 -import requests -from botocore.exceptions import ClientError -from tqdm import tqdm - -try: - from urllib.parse import urlparse -except ImportError: - from urlparse import urlparse - -try: - from pathlib import Path - PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - Path.home() / '.pytorch_pretrained_bert')) -except AttributeError: - PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name - - -def url_to_filename(url, etag=None): - """ - Convert `url` into a hashed filename in a repeatable way. - If `etag` is specified, append its hash to the url's, delimited - by a period. - """ - url_bytes = url.encode('utf-8') - url_hash = sha256(url_bytes) - filename = url_hash.hexdigest() - - if etag: - etag_bytes = etag.encode('utf-8') - etag_hash = sha256(etag_bytes) - filename += '.' 
+ etag_hash.hexdigest() - - return filename - - -def filename_to_url(filename, cache_dir=None): - """ - Return the url and etag (which may be ``None``) stored for `filename`. - Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - cache_path = os.path.join(cache_dir, filename) - if not os.path.exists(cache_path): - raise EnvironmentError("file {} not found".format(cache_path)) - - meta_path = cache_path + '.json' - if not os.path.exists(meta_path): - raise EnvironmentError("file {} not found".format(meta_path)) - - with open(meta_path, encoding="utf-8") as meta_file: - metadata = json.load(meta_file) - url = metadata['url'] - etag = metadata['etag'] - - return url, etag - - -def cached_path(url_or_filename, cache_dir=None): - """ - Given something that might be a URL (or might be a local path), - determine which. If it's a URL, download the file and cache it, and - return the path to the cached file. If it's already a local path, - make sure the file exists and then return the path. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): - url_or_filename = str(url_or_filename) - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - parsed = urlparse(url_or_filename) - - if parsed.scheme in ('http', 'https', 's3'): - # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, cache_dir) - elif os.path.exists(url_or_filename): - # File, and it exists. - return url_or_filename - elif parsed.scheme == '': - # File, but it doesn't exist. 
- raise EnvironmentError("file {} not found".format(url_or_filename)) - else: - # Something unknown - raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) - - -def split_s3_path(url): - """Split a full s3 path into the bucket name and path.""" - parsed = urlparse(url) - if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) - bucket_name = parsed.netloc - s3_path = parsed.path - # Remove '/' at beginning of path. - if s3_path.startswith("/"): - s3_path = s3_path[1:] - return bucket_name, s3_path - - -def s3_request(func): - """ - Wrapper function for s3 requests in order to create more helpful error - messages. - """ - - @wraps(func) - def wrapper(url, *args, **kwargs): - try: - return func(url, *args, **kwargs) - except ClientError as exc: - if int(exc.response["Error"]["Code"]) == 404: - raise EnvironmentError("file {} not found".format(url)) - else: - raise - - return wrapper - - -@s3_request -def s3_etag(url): - """Check ETag on S3 object.""" - s3_resource = boto3.resource("s3") - bucket_name, s3_path = split_s3_path(url) - s3_object = s3_resource.Object(bucket_name, s3_path) - return s3_object.e_tag - - -@s3_request -def s3_get(url, temp_file): - """Pull a file directly from S3.""" - s3_resource = boto3.resource("s3") - bucket_name, s3_path = split_s3_path(url) - s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) - - -def http_get(url, temp_file): - req = requests.get(url, stream=True) - content_length = req.headers.get('Content-Length') - total = int(content_length) if content_length is not None else None - progress = tqdm(unit="B", total=total) - for chunk in req.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - - -def get_from_cache(url, cache_dir=None): - """ - Given a URL, look for the corresponding dataset in the local cache. - If it's not there, download it. 
Then return the path to the cached file. - """ - if cache_dir is None: - cache_dir = PYTORCH_PRETRAINED_BERT_CACHE - if sys.version_info[0] == 3 and isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - if not os.path.exists(cache_dir): - os.makedirs(cache_dir) - - # Get eTag to add to filename, if it exists. - if url.startswith("s3://"): - etag = s3_etag(url) - else: - response = requests.head(url, allow_redirects=True) - if response.status_code != 200: - raise IOError("HEAD request failed for url {} with status code {}" - .format(url, response.status_code)) - etag = response.headers.get("ETag") - - filename = url_to_filename(url, etag) - - # get cache path to put the file - cache_path = os.path.join(cache_dir, filename) - - if not os.path.exists(cache_path): - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with tempfile.NamedTemporaryFile() as temp_file: - logger.info("%s not found in cache, downloading to %s", url, temp_file.name) - - # GET file object - if url.startswith("s3://"): - s3_get(url, temp_file) - else: - http_get(url, temp_file) - - # we are copying the file before closing it, so flush to avoid truncation - temp_file.flush() - # shutil.copyfileobj() starts at the current position, so go to the start - temp_file.seek(0) - - logger.info("copying %s to cache at %s", temp_file.name, cache_path) - with open(cache_path, 'wb') as cache_file: - shutil.copyfileobj(temp_file, cache_file) - - logger.info("creating metadata file for %s", cache_path) - meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' - with open(meta_path, 'w', encoding="utf-8") as meta_file: - json.dump(meta, meta_file) - - logger.info("removing temp file %s", temp_file.name) - - return cache_path - - -def read_set_from_file(filename): - ''' - Extract a de-duped collection (set) of text from a file. - Expected file format is one item per line. 
- ''' - collection = set() - with open(filename, 'r', encoding='utf-8') as file_: - for line in file_: - collection.add(line.rstrip()) - return collection - - -def get_file_extension(path, dot=True, lower=True): - ext = os.path.splitext(path)[1] - ext = ext if dot else ext[1:] - return ext.lower() if lower else ext diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py deleted file mode 100644 index cebd2b17f75..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/modeling.py +++ /dev/null @@ -1,1285 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""PyTorch BERT model.""" - -from __future__ import absolute_import, division, print_function, unicode_literals - -import copy -import json -import logging -import math -import os -import shutil -import tarfile -import tempfile -import sys -from io import open - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss -from torch.utils import checkpoint - -sys.path.append('/workspace/bert/') -from file_utils import cached_path - -from torch.nn import Module -from torch.nn.parameter import Parameter -import torch.nn.functional as F -import torch.nn.init as init - -logger = logging.getLogger(__name__) - -PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", -} -CONFIG_NAME = 'bert_config.json' -WEIGHTS_NAME = 'pytorch_model.bin' -TF_WEIGHTS_NAME = 'model.ckpt' - -def load_tf_weights_in_bert(model, tf_checkpoint_path): - """ Load tf checkpoints in a pytorch model - """ - try: - import re - import numpy as np - import tensorflow as tf - except ImportError: - print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " - "https://www.tensorflow.org/install/ for installation instructions.") - raise - tf_path = os.path.abspath(tf_checkpoint_path) - print("Converting TensorFlow checkpoint from {}".format(tf_path)) - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - print("Loading TF weight {} with shape {}".format(name, shape)) - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split('/') - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any(n in ["adam_v", "adam_m"] for n in name): - print("Skipping {}".format("/".join(name))) - continue - pointer = model - for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) - else: - l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - else: - pointer = getattr(pointer, l[0]) - if len(l) >= 2: - num = int(l[1]) - pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': - array = np.ascontiguousarray(np.transpose(array)) - try: - assert pointer.shape == array.shape - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - print("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) - return model - -def gelu(x): - return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) - -#used only for triton inference -def bias_gelu(bias, y): - x = bias + y - return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) - -# used specifically for training since torch.nn.functional.gelu breaks ONNX export -def 
bias_gelu_training(bias, y): - x = bias + y - return torch.nn.functional.gelu(x) # Breaks ONNX export - -def bias_tanh(bias, y): - x = bias + y - return torch.tanh(x) - -def swish(x): - return x * torch.sigmoid(x) - -#torch.nn.functional.gelu(x) # Breaks ONNX export -ACT2FN = {"gelu": gelu, "bias_gelu": bias_gelu, "bias_tanh": bias_tanh, "relu": torch.nn.functional.relu, "swish": swish} - -class LinearActivation(Module): - r"""Fused Linear and activation Module. - """ - __constants__ = ['bias'] - - def __init__(self, in_features, out_features, act='gelu', bias=True): - super(LinearActivation, self).__init__() - self.in_features = in_features - self.out_features = out_features - self.act_fn = nn.Identity() # - self.biased_act_fn = None # - self.bias = None # - if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)): # For TorchScript - if bias and not 'bias' in act: # compatibility - act = 'bias_' + act # - self.biased_act_fn = ACT2FN[act] # - - else: - self.act_fn = ACT2FN[act] - else: - self.act_fn = act - self.weight = Parameter(torch.Tensor(out_features, in_features)) - if bias: - self.bias = Parameter(torch.Tensor(out_features)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def reset_parameters(self): - init.kaiming_uniform_(self.weight, a=math.sqrt(5)) - if self.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) - bound = 1 / math.sqrt(fan_in) - init.uniform_(self.bias, -bound, bound) - - def forward(self, input): - if not self.bias is None: - return self.biased_act_fn(self.bias, F.linear(input, self.weight, None)) - else: - return self.act_fn(F.linear(input, self.weight, self.bias)) - - def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, self.out_features, self.bias is not None - ) - - -class BertConfig(object): - """Configuration class to store the configuration of a `BertModel`. 
- """ - def __init__(self, - vocab_size_or_config_json_file, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - output_all_encoded_layers=False): - """Constructs BertConfig. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. 
- """ - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.output_all_encoded_layers = output_all_encoded_layers - else: - raise ValueError("First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)") - - @classmethod - def from_dict(cls, json_object): - """Constructs a `BertConfig` from a Python dictionary of parameters.""" - config = BertConfig(vocab_size_or_config_json_file=-1) - for key, value in json_object.items(): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - -class 
BertNonFusedLayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-12): - """Construct a layernorm module in the TF style (epsilon inside the square root). - """ - super(BertNonFusedLayerNorm, self).__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.bias = nn.Parameter(torch.zeros(hidden_size)) - self.variance_epsilon = eps - - def forward(self, x): - u = x.mean(-1, keepdim=True) - s = (x - u) - s = s * s - s = s.mean(-1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.variance_epsilon) - return self.weight * x + self.bias - -try: - import apex - #apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm') - import apex.normalization - from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction - #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') - #BertLayerNorm = apex.normalization.FusedLayerNorm - APEX_IS_AVAILABLE = True -except ImportError: - print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") - #BertLayerNorm = BertNonFusedLayerNorm - APEX_IS_AVAILABLE = False -class BertLayerNorm(Module): - def __init__(self, hidden_size, eps=1e-12): - super(BertLayerNorm, self).__init__() - self.shape = torch.Size((hidden_size,)) - self.eps = eps - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.bias = nn.Parameter(torch.zeros(hidden_size)) - self.apex_enabled = APEX_IS_AVAILABLE - - @torch.jit.unused - def fused_layer_norm(self, x): - return FusedLayerNormAffineFunction.apply( - x, self.weight, self.bias, self.shape, self.eps) - - - def forward(self, x): - if self.apex_enabled and not torch.jit.is_scripting(): - x = self.fused_layer_norm(x) - else: - u = x.mean(-1, keepdim=True) - s = (x - u) - s = s * s - s = s.mean(-1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.eps) - x = self.weight * x + self.bias - return x - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and 
token_type embeddings. - """ - def __init__(self, config): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids): - seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) - - words_embeddings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = words_embeddings + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - def __init__(self, config): - super(BertSelfAttention, self).__init__() - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = 
nn.Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = torch.reshape(x, new_x_shape) - return x.permute(0, 2, 1, 3) - - def transpose_key_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = torch.reshape(x, new_x_shape) - return x.permute(0, 2, 3, 1) - - def forward(self, hidden_states, attention_mask): - mixed_query_layer = self.query(hidden_states) - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - - query_layer = self.transpose_for_scores(mixed_query_layer) - key_layer = self.transpose_key_for_scores(mixed_key_layer) - value_layer = self.transpose_for_scores(mixed_value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = F.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = torch.reshape(context_layer, new_context_layer_shape) - return context_layer - - -class BertSelfOutput(nn.Module): - def __init__(self, config): - super(BertSelfOutput, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - def __init__(self, config): - super(BertAttention, self).__init__() - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - - def forward(self, input_tensor, attention_mask): - self_output = self.self(input_tensor, attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output - - -class BertIntermediate(nn.Module): - def __init__(self, config): - super(BertIntermediate, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, config.intermediate_size, act=config.hidden_act) - - def forward(self, hidden_states): - hidden_states = self.dense_act(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - def __init__(self, config): - super(BertOutput, self).__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = 
self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - def __init__(self, config): - super(BertLayer, self).__init__() - self.attention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward(self, hidden_states, attention_mask): - attention_output = self.attention(hidden_states, attention_mask) - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - -class BertEncoder(nn.Module): - def __init__(self, config): - super(BertEncoder, self).__init__() - self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) - self.output_all_encoded_layers = config.output_all_encoded_layers - self._checkpoint_activations = False - - @torch.jit.unused - def checkpointed_forward(self, hidden_states, attention_mask): - def custom(start, end): - def custom_forward(*inputs): - layers = self.layer[start:end] - x_ = inputs[0] - for layer in layers: - x_ = layer(x_, inputs[1]) - return x_ - return custom_forward - - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, l+chunk_length), hidden_states, attention_mask*1) - l += chunk_length - - return hidden_states - - def forward(self, hidden_states, attention_mask): - all_encoder_layers = [] - - if self._checkpoint_activations: - hidden_states = self.checkpointed_forward(hidden_states, attention_mask) - else: - for i,layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask) - - if self.output_all_encoded_layers: - all_encoder_layers.append(hidden_states) - - if not self.output_all_encoded_layers or self._checkpoint_activations: - all_encoder_layers.append(hidden_states) - return all_encoder_layers - -class 
BertPooler(nn.Module): - def __init__(self, config): - super(BertPooler, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh") - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense_act(first_token_tensor) - return pooled_output - - -class BertPredictionHeadTransform(nn.Module): - def __init__(self, config): - super(BertPredictionHeadTransform, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - - def forward(self, hidden_states): - hidden_states = self.dense_act(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class BertLMPredictionHead(nn.Module): - def __init__(self, config, bert_model_embedding_weights): - super(BertLMPredictionHead, self).__init__() - self.transform = BertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. 
- self.decoder = nn.Linear(bert_model_embedding_weights.size(1), - bert_model_embedding_weights.size(0), - bias=False) - self.decoder.weight = bert_model_embedding_weights - self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) + self.bias - return hidden_states - - -class BertOnlyMLMHead(nn.Module): - def __init__(self, config, bert_model_embedding_weights): - super(BertOnlyMLMHead, self).__init__() - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(nn.Module): - def __init__(self, config): - super(BertOnlyNSPHead, self).__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(nn.Module): - def __init__(self, config, bert_model_embedding_weights): - super(BertPreTrainingHeads, self).__init__() - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class BertPreTrainedModel(nn.Module): - """ An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. - """ - def __init__(self, config, *inputs, **kwargs): - super(BertPreTrainedModel, self).__init__() - if not isinstance(config, BertConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. 
" - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__ - )) - self.config = config - - def init_bert_weights(self, module): - """ Initialize the weights. - """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, BertLayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - def checkpoint_activations(self, val): - def _apply_flag(module): - if hasattr(module, "_checkpoint_activations"): - module._checkpoint_activations=val - self.apply(_apply_flag) - def enable_apex(self, val): - def _apply_flag(module): - if hasattr(module, "apex_enabled"): - module.apex_enabled=val - self.apply(_apply_flag) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, - from_tf=False, *inputs, **kwargs): - """ - Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. - Download and cache the pre-trained model file if needed. - - Params: - pretrained_model_name_or_path: either: - - a str with the name of a pre-trained model to load selected in the list of: - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-large-cased` - . `bert-base-multilingual-uncased` - . `bert-base-multilingual-cased` - . `bert-base-chinese` - - a path or url to a pretrained model archive containing: - . `bert_config.json` a configuration file for the model - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance - - a path or url to a pretrained model archive containing: - . 
`bert_config.json` a configuration file for the model - . `model.chkpt` a TensorFlow checkpoint - from_tf: should we load the weights from a locally saved TensorFlow checkpoint - cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models - *inputs, **kwargs: additional input for the specific Bert class - (ex: num_labels for BertForSequenceClassification) - """ - if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: - archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] - else: - archive_file = pretrained_model_name_or_path - # redirect to the cache, if necessary - try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) - except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), - archive_file)) - return None - if resolved_archive_file == archive_file: - logger.info("loading archive file {}".format(archive_file)) - else: - logger.info("loading archive file {} from cache at {}".format( - archive_file, resolved_archive_file)) - tempdir = None - if os.path.isdir(resolved_archive_file) or from_tf: - serialization_dir = resolved_archive_file - else: - # Extract archive to temp dir - tempdir = tempfile.mkdtemp() - logger.info("extracting archive file {} to temp dir {}".format( - resolved_archive_file, tempdir)) - if os.path.isfile(resolved_archive_file) and tarfile.is_tarfile(resolved_archive_file): - with tarfile.open(resolved_archive_file, 'r:gz') as archive: - archive.extractall(tempdir) - else: - logger.error("Invalid tar file {}".format(resolved_archive_file)) - serialization_dir = tempdir - # Load config - config_file = 
os.path.join(serialization_dir, CONFIG_NAME) - config = BertConfig.from_json_file(config_file) - logger.info("Model config {}".format(config)) - # Instantiate model. - model = cls(config, *inputs, **kwargs) - if state_dict is None and not from_tf: - weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None) - if tempdir: - # Clean up temp dir - shutil.rmtree(tempdir) - if from_tf: - # Directly load from a TensorFlow checkpoint - weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME) - return load_tf_weights_in_bert(model, weights_path) - # Load from a PyTorch state_dict - old_keys = [] - new_keys = [] - for key in state_dict.keys(): - new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') - if new_key: - old_keys.append(key) - new_keys.append(new_key) - for old_key, new_key in zip(old_keys, new_keys): - state_dict[new_key] = state_dict.pop(old_key) - - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - def load(module, prefix=''): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - start_prefix = '' - if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): - start_prefix = 'bert.' 
- load(model, prefix=start_prefix) - if len(missing_keys) > 0: - logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) - if len(error_msgs) > 0: - raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) - return model - - -class BertModel(BertPreTrainedModel): - """BERT model ("Bidirectional Embedding Representations from a Transformer"). - - Params: - config: a BertConfig class instance with the configuration to build a new model - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - - Outputs: Tuple of (encoded_layers, pooled_output) - `encoded_layers`: controlled by `output_all_encoded_layers` argument: - - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end - of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each - encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], - - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding - to the last attention block of shape [batch_size, sequence_length, hidden_size], - `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a - classifier pretrained on top of the hidden state associated to the first character of the - input (`CLS`) to train on the Next-Sentence task (see BERT's paper). - - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = modeling.BertModel(config=config) - all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertModel, self).__init__(config) - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) - self.apply(self.init_bert_weights) - self.output_all_encoded_layers = config.output_all_encoded_layers - - def forward(self, input_ids, token_type_ids, attention_mask): - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 
- extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=self.embeddings.word_embeddings.weight.dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - - embedding_output = self.embeddings(input_ids, token_type_ids) - encoded_layers = self.encoder(embedding_output, extended_attention_mask) - sequence_output = encoded_layers[-1] - pooled_output = self.pooler(sequence_output) - if not self.output_all_encoded_layers: - encoded_layers = encoded_layers[-1:] - return encoded_layers, pooled_output - - -class BertForPreTraining(BertPreTrainedModel): - """BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads: - - the masked language modeling head, and - - the next sentence classification head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. 
It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - - Outputs: - if `masked_lm_labels` and `next_sentence_label` are not `None`: - Outputs the total_loss which is the sum of the masked language modeling loss and the next - sentence classification loss. - if `masked_lm_labels` or `next_sentence_label` is `None`: - Outputs a tuple comprising - - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and - - the next sentence classification logits of shape [batch_size, 2]. 
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForPreTraining(config) - masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertForPreTraining, self).__init__(config) - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids, attention_mask): - encoded_layers, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = encoded_layers[-1] - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - return prediction_scores, seq_relationship_score - - -class BertForMaskedLM(BertPreTrainedModel): - """BERT model with the masked language modeling head. - This module comprises the BERT model followed by the masked language modeling head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). 
- `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - - Outputs: - if `masked_lm_labels` is not `None`: - Outputs the masked language modeling loss. - if `masked_lm_labels` is `None`: - Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. - - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForMaskedLM(config) - masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertForMaskedLM, self).__init__(config) - self.bert = BertModel(config) - self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): - encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = encoded_layers[-1] - prediction_scores = self.cls(sequence_output) - - if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = 
loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) - return masked_lm_loss - else: - return prediction_scores - - -class BertForNextSentencePrediction(BertPreTrainedModel): - """BERT model with next sentence prediction head. - This module comprises the BERT model followed by the next sentence classification head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - - Outputs: - if `next_sentence_label` is not `None`: - Outputs the total_loss which is the sum of the masked language modeling loss and the next - sentence classification loss. - if `next_sentence_label` is `None`: - Outputs the next sentence classification logits of shape [batch_size, 2]. 
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForNextSentencePrediction(config) - seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertForNextSentencePrediction, self).__init__(config) - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - seq_relationship_score = self.cls( pooled_output) - - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - return next_sentence_loss - else: - return seq_relationship_score - - -class BertForSequenceClassification(BertPreTrainedModel): - """BERT model for classification. - This module is composed of the BERT model with a linear layer on top of - the pooled output. - - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_labels`: the number of classes for the classifier. Default = 2. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. 
Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] - with indices selected in [0, ..., num_labels]. - - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, num_labels]. - - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - num_labels = 2 - - model = BertForSequenceClassification(config, num_labels) - logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config, num_labels): - super(BertForSequenceClassification, self).__init__(config) - self.num_labels = num_labels - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_labels) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - pooled_output = self.dropout(pooled_output) - return self.classifier(pooled_output) - - -class BertForMultipleChoice(BertPreTrainedModel): - """BERT 
model for multiple choice tasks. - This module is composed of the BERT model with a linear layer on top of - the pooled output. - - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_choices`: the number of classes for the classifier. Default = 2. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` - and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] - with indices selected in [0, ..., num_choices]. - - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, num_labels]. 
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) - input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) - token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - num_choices = 2 - - model = BertForMultipleChoice(config, num_choices) - logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config, num_choices): - super(BertForMultipleChoice, self).__init__(config) - self.num_choices = num_choices - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - flat_input_ids = input_ids.view(-1, input_ids.size(-1)) - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, self.num_choices) - - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - return loss - else: - return reshaped_logits - - -class BertForTokenClassification(BertPreTrainedModel): - """BERT model for token-level classification. - This module is composed of the BERT model with a linear layer on top of - the full hidden state of the last layer. - - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_labels`: the number of classes for the classifier. 
Default = 2. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [0, ..., num_labels]. - - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - num_labels = 2 - - model = BertForTokenClassification(config, num_labels) - logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config, num_labels): - super(BertForTokenClassification, self).__init__(config) - self.num_labels = num_labels - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_labels) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = encoded_layers[-1] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return loss - else: - return logits - - -class BertForQuestionAnswering(BertPreTrainedModel): - """BERT model for Question Answering (span extraction). - This module is composed of the BERT model with a linear layer on top of - the sequence output that computes start_logits and end_logits - - Params: - `config`: a BertConfig class instance with the configuration to build a new model. 
- - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - - Outputs: - Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end - position tokens of shape [batch_size, sequence_length]. 
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForQuestionAnswering(config) - start_logits, end_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - super(BertForQuestionAnswering, self).__init__(config) - self.bert = BertModel(config) - # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version - # self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - self.apply(self.init_bert_weights) - - def forward(self, input_ids, token_type_ids, attention_mask): - encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = encoded_layers[-1] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - return start_logits, end_logits diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/optimization.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/optimization.py deleted file mode 100644 index 5881a5b5156..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/optimization.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""PyTorch optimization for BERT model.""" - -import math -import torch -from torch.optim import Optimizer -from torch.optim.optimizer import required -from torch.nn.utils import clip_grad_norm_ -#from fused_adam_local import FusedAdam -from apex.optimizers import FusedAdam -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C -from utils import is_main_process - -multi_tensor_l2norm = amp_C.multi_tensor_l2norm -lamb_compute_update = amp_C.multi_tensor_lamb_stage1_cuda -lamb_apply_update = amp_C.multi_tensor_lamb_stage2_cuda -scale = amp_C.multi_tensor_scale - - -def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x/warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) - -def warmup_constant(x, warmup=0.002): - if x < warmup: - return x/warmup - return 1.0 - -def warmup_linear(x, warmup=0.002): - if x < warmup: - return x/warmup - return max((x - 1. )/ (warmup - 1.), 0.) - -def warmup_poly(x, warmup=0.002, degree=0.5): - if x < warmup: - return x/warmup - return (1.0 - x)**degree - - -SCHEDULES = { - 'warmup_cosine':warmup_cosine, - 'warmup_constant':warmup_constant, - 'warmup_linear':warmup_linear, - 'warmup_poly':warmup_poly, -} - -class BertAdam(Optimizer): - """Implements BERT version of Adam algorithm with weight decay fix. - Params: - lr: learning rate - warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 - t_total: total number of training steps for the learning - rate schedule, -1 means constant learning rate. Default: -1 - schedule: schedule to use for the warmup (see above). 
Default: 'warmup_linear' - b1: Adams b1. Default: 0.9 - b2: Adams b2. Default: 0.999 - e: Adams epsilon. Default: 1e-6 - weight_decay: Weight decay. Default: 0.01 - max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 - """ - def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', - b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, - max_grad_norm=1.0): - if lr is not required and lr < 0.0: - raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if schedule not in SCHEDULES: - raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) - if not 0.0 <= b1 < 1.0: - raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) - if not 0.0 <= b2 < 1.0: - raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) - if not e >= 0.0: - raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) - defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, - b1=b1, b2=b2, e=e, weight_decay=weight_decay, - max_grad_norm=max_grad_norm) - super(BertAdam, self).__init__(params, defaults) - - def get_lr(self): - lr = [] - for group in self.param_groups: - for p in group['params']: - state = self.state[p] - if len(state) == 0: - return [0] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] - lr.append(lr_scheduled) - return lr - - def step(self, closure=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - loss = closure() - for group in self.param_groups: - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['next_m'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['next_v'] = torch.zeros_like(p.data) - - next_m, next_v = state['next_m'], state['next_v'] - beta1, beta2 = group['b1'], group['b2'] - - # Add grad clipping - if group['max_grad_norm'] > 0: - clip_grad_norm_(p, group['max_grad_norm']) - - # Decay the first and second moment running average coefficient - # In-place operations to update the averages at the same time - next_m.mul_(beta1).add_(1 - beta1, grad) - next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) - update = next_m / (next_v.sqrt() + group['e']) - - # Just adding the square of the weights to the loss function is *not* - # the correct way of using L2 regularization/weight decay with Adam, - # since that will interact with the m and v parameters in strange ways. - # - # Instead we want to decay the weights in a manner that doesn't interact - # with the m/v parameters. This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. 
- if group['weight_decay'] > 0.0: - update += group['weight_decay'] * p.data - - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] - - update_with_lr = lr_scheduled * update - p.data.add_(-update_with_lr) - - state['step'] += 1 - - return loss diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/requirements.txt b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/requirements.txt deleted file mode 100644 index 9741bff445c..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/requirements.txt +++ /dev/null @@ -1,18 +0,0 @@ -# progress bars in model download and training scripts -tqdm -# Accessing files from S3 directly. -boto3 -# Used for downloading models over HTTP -requests -six -ipdb -#Data processing -h5py -nltk -progressbar -#Others -numpy -onnxruntime -requests -urllib3 -git+https://github.com/NVIDIA/dllogger diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/run_squad_sparse.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/run_squad_sparse.py deleted file mode 100644 index 864c3cb8666..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/run_squad_sparse.py +++ /dev/null @@ -1,1285 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Run BERT on SQuAD.""" - -from __future__ import absolute_import, division, print_function - -import argparse -import collections -import dllogger, time -import json -import logging -import math -import modeling -import numpy as np -import os -import random -import sys -import torch -from apex import amp -from file_utils import PYTORCH_PRETRAINED_BERT_CACHE -from io import open -from optimization import BertAdam, warmup_linear -from schedulers import LinearWarmUpScheduler -from torch.utils.data import( - DataLoader, - RandomSampler, - SequentialSampler, - TensorDataset -) -from torch.utils.data.distributed import DistributedSampler -from tqdm import tqdm -from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize) -from utils import is_main_process, format_step -import builtins -import io - -safe_builtins = { - 'range', - 'complex', - 'set', - 'frozenset', - 'slice', -} - -torch._C._jit_set_profiling_mode(False) -torch._C._jit_set_profiling_executor(False) - -if sys.version_info[0] == 2: - import cPickle as pickle -else: - import pickle - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.INFO) -logger = logging.getLogger(__name__) - - -class SquadExample(object): - """ - A single training/test example for the Squad dataset. - For examples without an answer, the start and end position are -1. 
- """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.end_position: - s += ", end_position: %d" % (self.end_position) - if self.is_impossible: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - -def read_squad_examples(input_file, is_training, version_2_with_negative): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples 
= [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. 
'%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - return examples - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - - features = [] - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. 
- # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
- while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - start_position = None - end_position = None - if is_training and not example.is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - if is_training and example.is_impossible: - start_position = 0 - end_position = 0 - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position, - is_impossible=example.is_impossible)) - unique_id += 1 - - return features - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - - # The SQuAD annotations are character based. We first project them to - # whitespace-tokenized words. But then after WordPiece tokenization, we can - # often find a "better match". For example: - # - # Question: What year was John Smith born? - # Context: The leader was John Smith (1895-1943). - # Answer: 1895 - # - # The original whitespace-tokenized answer will be "(1895-1943).". 
However - # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match - # the exact answer, 1895. - # - # However, this is not always possible. Consider the following: - # - # Question: What country is the top exporter of electornics? - # Context: The Japanese electronics industry is the lagest in the world. - # Answer: Japan - # - # In this case, the annotator chose "Japan" as a character sub-span of - # the word "Japanese". Since our WordPiece tokenizer does not split - # "Japanese", we just use "Japanese" as the annotation. This is fairly rare - # in SQuAD, but does happen. - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. 
- best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - - -def get_answers(examples, features, results, args): - predictions = collections.defaultdict(list) #it is possible that one example corresponds to multiple features - Prediction = collections.namedtuple('Prediction', ['text', 'start_logit', 'end_logit']) - - if args.version_2_with_negative: - null_vals = collections.defaultdict(lambda: (float("inf"),0,0)) - for ex, feat, result in match_results(examples, features, results): - start_indices = _get_best_indices(result.start_logits, args.n_best_size) - end_indices = _get_best_indices(result.end_logits, args.n_best_size) - prelim_predictions = get_valid_prelim_predictions(start_indices, end_indices, feat, result, args) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - if args.version_2_with_negative: - score = result.start_logits[0] + result.end_logits[0] - if score < null_vals[ex.qas_id][0]: - null_vals[ex.qas_id] = (score, result.start_logits[0], result.end_logits[0]) - - curr_predictions = [] - seen_predictions = [] - for pred in prelim_predictions: - if len(curr_predictions) == args.n_best_size: - break - if pred.start_index > 0: # this is a non-null prediction TODO: this probably is irrelevant - final_text = get_answer_text(ex, feat, pred, args) - if final_text in seen_predictions: - continue - else: - final_text = "" - - 
seen_predictions.append(final_text) - curr_predictions.append(Prediction(final_text, pred.start_logit, pred.end_logit)) - predictions[ex.qas_id] += curr_predictions - - #Add empty prediction - if args.version_2_with_negative: - for qas_id in predictions.keys(): - predictions[qas_id].append(Prediction('', - null_vals[ex.qas_id][1], - null_vals[ex.qas_id][2])) - - - nbest_answers = collections.defaultdict(list) - answers = {} - for qas_id, preds in predictions.items(): - nbest = sorted( - preds, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True)[:args.n_best_size] - - # In very rare edge cases we could only have single null prediction. - # So we just create a nonce prediction in this case to avoid failure. - if not nbest: - nbest.append(Prediction(text="empty", start_logit=0.0, end_logit=0.0)) - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry and entry.text: - best_non_null_entry = entry - probs = _compute_softmax(total_scores) - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_answers[qas_id].append(output) - if args.version_2_with_negative: - score_diff = null_vals[qas_id][0] - best_non_null_entry.start_logit - best_non_null_entry.end_logit - if score_diff > args.null_score_diff_threshold: - answers[qas_id] = "" - else: - answers[qas_id] = best_non_null_entry.text - else: - answers[qas_id] = nbest_answers[qas_id][0]['text'] - - return answers, nbest_answers - -def get_answer_text(example, feature, pred, args): - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] 
- tok_text = " ".join(tok_tokens) - - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, args.do_lower_case, args.verbose_logging) - return final_text - -def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args): - - _PrelimPrediction = collections.namedtuple( - "PrelimPrediction", - ["start_index", "end_index", "start_logit", "end_logit"]) - prelim_predictions = [] - for start_index in start_indices: - for end_index in end_indices: - if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > args.max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - return prelim_predictions - -def match_results(examples, features, results): - unique_f_ids = set([f.unique_id for f in features]) - unique_r_ids = set([r.unique_id for r in results]) - matching_ids = unique_f_ids & unique_r_ids - features = [f for f in features if f.unique_id in matching_ids] - results = [r for r in results if r.unique_id in matching_ids] - features.sort(key=lambda x: x.unique_id) - results.sort(key=lambda x: x.unique_id) - - for f, r in zip(features, results): #original code assumes strict ordering of examples. 
TODO: rewrite this - yield examples[f.example_index], f, r - -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to the original text.""" - - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heruistic between - # `pred_text` and `orig_text` to get a character-to-charcter alignment. This - # can fail in certain cases in which case we just return `orig_text`. - - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. 
- - tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - if verbose_logging: - logger.info("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - if verbose_logging: - logger.info("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _get_best_indices(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indices = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indices.append(index_and_score[i][0]) - return 
best_indices - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - - -from apex.multi_tensor_apply import multi_tensor_applier -class GradientClipper: - """ - Clips gradient norm of an iterable of parameters. - """ - def __init__(self, max_grad_norm): - self.max_norm = max_grad_norm - if multi_tensor_applier.available: - import amp_C - self._overflow_buf = torch.cuda.IntTensor([0]) - self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm - self.multi_tensor_scale = amp_C.multi_tensor_scale - else: - raise RuntimeError('Gradient clipping requires cuda extensions') - - def step(self, parameters): - l = [p.grad for p in parameters if p.grad is not None] - total_norm, _ = multi_tensor_applier(self.multi_tensor_l2norm, self._overflow_buf, [l], False) - total_norm = total_norm.item() - if (total_norm == float('inf')): return - clip_coef = self.max_norm / (total_norm + 1e-6) - if clip_coef < 1: - multi_tensor_applier(self.multi_tensor_scale, self._overflow_buf, [l, l], clip_coef) - -class RestrictedUnpickler(pickle.Unpickler): - - def find_class(self, module, name): - # Only allow safe classes from builtins. - if module == "builtins" and name in safe_builtins: - return getattr(builtins, name) - # Forbid everything else. 
- raise pickle.UnpicklingError("global '%s.%s' is forbidden" % - (module, name)) - -def restricted_loads(s): - """Helper function analogous to pickle.loads().""" - return RestrictedUnpickler(io.BytesIO(s)).load() - - -def train_func(model, agent, args, dllogger, global_step, train_examples, num_train_optimization_steps, n_gpu, device, optimizer): - model = agent.model.model - - if args.cache_dir is None: - cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( - list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), - str(args.max_query_length)) - else: - cached_train_features_file = args.cache_dir.strip('/') + '/' + args.train_file.split('/')[-1] + '_{0}_{1}_{2}_{3}'.format( - list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), - str(args.max_query_length)) - - train_features = None - try: - with open(cached_train_features_file, "rb") as reader: - train_features = restricted_loads(reader) - except: - train_features = convert_examples_to_features( - examples=train_examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=True) - - if not args.skip_cache and is_main_process(): - dllogger.log(step="PARAMETER", data={"Cached_train features_file": cached_train_features_file}) - with open(cached_train_features_file, "wb") as writer: - pickle.dump(train_features, writer) - - dllogger.log(step="PARAMETER", data={"train_start": True}) - dllogger.log(step="PARAMETER", data={"training_samples": len(train_examples)}) - dllogger.log(step="PARAMETER", data={"training_features": len(train_features)}) - dllogger.log(step="PARAMETER", data={"train_batch_size":args.train_batch_size}) - dllogger.log(step="PARAMETER", data={"steps":num_train_optimization_steps}) - all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) - all_input_mask = 
torch.tensor([f.input_mask for f in train_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) - all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) - all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) - train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions) - if args.local_rank == -1: - train_sampler = RandomSampler(train_data) - else: - train_sampler = DistributedSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu) - - args.train_features = train_features - model.train() - gradClipper = GradientClipper(max_grad_norm=1.0) - final_loss = None - train_start = time.time() - - #pruning - agent.pre_epoch_begin() - - for epoch in range(int(args.num_train_epochs)): - train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader - agent.on_epoch_begin(epoch) - for step, batch in enumerate(train_iter): - # Terminate early for benchmarking - - agent.on_batch_begin(step) - - if args.max_steps > 0 and global_step > args.max_steps: - break - - if n_gpu == 1: - batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self - input_ids, input_mask, segment_ids, start_positions, end_positions = batch - start_logits, end_logits = model(input_ids, segment_ids, input_mask) - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = 
torch.nn.CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - loss = (start_loss + end_loss) / 2 - if n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu. - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - if args.fp16: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - # gradient clipping - gradClipper.step(amp.master_params(optimizer)) - - if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16 : - # modify learning rate with special warm up for BERT which FusedAdam doesn't do - scheduler.step() - - optimizer.step() - agent.on_post_grad() - optimizer.zero_grad() - - global_step += 1 - - final_loss = loss.item() - if step % args.log_freq == 0: - dllogger.log(step=(epoch, global_step,), data={"step_loss": final_loss, - "learning_rate": optimizer.param_groups[0]['lr']}) - - agent.on_batch_end() - - agent.on_epoch_end() - args.time_to_train = time.time() - train_start - args.final_loss = final_loss - -def eval_func(model, args, dllogger, tokenizer, device): - if not args.do_train and args.fp16: - model.half() - - eval_examples = read_squad_examples( - input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) - eval_features = convert_examples_to_features( - examples=eval_examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=False) - - dllogger.log(step="PARAMETER", data={"infer_start": True}) - dllogger.log(step="PARAMETER", data={"eval_samples": len(eval_examples)}) - dllogger.log(step="PARAMETER", data={"eval_features": len(eval_features)}) - dllogger.log(step="PARAMETER", data={"predict_batch_size": args.predict_batch_size}) - - all_input_ids = torch.tensor([f.input_ids for f in eval_features], 
dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) - # Run prediction for full data - eval_sampler = SequentialSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) - - args.eval_features = eval_features - infer_start = time.time() - model.eval() - all_results = [] - dllogger.log(step="PARAMETER", data={"eval_start": True}) - for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.disable_progress_bar): - if len(all_results) % 1000 == 0: - dllogger.log(step="PARAMETER", data={"sample_number": len(all_results)}) - input_ids = input_ids.to(device) - input_mask = input_mask.to(device) - segment_ids = segment_ids.to(device) - with torch.no_grad(): - batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) - for i, example_index in enumerate(example_indices): - start_logits = batch_start_logits[i].detach().cpu().tolist() - end_logits = batch_end_logits[i].detach().cpu().tolist() - eval_feature = eval_features[example_index.item()] - unique_id = int(eval_feature.unique_id) - all_results.append(RawResult(unique_id=unique_id, - start_logits=start_logits, - end_logits=end_logits)) - - time_to_infer = time.time() - infer_start - output_prediction_file = os.path.join(args.output_dir, "predictions.json") - output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") - - answers, nbest_answers = get_answers(eval_examples, eval_features, all_results, args) - with open(output_prediction_file, "w") as f: - f.write(json.dumps(answers, indent=4) + "\n") - with open(output_nbest_file, "w") as f: - 
f.write(json.dumps(nbest_answers, indent=4) + "\n") - - # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") - # write_predictions(eval_examples, eval_features, all_results, - # args.n_best_size, args.max_answer_length, - # args.do_lower_case, output_prediction_file, - # output_nbest_file, output_null_log_odds_file, args.verbose_logging, - # args.version_2_with_negative, args.null_score_diff_threshold) - - if args.do_eval and is_main_process(): - import sys - import subprocess - import shlex - eval_out = subprocess.check_output([sys.executable, shlex.quote(args.eval_script), - shlex.quote(args.predict_file), shlex.quote(args.output_dir) + "/predictions.json"]) - scores = str(eval_out).strip() - exact_match = float(scores.split(":")[1].split(",")[0]) - f1 = float(scores.split(":")[2].split("}")[0]) - args.exact_match = exact_match - args.f1 = f1 - args.time_to_infer = time_to_infer - -def main(): - parser = argparse.ArgumentParser() - - ## Required parameters - parser.add_argument("--bert_model", default=None, type=str, required=True, - help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " - "bert-base-multilingual-cased, bert-base-chinese.") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") - parser.add_argument("--init_checkpoint", - default=None, - type=str, - required=True, - help="The checkpoint file from pretraining") - - ## Other parameters - parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") - parser.add_argument("--predict_file", default=None, type=str, - help="SQuAD json for predictions. 
E.g., dev-v1.1.json or test-v1.1.json") - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--doc_stride", default=128, type=int, - help="When splitting up a long document into chunks, how much stride to take between chunks.") - parser.add_argument("--max_query_length", default=64, type=int, - help="The maximum number of tokens for the question. Questions longer than this will " - "be truncated to this length.") - parser.add_argument("--do_train", action='store_true', help="Whether to run training.") - parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") - parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") - parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") - parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1.0, type=float, - help="Total number of training steps to perform.") - parser.add_argument("--warmup_proportion", default=0.1, type=float, - help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " - "of training.") - parser.add_argument("--n_best_size", default=20, type=int, - help="The total number of n-best predictions to generate in the nbest_predictions.json " - "output file.") - parser.add_argument("--max_answer_length", default=30, type=int, - help="The maximum length of an answer that can be generated. 
This is needed because the start " - "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", action='store_true', - help="If true, all of the warnings related to data processing will be printed. " - "A number of warnings are expected for a normal SQuAD evaluation.") - parser.add_argument("--no_cuda", - action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--seed', - type=int, - default=42, - help="random seed for initialization") - parser.add_argument('--gradient_accumulation_steps', - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--do_lower_case", - action='store_true', - help="Whether to lower case the input text. True for uncased models, False for cased models.") - parser.add_argument("--local_rank", - type=int, - default=os.getenv('LOCAL_RANK', -1), - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', - default=False, - action='store_true', - help="Mixed precision training") - parser.add_argument('--amp', - default=False, - action='store_true', - help="Mixed precision training") - parser.add_argument('--loss_scale', - type=float, default=0, - help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" - "0 (default value): dynamic loss scaling.\n" - "Positive power of 2: static loss scaling value.\n") - parser.add_argument('--version_2_with_negative', - action='store_true', - help='If true, the SQuAD examples contain some that do not have an answer.') - parser.add_argument('--null_score_diff_threshold', - type=float, default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.") - parser.add_argument('--vocab_file', - type=str, default=None, required=True, - help="Vocabulary mapping/file BERT was pretrainined on") - parser.add_argument("--config_file", - default=None, - type=str, - required=True, - help="The BERT model config") - parser.add_argument('--log_freq', - type=int, default=50, - help='frequency of logging loss.') - parser.add_argument('--json-summary', type=str, default="results/dllogger.json", - help='If provided, the json summary will be written to' - 'the specified file.') - parser.add_argument("--eval_script", - help="Script to evaluate squad predictions", - default="evaluate.py", - type=str) - parser.add_argument("--do_eval", - action='store_true', - help="Whether to use evaluate accuracy of predictions") - parser.add_argument("--use_env", - action='store_true', - help="Whether to read local rank from ENVVAR") - parser.add_argument('--skip_checkpoint', - default=False, - action='store_true', - help="Whether to save checkpoints") - parser.add_argument('--disable-progress-bar', - default=False, - action='store_true', - help='Disable tqdm progress bar') - parser.add_argument("--skip_cache", - default=False, - action='store_true', - help="Whether to cache train features") - parser.add_argument("--cache_dir", - default=None, - type=str, - help="Location to cache train feaures. 
Will default to the dataset directory") - parser.add_argument("--prune_config", - default='prune_bert.yaml', - help="pruning config") - parser.add_argument('--do_prune', - action='store_true', - help="prune model") - - args = parser.parse_args() - args.fp16 = args.fp16 or args.amp - - if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() - else: - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - # Initializes the distributed backend which will take care of synchronizing nodes/GPUs - torch.distributed.init_process_group(backend='nccl', init_method='env://') - n_gpu = 1 - - if is_main_process(): - dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, - filename=args.json_summary), - dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) - else: - dllogger.init(backends=[]) - - print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - device, n_gpu, bool(args.local_rank != -1), args.fp16)) - - dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) - - if args.gradient_accumulation_steps < 1: - raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( - args.gradient_accumulation_steps)) - - args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps - - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - dllogger.log(step="PARAMETER", data={"SEED": args.seed}) - - if n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) - - if not args.do_train and not args.do_predict: - raise ValueError("At least one of `do_train` or `do_predict` must be True.") - - if args.do_train: - if not args.train_file: - raise ValueError( - "If `do_train` is True, then `train_file` must be specified.") - if args.do_predict: - if not args.predict_file: - 
raise ValueError( - "If `do_predict` is True, then `predict_file` must be specified.") - - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and os.listdir(args.output_dir)!=['logfile.txt']: - print("WARNING: Output directory {} already exists and is not empty.".format(args.output_dir), os.listdir(args.output_dir)) - if not os.path.exists(args.output_dir) and is_main_process(): - os.makedirs(args.output_dir) - - tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large - # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - - train_examples = None - num_train_optimization_steps = None - if args.do_train: - train_examples = read_squad_examples( - input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) - num_train_optimization_steps = int( - len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs - if args.local_rank != -1: - num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - - # Prepare model - config = modeling.BertConfig.from_json_file(args.config_file) - # Padding for divisibility by 8 - if config.vocab_size % 8 != 0: - config.vocab_size += 8 - (config.vocab_size % 8) - - modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training - model = modeling.BertForQuestionAnswering(config) - # model = modeling.BertForQuestionAnswering.from_pretrained(args.bert_model, - # cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) - dllogger.log(step="PARAMETER", data={"loading_checkpoint": True}) - model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False) - #model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False) - dllogger.log(step="PARAMETER", data={"loaded_checkpoint": True}) - 
model.to(device) - num_weights = sum([p.numel() for p in model.parameters() if p.requires_grad]) - dllogger.log(step="PARAMETER", data={"model_weights_num":num_weights}) - - # Prepare optimizer - param_optimizer = list(model.named_parameters()) - - # hack to remove pooler, which is not used - # thus it produce None grad that break apex - param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] - - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.do_train: - if args.fp16: - try: - from apex.optimizers import FusedAdam - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False) - - if args.loss_scale == 0: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, - loss_scale="dynamic") - else: - model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale) - if args.do_train: - scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - model = DDP(model) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - global_step = 0 - - if 
args.do_prune: - # Pruning! - from intel_extension_for_transformers.transformers import NoTrainerOptimizer, PrunerConfig, PruningConfig - pruner_config = PrunerConfig( - prune_type="GroupLasso", - target_sparsity_ratio=0.7, - names=['bert.encoder.layer.0.attention.output.dense.weight'], - parameters={"alpha": 0.006, "pattern": "tile_pattern_1x2"}, - ) - pruning_conf = PruningConfig(pruner_config=pruner_config) - no_trainer_optimizer = NoTrainerOptimizer(model, output_dir=args.output_dir) - agent = no_trainer_optimizer.init_pruner(pruning_config=pruning_conf) - - def train_func_nc(model): - return train_func(model, agent, args, dllogger, global_step, train_examples, num_train_optimization_steps, n_gpu, device, optimizer) - - def eval_func_nc(model): - return eval_func(model, args, dllogger, tokenizer, device) - - if args.do_train: - # train_func(args, dllogger, global_step) - no_trainer_optimizer.train_func = train_func_nc - - - if args.do_train and is_main_process() and not args.skip_checkpoint: - # Save a trained model and the associated configuration - model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self - output_model_file = os.path.join(args.output_dir, modeling.WEIGHTS_NAME) - torch.save({"model":model_to_save.state_dict()}, output_model_file) - output_config_file = os.path.join(args.output_dir, modeling.CONFIG_NAME) - with open(output_config_file, 'w') as f: - f.write(model_to_save.config.to_json_string()) - - if args.do_predict and (args.local_rank == -1 or is_main_process()): - no_trainer_optimizer.eval_func = eval_func_nc - - if args.do_prune: - model = no_trainer_optimizer.prune() - - if args.do_train: - gpu_count = n_gpu - if torch.distributed.is_initialized(): - gpu_count = torch.distributed.get_world_size() - - if args.max_steps == -1: - dllogger.log(step=tuple(), data={"e2e_train_time": args.time_to_train, - "training_sequences_per_second": len(args.train_features) * args.num_train_epochs / 
args.time_to_train, - "final_loss": args.final_loss}) - else: - dllogger.log(step=tuple(), data={"e2e_train_time": time_to_train, - "training_sequences_per_second": args.train_batch_size * args.gradient_accumulation_steps \ - * args.max_steps * gpu_count / time_to_train, - "final_loss": final_loss}) - if args.do_predict and is_main_process(): - dllogger.log(step=tuple(), data={"e2e_inference_time": args.time_to_infer, - "inference_sequences_per_second": len(args.eval_features) / args.time_to_infer}) - if args.do_eval and is_main_process(): - dllogger.log(step=tuple(), data={"exact_match": args.exact_match, "F1": args.f1}) - -if __name__ == "__main__": - main() - dllogger.flush() - diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/schedulers.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/schedulers.py deleted file mode 100644 index 4dd99b43a15..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/schedulers.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import math -import torch -from torch.optim.optimizer import Optimizer -from torch.optim.lr_scheduler import _LRScheduler - - -class LRScheduler(_LRScheduler): - def __init__(self, optimizer, last_epoch=-1): - # Check if using mixed precision training - self.mixed_training = False - base_optimizer = optimizer - - # Check that optimizer param is valid - if not isinstance(optimizer, Optimizer): - raise TypeError('{} is not an Optimizer'.format( - type(optimizer).__name__)) - - super(LRScheduler, self).__init__(base_optimizer, last_epoch) - - def step(self, epoch=None): - # Set the current training step - # ('epoch' is used to be consistent with _LRScheduler) - if self.mixed_training: - # The assumption is that the step will be constant - state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]] - if 'step' in state_dict: - self.last_epoch = state_dict['step'] + 1 - else: - self.last_epoch = 1 - else: - self.last_epoch = epoch if epoch is not None else self.last_epoch + 1 - - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - - -class CosineWarmUpScheduler(LRScheduler): - """ - Applies a warm up period to the learning rate. - """ - - def __init__(self, optimizer, warmup, total_steps, last_epoch=-1): - self.warmup = warmup - self.total_steps = total_steps - super(CosineWarmUpScheduler, self).__init__(optimizer, last_epoch) - - def get_lr(self): - progress = self.last_epoch / self.total_steps - if progress < self.warmup: - return [base_lr * progress / self.warmup for base_lr in self.base_lrs] - else: - return [base_lr * (0.5 * (1.0 + torch.cos(math.pi + progress))) for base_lr in self.base_lrs] - - -class ConstantWarmUpScheduler(LRScheduler): - """ - Applies a warm up period to the learning rate. 
- """ - - def __init__(self, optimizer, warmup, total_steps, last_epoch=-1): - self.warmup = warmup - self.total_steps = total_steps - super(ConstantWarmUpScheduler, self).__init__(optimizer, last_epoch) - - def get_lr(self): - progress = self.last_epoch / self.total_steps - if progress < self.warmup: - return [base_lr * progress / self.warmup for base_lr in self.base_lrs] - else: - return self.base_lrs - - -class LinearWarmUpScheduler(LRScheduler): - """ - Applies a warm up period to the learning rate. - """ - - def __init__(self, optimizer, warmup, total_steps, last_epoch=-1): - self.warmup = warmup - self.total_steps = total_steps - super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch) - - def get_lr(self): - progress = self.last_epoch / self.total_steps - if progress < self.warmup: - return [base_lr * progress / self.warmup for base_lr in self.base_lrs] - else: - return [base_lr * max(( progress - 1.0)/(self.warmup - 1.0), 0.) for base_lr in self.base_lrs] - - -class PolyWarmUpScheduler(LRScheduler): - """ - Applies a warm up period to the learning rate. 
- """ - - def __init__(self, optimizer, warmup, total_steps, degree=0.5, last_epoch=-1): - self.warmup = warmup - self.total_steps = total_steps - self.degree = degree - super(PolyWarmUpScheduler, self).__init__(optimizer, last_epoch) - - def step(self, epoch=None): - param_group = self.optimizer.param_groups[0] - if 'step' in param_group: - self.last_epoch = param_group['step'] + 1 - else: - self.last_epoch = 1 - - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - - def get_lr(self): - progress = self.last_epoch / self.total_steps - if progress < self.warmup: - return [base_lr * progress / self.warmup for base_lr in self.base_lrs] - else: - return [base_lr * ((1.0 - progress) ** self.degree) for base_lr in self.base_lrs] diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/scripts/run_squad_sparse.sh b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/scripts/run_squad_sparse.sh deleted file mode 100644 index 581dd1db883..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/scripts/run_squad_sparse.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -init_checkpoint=${1:-"/path/to/ckpt_8601.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"4"} -learning_rate=${4:-"3e-5"} -precision=${5:-"tf32"} -num_gpu="1" -seed="1" -BERT_PREP_WORKING_DIR=${6:-'/path/to/bert_data'} -squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1" -vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt" -OUT_DIR=${7:-"./results/SQuAD/"} -prune_config=${8:-"prune_bert.yaml"} -json_summary=${9:-"$OUT_DIR/dllogger.json"} -echo $init_checkpoint $epochs $batch_size $learning_rate \ -$precision $num_gpu $seed $squad_dir $vocab_file \ -$OUT_DIR $prune_config $json_summary - - -#init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"} -#epochs=${2:-"2.0"} -#batch_size=${3:-"4"} -#learning_rate=${4:-"3e-5"} -#precision=${5:-"fp16"} -#num_gpu=${6:-"8"} -#seed=${7:-"1"} -#squad_dir=${8:-"$BERT_PREP_WORKING_DIR/download/squad/v1.1"} -#vocab_file=${9:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"} -#OUT_DIR=${10:-"/workspace/bert/results/SQuAD"} -mode=${11:-"train eval"} -CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"} -CONFIG_FILE="$PWD/bert_config.json" -max_steps=${13:-"-1"} - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" 
- use_fp16=" --fp16 " -fi - -if [ "$num_gpu" = "1" ] ; then - export CUDA_VISIBLE_DEVICES=0 - mpi_command="" -else - unset CUDA_VISIBLE_DEVICES - mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu" -fi - -CMD="python $mpi_command run_squad_sparse.py " -CMD+="--do_prune " -CMD+="--prune_config=$prune_config " -CMD+="--json-summary=$json_summary " -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -fi - -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-large-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" - -LOGFILE=$OUT_DIR/logfile.txt -echo "$CMD |& tee $LOGFILE" -time $CMD |& tee $LOGFILE diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/tokenization.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/tokenization.py deleted file mode 100644 index fb3cffe20ca..00000000000 --- 
a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/tokenization.py +++ /dev/null @@ -1,392 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tokenization classes.""" - -from __future__ import absolute_import, division, print_function, unicode_literals - -import collections -import logging -import os -import unicodedata -import six -from io import open - -from file_utils import cached_path - -logger = logging.getLogger(__name__) - -PRETRAINED_VOCAB_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -} 
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, -} -VOCAB_NAME = 'vocab.txt' - -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, "r", encoding="utf-8") as reader: - while True: - token = reader.readline() - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(object): - """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - - def __init__(self, vocab_file, do_lower_case=True, max_len=None, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) - if len(ids) > self.max_len: - raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): - """ - Instantiate a PreTrainedBertModel from a pre-trained model file. - Download and cache the pre-trained model file if needed. 
- """ - if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: - vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] - else: - vocab_file = pretrained_model_name_or_path - if os.path.isdir(vocab_file): - vocab_file = os.path.join(vocab_file, VOCAB_NAME) - # redirect to the cache, if necessary - try: - resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) - except EnvironmentError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name_or_path, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - vocab_file)) - return None - if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) - else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) - if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: - # if we're using a pretrained model, ensure the tokenizer won't index sequences longer - # than the number of positional embeddings - max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, - do_lower_case=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - self.never_split = never_split - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = self._clean_text(text) - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. 
This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - if text in self.never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # 
https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. 
- """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically control characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. 
- if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False diff --git a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/utils.py b/examples/huggingface/pytorch/question-answering/pruning/group_lasso/utils.py deleted file mode 100644 index f4f88e8eff9..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/group_lasso/utils.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import torch.distributed as dist - -from pathlib import Path - - -def get_rank(): - if not dist.is_available(): - return 0 - if not dist.is_initialized(): - return 0 - return dist.get_rank() - - -def get_world_size(): - if not dist.is_available(): - return 1 - if not dist.is_initialized(): - return 1 - return dist.get_world_size() - - -def is_main_process(): - return get_rank() == 0 - - -def barrier(): - if dist.is_available() and dist.is_initialized(): - dist.barrier() - - -def format_step(step): - if isinstance(step, str): - return step - s = "" - if len(step) > 0: - s += "Training Epoch: {} ".format(step[0]) - if len(step) > 1: - s += "Training Iteration: {} ".format(step[1]) - if len(step) > 2: - s += "Validation Iteration: {} ".format(step[2]) - return s - - -def mkdir(path): - Path(path).mkdir(parents=True, exist_ok=True) - - -def mkdir_by_main_process(path): - if is_main_process(): - mkdir(path) - barrier() diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/README.md b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/README.md deleted file mode 100644 index 9da0b53c3d2..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/README.md +++ /dev/null @@ -1,52 +0,0 @@ -Step-by-Step -============ - -This document is used to list steps of reproducing PyTorch longformer-base-4096 pruning result. - - -# Prerequisite - -## 1. Environment - -```shell -pip install intel-extension-for-transformers -pip install -r requirements.txt -pip install transformers==4.34.1 -``` ->**Note**: Please use transformers no higher than 4.34.1 - - -## 2. Prepare Dataset - -The dataset will be downloaded and converted to squad format automatically with `./scripts/download_data_and_convert.sh`. 
- -```shell -bash ./scripts/download_data_and_convert.sh -``` - -There will generate two squad format files: `squad-wikipedia-train-4096.json` and `squad-wikipedia-dev-4096.json` - - -# Run Examples - -### pruning longformer-base-4096 - -Run the `./scripts/longformer_base_sparse_global_4x1_pruning.sh` to prune with `global sparse 80% and 4*1 pattern`. In this script, we set `per_device_train_batch_size=1` which is same with [the original longformer codes](https://github.com/allenai/longformer). - -```shell -bash ./scripts/longformer_base_sparse_global_4x1_pruning.sh -``` - -Fine-tuning of the dense model is also supported by running the `./scripts/longformer_base_dense_fintune.sh` - - -### Results -The snip-momentum pruning method is used by default and the initial dense model is well fine-tuned. - -| Model | Dataset | Sparsity pattern | sparsity ratio | Dense F1 |Sparse F1 | Relative drop| -| :----: | :----: | :----: | :----: |:----: |:----:| :----: | -| longformer-base-4096 | triviaqa | 4x1 | global 80% | 75.2 (from [the paper](https://arxiv.org/abs/2004.05150))/74.9235 (ours) | 74.48 | -0.96% | - -## References -* [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) - diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/modeling_longformer.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/modeling_longformer.py deleted file mode 100644 index 3a08b4aaf96..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/modeling_longformer.py +++ /dev/null @@ -1,2282 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Longformer model.""" - -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN, gelu -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from transformers.utils import ( - ModelOutput, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) -from transformers.models.longformer.configuration_longformer import LongformerConfig - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "allenai/longformer-base-4096" -_CONFIG_FOR_DOC = "LongformerConfig" -_TOKENIZER_FOR_DOC = "LongformerTokenizer" - -LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "allenai/longformer-base-4096", - "allenai/longformer-large-4096", - "allenai/longformer-large-4096-finetuned-triviaqa", - "allenai/longformer-base-4096-extra.pos.embd.only", - "allenai/longformer-large-4096-extra.pos.embd.only", - # See all Longformer models at https://huggingface.co/models?filter=longformer -] - - -@dataclass -class LongformerBaseModelOutput(ModelOutput): - """ - Base class for Longformer's outputs, with potential hidden states, local and global attentions. 
- Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. 
If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - last_hidden_state: torch.FloatTensor - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerBaseModelOutputWithPooling(ModelOutput): - """ - Base class for Longformer's outputs that also contains a pooling of the last hidden states. - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) further processed by a - Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence - prediction (classification) objective during pretraining. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. 
- Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. 
Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - last_hidden_state: torch.FloatTensor - pooler_output: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerMaskedLMOutput(ModelOutput): - """ - Base class for masked language models outputs. - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Masked language modeling (MLM) loss. - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). 
Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerQuestionAnsweringModelOutput(ModelOutput): - """ - Base class for outputs of question answering Longformer models. - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. 
- start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): - Span-start scores (before SoftMax). - end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): - Span-end scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. 
If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - start_logits: torch.FloatTensor = None - end_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerSequenceClassifierOutput(ModelOutput): - """ - Base class for outputs of sentence classification models. - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Classification (or regression if config.num_labels==1) loss. - logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. 
Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerMultipleChoiceModelOutput(ModelOutput): - """ - Base class for outputs of multiple choice Longformer models. - Args: - loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided): - Classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`): - *num_choices* is the second dimension of the input tensors. (see *input_ids* above). - Classification scores (before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). 
Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. - global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class LongformerTokenClassifierOutput(ModelOutput): - """ - Base class for outputs of token classification models. - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : - Classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`): - Classification scores (before SoftMax). 
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x + - attention_window + 1)`, where `x` is the number of tokens with global attention mask. - Local attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token in the sequence to every token with - global attention (first `x` values) and to every token in the attention window (remaining `attention_window - + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the - remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a - token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding - (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens. - If the attention window contains a token with global attention, the attention weight at the corresponding - index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global - attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be - accessed from `global_attentions`. 
- global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, - where `x` is the number of tokens with global attention mask. - Global attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. Those are the attention weights from every token with global attention to every token - in the sequence. - """ - - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - global_attentions: Optional[Tuple[torch.FloatTensor]] = None - - -def _get_question_end_index(input_ids, sep_token_id): - """ - Computes the index of the first occurrence of `sep_token_id`. - """ - - sep_token_indices = (input_ids == sep_token_id).nonzero() - batch_size = input_ids.shape[0] - - assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions" - # here is the revised because of data preprocessing, - # but same to longformer codes: https://github.com/allenai/longformer - assert sep_token_indices.shape[0] == 2 * batch_size, ( - f"There should be exactly three separator tokens: {sep_token_id} in every sample for questions answering. You" - " might also consider to set `global_attention_mask` manually in the forward function to avoid this error." - ) - return sep_token_indices.view(batch_size, 2, 2)[:, 0, 1] - - -def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True): - """ - Computes global attention mask by putting attention on all tokens before `sep_token_id` if `before_sep_token is - True` else after `sep_token_id`. 
- """ - question_end_index = _get_question_end_index(input_ids, sep_token_id) - question_end_index = question_end_index.unsqueeze(dim=1) # size: batch_size x 1 - # bool attention mask with True in locations of global attention - attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device) - if before_sep_token is True: - attention_mask = (attention_mask.expand_as(input_ids) < question_end_index).to(torch.uint8) - else: - # last token is separation token and should not be counted and in the middle are two separation tokens - attention_mask = (attention_mask.expand_as(input_ids) > (question_end_index + 1)).to(torch.uint8) * ( - attention_mask.expand_as(input_ids) < input_ids.shape[-1] - ).to(torch.uint8) - - return attention_mask - - -def create_position_ids_from_input_ids(input_ids, padding_idx): - """ - Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols - are ignored. This is modified from fairseq's `utils.make_positions`. - Args: - x: torch.Tensor x: - Returns: torch.Tensor - """ - # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. - mask = input_ids.ne(padding_idx).int() - incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask - return incremental_indices.long() + padding_idx - - -class LongformerEmbeddings(nn.Module): - """ - Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
- """ - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - - self.padding_idx = config.pad_token_id - self.position_embeddings = nn.Embedding( - config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx - ) - - def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): - if position_ids is None: - if input_ids is not None: - # Create the position ids from the input token ids. Any padded tokens remain padded. 
- position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx).to(input_ids.device) - else: - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - def create_position_ids_from_inputs_embeds(self, inputs_embeds): - """ - We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. - Args: - inputs_embeds: torch.Tensor inputs_embeds: - Returns: torch.Tensor - """ - input_shape = inputs_embeds.size()[:-1] - sequence_length = input_shape[1] - - position_ids = torch.arange( - self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device - ) - return position_ids.unsqueeze(0).expand(input_shape) - - -class LongformerSelfAttention(nn.Module): - def __init__(self, config, layer_id): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) - self.num_heads = config.num_attention_heads - self.head_dim = int(config.hidden_size / config.num_attention_heads) - self.embed_dim = config.hidden_size - - self.query = nn.Linear(config.hidden_size, self.embed_dim) - self.key = nn.Linear(config.hidden_size, self.embed_dim) - self.value = 
nn.Linear(config.hidden_size, self.embed_dim) - - # separate projection layers for tokens with global attention - self.query_global = nn.Linear(config.hidden_size, self.embed_dim) - self.key_global = nn.Linear(config.hidden_size, self.embed_dim) - self.value_global = nn.Linear(config.hidden_size, self.embed_dim) - - self.dropout = config.attention_probs_dropout_prob - - self.layer_id = layer_id - attention_window = config.attention_window[self.layer_id] - assert ( - attention_window % 2 == 0 - ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}" - assert ( - attention_window > 0 - ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}" - - self.one_sided_attn_window_size = attention_window // 2 - - def forward( - self, - hidden_states, - attention_mask=None, - layer_head_mask=None, - is_index_masked=None, - is_index_global_attn=None, - is_global_attn=None, - output_attentions=False, - ): - """ - [`LongformerSelfAttention`] expects *len(hidden_states)* to be multiple of *attention_window*. Padding to - *attention_window* happens in [`LongformerModel.forward`] to avoid redoing the padding on each layer. 
- The *attention_mask* is changed in [`LongformerModel.forward`] from 0, 1, 2 to: - - -10000: no attention - - 0: local attention - - +10000: global attention - """ - hidden_states = hidden_states.transpose(0, 1) - - # project hidden states - query_vectors = self.query(hidden_states) - key_vectors = self.key(hidden_states) - value_vectors = self.value(hidden_states) - - seq_len, batch_size, embed_dim = hidden_states.size() - assert ( - embed_dim == self.embed_dim - ), f"hidden_states should have embed_dim = {self.embed_dim}, but has {embed_dim}" - - # normalize query - query_vectors /= math.sqrt(self.head_dim) - - query_vectors = query_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1) - key_vectors = key_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1) - - attn_scores = self._sliding_chunks_query_key_matmul( - query_vectors, key_vectors, self.one_sided_attn_window_size - ) - - # values to pad for attention probs - remove_from_windowed_attention_mask = (attention_mask != 0)[:, :, None, None] - - # cast to fp32/fp16 then replace 1's with -inf - float_mask = remove_from_windowed_attention_mask.type_as(query_vectors).masked_fill( - remove_from_windowed_attention_mask, torch.finfo(query_vectors.dtype).min - ) - # diagonal mask with zeros everywhere and -inf inplace of padding - diagonal_mask = self._sliding_chunks_query_key_matmul( - float_mask.new_ones(size=float_mask.size()), float_mask, self.one_sided_attn_window_size - ) - - # pad local attention probs - attn_scores += diagonal_mask - - assert list(attn_scores.size()) == [ - batch_size, - seq_len, - self.num_heads, - self.one_sided_attn_window_size * 2 + 1, - ], ( - f"local_attn_probs should be of size ({batch_size}, {seq_len}, {self.num_heads}," - f" {self.one_sided_attn_window_size * 2 + 1}), but is of size {attn_scores.size()}" - ) - - # compute local attention probs from global attention keys and contact over window dim - if is_global_attn: - # 
compute global attn indices required through out forward fn - ( - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - ) = self._get_global_attn_indices(is_index_global_attn) - # calculate global attn probs from global key - - global_key_attn_scores = self._concat_with_global_key_attn_probs( - query_vectors=query_vectors, - key_vectors=key_vectors, - max_num_global_attn_indices=max_num_global_attn_indices, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, - ) - # concat to local_attn_probs - # (batch_size, seq_len, num_heads, extra attention count + 2*window+1) - attn_scores = torch.cat((global_key_attn_scores, attn_scores), dim=-1) - - # free memory - del global_key_attn_scores - - attn_probs = nn.functional.softmax( - attn_scores, dim=-1, dtype=torch.float32 - ) # use fp32 for numerical stability - - if layer_head_mask is not None: - assert layer_head_mask.size() == ( - self.num_heads, - ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" - attn_probs = layer_head_mask.view(1, 1, -1, 1) * attn_probs - - # softmax sometimes inserts NaN if all positions are masked, replace them with 0 - attn_probs = torch.masked_fill(attn_probs, is_index_masked[:, :, None, None], 0.0) - attn_probs = attn_probs.type_as(attn_scores) - - # free memory - del attn_scores - - # apply dropout - attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training) - - value_vectors = value_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1) - - # compute local attention output with global attention value and add - if is_global_attn: - # compute sum of global and local attn - attn_output = self._compute_attn_output_with_global_indices( - 
value_vectors=value_vectors, - attn_probs=attn_probs, - max_num_global_attn_indices=max_num_global_attn_indices, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - ) - else: - # compute local attn only - attn_output = self._sliding_chunks_matmul_attn_probs_value( - attn_probs, value_vectors, self.one_sided_attn_window_size - ) - - assert attn_output.size() == (batch_size, seq_len, self.num_heads, self.head_dim), "Unexpected size" - attn_output = attn_output.transpose(0, 1).reshape(seq_len, batch_size, embed_dim).contiguous() - - # compute value for global attention and overwrite to attention output - # TODO: remove the redundant computation - if is_global_attn: - global_attn_output, global_attn_probs = self._compute_global_attn_output_from_hidden( - hidden_states=hidden_states, - max_num_global_attn_indices=max_num_global_attn_indices, - layer_head_mask=layer_head_mask, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, - is_index_masked=is_index_masked, - ) - - # get only non zero global attn output - nonzero_global_attn_output = global_attn_output[ - is_local_index_global_attn_nonzero[0], :, is_local_index_global_attn_nonzero[1] - ] - - # overwrite values with global attention - attn_output[is_index_global_attn_nonzero[::-1]] = nonzero_global_attn_output.view( - len(is_local_index_global_attn_nonzero[0]), -1 - ) - # The attention weights for tokens with global attention are - # just filler values, they were never used to compute the output. - # Fill with 0 now, the correct values are in 'global_attn_probs'. 
- attn_probs[is_index_global_attn_nonzero] = 0 - - outputs = (attn_output.transpose(0, 1),) - - if output_attentions: - outputs += (attn_probs,) - - return outputs + (global_attn_probs,) if (is_global_attn and output_attentions) else outputs - - @staticmethod - def _pad_and_transpose_last_two_dims(hidden_states_padded, padding): - """pads rows and then flips rows and columns""" - hidden_states_padded = nn.functional.pad( - hidden_states_padded, padding - ) # padding value is not important because it will be overwritten - hidden_states_padded = hidden_states_padded.view( - *hidden_states_padded.size()[:-2], hidden_states_padded.size(-1), hidden_states_padded.size(-2) - ) - return hidden_states_padded - - @staticmethod - def _pad_and_diagonalize(chunked_hidden_states): - """ - shift every row 1 step right, converting columns into diagonals. - Example: - ```python - chunked_hidden_states: [ - 0.4983, - 2.6918, - -0.0071, - 1.0492, - -1.8348, - 0.7672, - 0.2986, - 0.0285, - -0.7584, - 0.4206, - -0.0405, - 0.1599, - 2.0514, - -1.1600, - 0.5372, - 0.2629, - ] - window_overlap = num_rows = 4 - ``` - (pad & diagonalize) => [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000 - 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 0.0000, 0.0000, -0.7584, 0.4206, - -0.0405, 0.1599, 0.0000 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] - """ - total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size() - chunked_hidden_states = nn.functional.pad( - chunked_hidden_states, (0, window_overlap + 1) - ) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). 
Padding value is not important because it'll be overwritten - chunked_hidden_states = chunked_hidden_states.view( - total_num_heads, num_chunks, -1 - ) # total_num_heads x num_chunks x window_overlap*window_overlap+window_overlap - chunked_hidden_states = chunked_hidden_states[ - :, :, :-window_overlap - ] # total_num_heads x num_chunks x window_overlap*window_overlap - chunked_hidden_states = chunked_hidden_states.view( - total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim - ) - chunked_hidden_states = chunked_hidden_states[:, :, :, :-1] - return chunked_hidden_states - - @staticmethod - def _chunk(hidden_states, window_overlap): - """convert into overlapping chunks. Chunk size = 2w, overlap size = w""" - - # non-overlapping chunks of size = 2w - hidden_states = hidden_states.view( - hidden_states.size(0), - hidden_states.size(1) // (window_overlap * 2), - window_overlap * 2, - hidden_states.size(2), - ) - - # use `as_strided` to make the chunks overlap with an overlap size = window_overlap - chunk_size = list(hidden_states.size()) - chunk_size[1] = chunk_size[1] * 2 - 1 - - chunk_stride = list(hidden_states.stride()) - chunk_stride[1] = chunk_stride[1] // 2 - return hidden_states.as_strided(size=chunk_size, stride=chunk_stride) - - @staticmethod - def _mask_invalid_locations(input_tensor, affected_seq_len) -> torch.Tensor: - beginning_mask_2d = input_tensor.new_ones(affected_seq_len, affected_seq_len + 1).tril().flip(dims=[0]) - beginning_mask = beginning_mask_2d[None, :, None, :] - ending_mask = beginning_mask.flip(dims=(1, 3)) - beginning_input = input_tensor[:, :affected_seq_len, :, : affected_seq_len + 1] - beginning_mask = beginning_mask.expand(beginning_input.size()) - beginning_input.masked_fill_(beginning_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 - ending_input = input_tensor[:, -affected_seq_len:, :, -(affected_seq_len + 1) :] - ending_mask = ending_mask.expand(ending_input.size()) - 
ending_input.masked_fill_(ending_mask == 1, -float("inf")) # `== 1` converts to bool or uint8 - - def _sliding_chunks_query_key_matmul(self, query: torch.Tensor, key: torch.Tensor, window_overlap: int): - """ - Matrix multiplication of query and key tensors using with a sliding window attention pattern. This - implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an - overlap of size window_overlap - """ - batch_size, seq_len, num_heads, head_dim = query.size() - assert ( - seq_len % (window_overlap * 2) == 0 - ), f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}" - assert query.size() == key.size() - - chunks_count = seq_len // window_overlap - 1 - - # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size window_overlap * 2 - query = query.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim) - key = key.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim) - - query = self._chunk(query, window_overlap) - key = self._chunk(key, window_overlap) - - # matrix multiplication - # bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim - # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim - # bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap - diagonal_chunked_attention_scores = torch.einsum("bcxd,bcyd->bcxy", (query, key)) # multiply - - # convert diagonals into columns - diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims( - diagonal_chunked_attention_scores, padding=(0, 0, 0, 1) - ) - - # allocate space for the overall attention matrix where the chunks are combined. The last dimension - # has (window_overlap * 2 + 1) columns. The first (window_overlap) columns are the window_overlap lower triangles (attention from a word to - # window_overlap previous words). 
The following column is attention score from each word to itself, then - # followed by window_overlap columns for the upper triangle. - - diagonal_attention_scores = diagonal_chunked_attention_scores.new_empty( - (batch_size * num_heads, chunks_count + 1, window_overlap, window_overlap * 2 + 1) - ) - - # copy parts from diagonal_chunked_attention_scores into the combined matrix of attentions - # - copying the main diagonal and the upper triangle - diagonal_attention_scores[:, :-1, :, window_overlap:] = diagonal_chunked_attention_scores[ - :, :, :window_overlap, : window_overlap + 1 - ] - diagonal_attention_scores[:, -1, :, window_overlap:] = diagonal_chunked_attention_scores[ - :, -1, window_overlap:, : window_overlap + 1 - ] - # - copying the lower triangle - diagonal_attention_scores[:, 1:, :, :window_overlap] = diagonal_chunked_attention_scores[ - :, :, -(window_overlap + 1) : -1, window_overlap + 1 : - ] - - diagonal_attention_scores[:, 0, 1:window_overlap, 1:window_overlap] = diagonal_chunked_attention_scores[ - :, 0, : window_overlap - 1, 1 - window_overlap : - ] - - # separate batch_size and num_heads dimensions again - diagonal_attention_scores = diagonal_attention_scores.view( - batch_size, num_heads, seq_len, 2 * window_overlap + 1 - ).transpose(2, 1) - - self._mask_invalid_locations(diagonal_attention_scores, window_overlap) - return diagonal_attention_scores - - def _sliding_chunks_matmul_attn_probs_value( - self, attn_probs: torch.Tensor, value: torch.Tensor, window_overlap: int - ): - """ - Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. 
Returned tensor will be of the - same shape as `attn_probs` - """ - batch_size, seq_len, num_heads, head_dim = value.size() - - assert seq_len % (window_overlap * 2) == 0 - assert attn_probs.size()[:3] == value.size()[:3] - assert attn_probs.size(3) == 2 * window_overlap + 1 - chunks_count = seq_len // window_overlap - 1 - # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size 2 window overlap - - chunked_attn_probs = attn_probs.transpose(1, 2).reshape( - batch_size * num_heads, seq_len // window_overlap, window_overlap, 2 * window_overlap + 1 - ) - - # group batch_size and num_heads dimensions into one - value = value.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim) - - # pad seq_len with w at the beginning of the sequence and another window overlap at the end - padded_value = nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1) - - # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap - chunked_value_size = (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim) - chunked_value_stride = padded_value.stride() - chunked_value_stride = ( - chunked_value_stride[0], - window_overlap * chunked_value_stride[1], - chunked_value_stride[1], - chunked_value_stride[2], - ) - chunked_value = padded_value.as_strided(size=chunked_value_size, stride=chunked_value_stride) - - chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs) - - context = torch.einsum("bcwd,bcdh->bcwh", (chunked_attn_probs, chunked_value)) - return context.view(batch_size, num_heads, seq_len, head_dim).transpose(1, 2) - - @staticmethod - def _get_global_attn_indices(is_index_global_attn): - """compute global attn indices required throughout forward pass""" - # helper variable - num_global_attn_indices = is_index_global_attn.long().sum(dim=1) - - # max number of global attn indices in batch - max_num_global_attn_indices = num_global_attn_indices.max() - - # 
indices of global attn - is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True) - - # helper variable - is_local_index_global_attn = torch.arange( - max_num_global_attn_indices, device=is_index_global_attn.device - ) < num_global_attn_indices.unsqueeze(dim=-1) - - # location of the non-padding values within global attention indices - is_local_index_global_attn_nonzero = is_local_index_global_attn.nonzero(as_tuple=True) - - # location of the padding values within global attention indices - is_local_index_no_global_attn_nonzero = (is_local_index_global_attn == 0).nonzero(as_tuple=True) - return ( - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - ) - - def _concat_with_global_key_attn_probs( - self, - key_vectors, - query_vectors, - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - ): - batch_size = key_vectors.shape[0] - - # create only global key vectors - key_vectors_only_global = key_vectors.new_zeros( - batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim - ) - - key_vectors_only_global[is_local_index_global_attn_nonzero] = key_vectors[is_index_global_attn_nonzero] - - # (batch_size, seq_len, num_heads, max_num_global_attn_indices) - attn_probs_from_global_key = torch.einsum("blhd,bshd->blhs", (query_vectors, key_vectors_only_global)) - - attn_probs_from_global_key[ - is_local_index_no_global_attn_nonzero[0], :, :, is_local_index_no_global_attn_nonzero[1] - ] = torch.finfo(attn_probs_from_global_key.dtype).min - - return attn_probs_from_global_key - - def _compute_attn_output_with_global_indices( - self, - value_vectors, - attn_probs, - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - ): - batch_size = attn_probs.shape[0] - - # cut local attn probs to global only - attn_probs_only_global = 
attn_probs.narrow(-1, 0, max_num_global_attn_indices) - # get value vectors for global only - value_vectors_only_global = value_vectors.new_zeros( - batch_size, max_num_global_attn_indices, self.num_heads, self.head_dim - ) - value_vectors_only_global[is_local_index_global_attn_nonzero] = value_vectors[is_index_global_attn_nonzero] - - # use `matmul` because `einsum` crashes sometimes with fp16 - # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v)) - # compute attn output only global - attn_output_only_global = torch.matmul( - attn_probs_only_global.transpose(1, 2).clone(), value_vectors_only_global.transpose(1, 2).clone() - ).transpose(1, 2) - - # reshape attn probs - attn_probs_without_global = attn_probs.narrow( - -1, max_num_global_attn_indices, attn_probs.size(-1) - max_num_global_attn_indices - ).contiguous() - - # compute attn output with global - attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value( - attn_probs_without_global, value_vectors, self.one_sided_attn_window_size - ) - return attn_output_only_global + attn_output_without_global - - def _compute_global_attn_output_from_hidden( - self, - hidden_states, - max_num_global_attn_indices, - layer_head_mask, - is_local_index_global_attn_nonzero, - is_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - is_index_masked, - ): - seq_len, batch_size = hidden_states.shape[:2] - - # prepare global hidden states - global_attn_hidden_states = hidden_states.new_zeros(max_num_global_attn_indices, batch_size, self.embed_dim) - global_attn_hidden_states[is_local_index_global_attn_nonzero[::-1]] = hidden_states[ - is_index_global_attn_nonzero[::-1] - ] - - # global key, query, value - global_query_vectors_only_global = self.query_global(global_attn_hidden_states) - global_key_vectors = self.key_global(hidden_states) - global_value_vectors = self.value_global(hidden_states) - - # normalize - global_query_vectors_only_global /= math.sqrt(self.head_dim) - - # 
reshape - global_query_vectors_only_global = ( - global_query_vectors_only_global.contiguous() - .view(max_num_global_attn_indices, batch_size * self.num_heads, self.head_dim) - .transpose(0, 1) - ) # (batch_size * self.num_heads, max_num_global_attn_indices, head_dim) - global_key_vectors = ( - global_key_vectors.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1) - ) # batch_size * self.num_heads, seq_len, head_dim) - global_value_vectors = ( - global_value_vectors.contiguous().view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1) - ) # batch_size * self.num_heads, seq_len, head_dim) - - # compute attn scores - global_attn_scores = torch.bmm(global_query_vectors_only_global, global_key_vectors.transpose(1, 2)) - - assert list(global_attn_scores.size()) == [ - batch_size * self.num_heads, - max_num_global_attn_indices, - seq_len, - ], ( - "global_attn_scores have the wrong size. Size should be" - f" {(batch_size * self.num_heads, max_num_global_attn_indices, seq_len)}, but is" - f" {global_attn_scores.size()}." 
- ) - - global_attn_scores = global_attn_scores.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len) - - global_attn_scores[ - is_local_index_no_global_attn_nonzero[0], :, is_local_index_no_global_attn_nonzero[1], : - ] = torch.finfo(global_attn_scores.dtype).min - - global_attn_scores = global_attn_scores.masked_fill( - is_index_masked[:, None, None, :], - torch.finfo(global_attn_scores.dtype).min, - ) - - global_attn_scores = global_attn_scores.view(batch_size * self.num_heads, max_num_global_attn_indices, seq_len) - - # compute global attn probs - global_attn_probs_float = nn.functional.softmax( - global_attn_scores, dim=-1, dtype=torch.float32 - ) # use fp32 for numerical stability - - # apply layer head masking - if layer_head_mask is not None: - assert layer_head_mask.size() == ( - self.num_heads, - ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" - global_attn_probs_float = layer_head_mask.view(1, -1, 1, 1) * global_attn_probs_float.view( - batch_size, self.num_heads, max_num_global_attn_indices, seq_len - ) - global_attn_probs_float = global_attn_probs_float.view( - batch_size * self.num_heads, max_num_global_attn_indices, seq_len - ) - - global_attn_probs = nn.functional.dropout( - global_attn_probs_float.type_as(global_attn_scores), p=self.dropout, training=self.training - ) - - # global attn output - global_attn_output = torch.bmm(global_attn_probs, global_value_vectors) - - assert list(global_attn_output.size()) == [ - batch_size * self.num_heads, - max_num_global_attn_indices, - self.head_dim, - ], ( - "global_attn_output tensor has the wrong size. Size should be" - f" {(batch_size * self.num_heads, max_num_global_attn_indices, self.head_dim)}, but is" - f" {global_attn_output.size()}." 
- ) - - global_attn_probs = global_attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len) - global_attn_output = global_attn_output.view( - batch_size, self.num_heads, max_num_global_attn_indices, self.head_dim - ) - return global_attn_output, global_attn_probs - - -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput -class LongformerSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class LongformerAttention(nn.Module): - def __init__(self, config, layer_id=0): - super().__init__() - self.self = LongformerSelfAttention(config, layer_id) - self.output = LongformerSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - 
layer_head_mask=None, - is_index_masked=None, - is_index_global_attn=None, - is_global_attn=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - output_attentions=output_attentions, - ) - attn_output = self.output(self_outputs[0], hidden_states) - outputs = (attn_output,) + self_outputs[1:] - return outputs - - -# Copied from transformers.models.bert.modeling_bert.BertIntermediate -class LongformerIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -# Copied from transformers.models.bert.modeling_bert.BertOutput -class LongformerOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class LongformerLayer(nn.Module): - def __init__(self, config, layer_id=0): - super().__init__() - self.attention = LongformerAttention(config, layer_id) - self.intermediate = LongformerIntermediate(config) - self.output = LongformerOutput(config) - 
self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - - def forward( - self, - hidden_states, - attention_mask=None, - layer_head_mask=None, - is_index_masked=None, - is_index_global_attn=None, - is_global_attn=None, - output_attentions=False, - ): - self_attn_outputs = self.attention( - hidden_states, - attention_mask=attention_mask, - layer_head_mask=layer_head_mask, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - output_attentions=output_attentions, - ) - attn_output = self_attn_outputs[0] - outputs = self_attn_outputs[1:] - - layer_output = apply_chunking_to_forward( - self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attn_output - ) - outputs = (layer_output,) + outputs - return outputs - - def ff_chunk(self, attn_output): - intermediate_output = self.intermediate(attn_output) - layer_output = self.output(intermediate_output, attn_output) - return layer_output - - -class LongformerEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList([LongformerLayer(config, layer_id=i) for i in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - padding_len=0, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - - is_index_masked = attention_mask < 0 - is_index_global_attn = attention_mask > 0 - is_global_attn = is_index_global_attn.flatten().any().item() - - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None # All local attentions. 
- all_global_attentions = () if (output_attentions and is_global_attn) else None - - # check if head_mask has a correct number of layers specified if desired - if head_mask is not None: - assert head_mask.size()[0] == ( - len(self.layer) - ), f"The head_mask should be specified for {len(self.layer)} layers, but it is for {head_mask.size()[0]}." - for idx, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, is_global_attn, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - head_mask[idx] if head_mask is not None else None, - is_index_masked, - is_index_global_attn, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask=attention_mask, - layer_head_mask=head_mask[idx] if head_mask is not None else None, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - output_attentions=output_attentions, - ) - hidden_states = layer_outputs[0] - - if output_attentions: - # bzs x seq_len x num_attn_heads x (num_global_attn + attention_window_len + 1) => bzs x num_attn_heads x seq_len x (num_global_attn + attention_window_len + 1) - all_attentions = all_attentions + (layer_outputs[1].transpose(1, 2),) - - if is_global_attn: - # bzs x num_attn_heads x num_global_attn x seq_len => bzs x num_attn_heads x seq_len x num_global_attn - all_global_attentions = all_global_attentions + (layer_outputs[2].transpose(2, 3),) - - # Add last layer - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # undo padding - if padding_len > 0: - # unpad `hidden_states` because the calling function is expecting a length == input_ids.size(1) - 
hidden_states = hidden_states[:, :-padding_len] - if output_hidden_states: - all_hidden_states = tuple([state[:, :-padding_len] for state in all_hidden_states]) - - if output_attentions: - all_attentions = tuple([state[:, :, :-padding_len, :] for state in all_attentions]) - - if not return_dict: - return tuple( - v for v in [hidden_states, all_hidden_states, all_attentions, all_global_attentions] if v is not None - ) - return LongformerBaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_attentions, - global_attentions=all_global_attentions, - ) - - -# Copied from transformers.models.bert.modeling_bert.BertPooler -class LongformerPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Longformer -class LongformerLMHead(nn.Module): - """Longformer Head for masked language modeling.""" - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - self.decoder = nn.Linear(config.hidden_size, config.vocab_size) - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - self.decoder.bias = self.bias - - def forward(self, features, **kwargs): - x = self.dense(features) - x = gelu(x) - x = self.layer_norm(x) - - # project back to size of vocabulary with bias - x = self.decoder(x) - - return x - - def _tie_weights(self): - # To tie those two weights if they get disconnected (on TPU or when the bias is resized) - self.bias = self.decoder.bias - - -class LongformerPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = LongformerConfig - base_model_prefix = "longformer" - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"position_ids"] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LongformerEncoder): - module.gradient_checkpointing = value - - -LONGFORMER_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - Parameters: - config ([`LongformerConfig`]): Model configuration class with all the parameters of the - model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -LONGFORMER_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. 
- Indices can be obtained using [`LongformerTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - [What are attention masks?](../glossary#attention-mask) - global_attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to decide the attention given on each token, local attention or global attention. Tokens with global - attention attends to all other tokens, and all other tokens attend to them. This is important for - task-specific finetuning because it makes the model more flexible at representing the task. For example, - for classification, the token should be given global attention. For QA, all question tokens should also - have global attention. Please refer to the [Longformer paper](https://arxiv.org/abs/2004.05150) for more - details. Mask values selected in `[0, 1]`: - - 0 for local attention (a sliding window attention), - - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them). - head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in `[0, - 1]`: - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - [What are position IDs?](../glossary#position-ids) - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Longformer Model outputting raw hidden-states without any specific head on top.", - LONGFORMER_START_DOCSTRING, -) -class LongformerModel(LongformerPreTrainedModel): - """ - This class copied code from [`RobertaModel`] and overwrote standard self-attention with longformer self-attention - to provide the ability to process long sequences following the self-attention approach described in [Longformer: - the Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, and Arman Cohan. - Longformer self-attention combines a local (sliding window) and global attention to extend to long documents - without the O(n^2) increase in memory and compute. 
- The self-attention module `LongformerSelfAttention` implemented here supports the combination of local and global - attention but it lacks support for autoregressive attention and dilated attention. Autoregressive and dilated - attention are more relevant for autoregressive language modeling than finetuning on downstream tasks. Future - release will add support for autoregressive attention, but the support for dilated attention requires a custom CUDA - kernel to be memory and compute efficient. - """ - - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) - self.config = config - - if isinstance(config.attention_window, int): - assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value" - assert config.attention_window > 0, "`config.attention_window` has to be positive" - config.attention_window = [config.attention_window] * config.num_hidden_layers # one value per layer - else: - assert len(config.attention_window) == config.num_hidden_layers, ( - "`len(config.attention_window)` should equal `config.num_hidden_layers`. " - f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}" - ) - - self.embeddings = LongformerEmbeddings(config) - self.encoder = LongformerEncoder(config) - self.pooler = LongformerPooler(config) if add_pooling_layer else None - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - def _pad_to_window_size( - self, - input_ids: torch.Tensor, - attention_mask: torch.Tensor, - token_type_ids: torch.Tensor, - position_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - pad_token_id: int, - ): - """A helper function to pad tokens and mask to work with implementation of Longformer self-attention.""" - # padding - attention_window = ( - self.config.attention_window - if isinstance(self.config.attention_window, int) - else max(self.config.attention_window) - ) - - assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}" - input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape - batch_size, seq_len = input_shape[:2] - - padding_len = (attention_window - seq_len % attention_window) % attention_window - if padding_len > 0: - ''' - logger.info( - f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " - f"`config.attention_window`: {attention_window}" - ) - ''' - if input_ids is not None: - input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id) - if position_ids is not None: - # pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings - position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id) - if inputs_embeds is not None: - input_ids_padding = inputs_embeds.new_full( - (batch_size, padding_len), - self.config.pad_token_id, - dtype=torch.long, - ) - inputs_embeds_padding = self.embeddings(input_ids_padding) - inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) - - attention_mask = nn.functional.pad( - attention_mask, (0, padding_len), value=False - ) # no attention on the padding tokens - token_type_ids = 
nn.functional.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 - - return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds - - def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor): - # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn) - # (global_attention_mask + 1) => 1 for local attention, 2 for global attention - # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention - if attention_mask is not None: - attention_mask = attention_mask * (global_attention_mask + 1) - else: - # simply use `global_attention_mask` as `attention_mask` - # if no `attention_mask` is given - attention_mask = global_attention_mask + 1 - return attention_mask - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=LongformerBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerBaseModelOutputWithPooling]: - r""" - Returns: - Examples: - ```python - >>> import torch - >>> from transformers import LongformerModel, LongformerTokenizer - >>> model = LongformerModel.from_pretrained("allenai/longformer-base-4096") - >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") - >>> SAMPLE_TEXT = " ".join(["Hello world! 
"] * 1000) # long input document - >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 - >>> attention_mask = torch.ones( - ... input_ids.shape, dtype=torch.long, device=input_ids.device - ... ) # initialize to local attention - >>> global_attention_mask = torch.zeros( - ... input_ids.shape, dtype=torch.long, device=input_ids.device - ... ) # initialize to global attention to be deactivated for all tokens - >>> global_attention_mask[ - ... :, - ... [ - ... 1, - ... 4, - ... 21, - ... ], - ... ] = 1 # Set global attention to random tokens for the sake of this example - >>> # Usually, set global attention based on the task. For example, - >>> # classification: the token - >>> # QA: question tokens - >>> # LM: potentially on the beginning of sentences and paragraphs - >>> outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask) - >>> sequence_output = outputs.last_hidden_state - >>> pooled_output = outputs.pooler_output - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if attention_mask is None: - attention_mask = torch.ones(input_shape, device=device) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, 
device=device) - - # merge `global_attention_mask` and `attention_mask` - if global_attention_mask is not None: - attention_mask = self._merge_to_attention_mask(attention_mask, global_attention_mask) - - padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = self._pad_to_window_size( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - pad_token_id=self.config.pad_token_id, - ) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)[ - :, 0, 0, : - ] - - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds - ) - - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - padding_len=padding_len, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return LongformerBaseModelOutputWithPooling( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - global_attentions=encoder_outputs.global_attentions, - ) - - -@add_start_docstrings("""Longformer Model with a `language modeling` head on top.""", LONGFORMER_START_DOCSTRING) -class LongformerForMaskedLM(LongformerPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r"pooler"] - - def __init__(self, config): - super().__init__(config) - - 
self.longformer = LongformerModel(config, add_pooling_layer=False) - self.lm_head = LongformerLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.lm_head.decoder - - def set_output_embeddings(self, new_embeddings): - self.lm_head.decoder = new_embeddings - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=LongformerMaskedLMOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerMaskedLMOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., - config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the - loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` - kwargs (`Dict[str, any]`, optional, defaults to *{}*): - Used to hide legacy arguments that have been deprecated. - Returns: - Mask filling example: - ```python - >>> from transformers import LongformerTokenizer, LongformerForMaskedLM - >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096") - >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") - ``` - Let's try a very long input. - ```python - >>> TXT = ( - ... 
"My friends are but they eat too many carbs." - ... + " That's why I decide not to eat with them." * 300 - ... ) - >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] - >>> logits = model(input_ids).logits - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - >>> tokenizer.decode(predictions).split() - ['healthy', 'skinny', 'thin', 'good', 'vegetarian'] - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.longformer( - input_ids, - attention_mask=attention_mask, - global_attention_mask=global_attention_mask, - head_mask=head_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - prediction_scores = self.lm_head(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return LongformerMaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) - - -@add_start_docstrings( - """ - Longformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. 
- """, - LONGFORMER_START_DOCSTRING, -) -class LongformerForSequenceClassification(LongformerPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r"pooler"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - - self.longformer = LongformerModel(config, add_pooling_layer=False) - self.classifier = LongformerClassificationHead(config) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="jpelhaw/longformer-base-plagiarism-detection", - output_type=LongformerSequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - expected_output="'ORIGINAL'", - expected_loss=5.44, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerSequenceClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if global_attention_mask is None: - logger.info("Initializing global attention on CLS token...") - global_attention_mask = torch.zeros_like(input_ids) - # global attention on cls token - global_attention_mask[:, 0] = 1 - - outputs = self.longformer( - input_ids, - attention_mask=attention_mask, - global_attention_mask=global_attention_mask, - head_mask=head_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return LongformerSequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) - - -class LongformerClassificationHead(nn.Module): - """Head for 
sentence-level classification tasks.""" - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.out_proj = nn.Linear(config.hidden_size, config.num_labels) - - def forward(self, hidden_states, **kwargs): - hidden_states = hidden_states[:, 0, :] # take token (equiv. to [CLS]) - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - output = self.out_proj(hidden_states) - return output - - -@add_start_docstrings( - """ - Longformer Model with a span classification head on top for extractive question-answering tasks like SQuAD / - TriviaQA (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - """, - LONGFORMER_START_DOCSTRING, -) -class LongformerForQuestionAnswering(LongformerPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r"pooler"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.longformer = LongformerModel(config, add_pooling_layer=False) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=LongformerQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - start_positions: Optional[torch.Tensor] = None, - 
end_positions: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerQuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - Returns: - Examples: - ```python - >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering - >>> import torch - >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") - >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") - >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - >>> encoding = tokenizer(question, text, return_tensors="pt") - >>> input_ids = encoding["input_ids"] - >>> # default is local attention everywhere - >>> # the forward method will automatically set global attention on question tokens - >>> attention_mask = encoding["attention_mask"] - >>> outputs = model(input_ids, attention_mask=attention_mask) - >>> start_logits = outputs.start_logits - >>> end_logits = outputs.end_logits - >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) - >>> answer_tokens = all_tokens[torch.argmax(start_logits) : torch.argmax(end_logits) + 1] - >>> answer = tokenizer.decode( - 
... tokenizer.convert_tokens_to_ids(answer_tokens) - ... ) # remove space prepending space token - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if global_attention_mask is None: - if input_ids is None: - logger.warning( - "It is not possible to automatically generate the `global_attention_mask` because input_ids is" - " None. Please make sure that it is correctly set." - ) - else: - # set global attention on question tokens automatically - global_attention_mask = _compute_global_attention_mask(input_ids, self.config.sep_token_id) - - outputs = self.longformer( - input_ids, - attention_mask=attention_mask, - global_attention_mask=global_attention_mask, - head_mask=head_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - # because of batch=1 and not max_seq_length, the blow code not use. - padding_len = input_ids[0].eq(1).sum() - if padding_len > 0: - sequence_output = sequence_output[:, :-padding_len] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - # align to original longformer loss. 
- regular_softmax_loss = False - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - - if not regular_softmax_loss: - # loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf - # NOTE: this returns sum of losses, not mean, so loss won't be normalized across different batch sizes - # but batch size is always 1, so this is not a problem - start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1) - end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1) - else: - loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1) - start_positions = start_positions[:, 0:1] - end_positions = end_positions[:, 0:1] - start_loss = loss_fct(start_logits, start_positions[:, 0]) - end_loss = loss_fct(end_logits, end_positions[:, 0]) - - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - - return LongformerQuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) - - def or_softmax_cross_entropy_loss_one_doc(self, logits, target, ignore_index=-1, dim=-1): - """loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf""" - assert logits.ndim == 2 - assert target.ndim == 2 - assert logits.size(0) == target.size(0) - - # with regular CrossEntropyLoss, the numerator is only one of the logits specified by the target - # here, the numerator is the sum of a few potential targets, where some of them is the correct 
answer - - # compute a target mask - target_mask = target == ignore_index - # replaces ignore_index with 0, so `gather` will select logit at index 0 for the msked targets - masked_target = target * (1 - target_mask.long()) - # gather logits - gathered_logits = logits.gather(dim=dim, index=masked_target) - # Apply the mask to gathered_logits. Use a mask of -inf because exp(-inf) = 0 - gathered_logits[target_mask] = float('-inf') - - # each batch is one example - gathered_logits = gathered_logits.view(1, -1) - logits = logits.view(1, -1) - - # numerator = log(sum(exp(gathered logits))) - log_score = torch.logsumexp(gathered_logits, dim=dim, keepdim=False) - # denominator = log(sum(exp(logits))) - log_norm = torch.logsumexp(logits, dim=dim, keepdim=False) - - # compute the loss - loss = -(log_score - log_norm) - - # some of the examples might have a loss of `inf` when `target` is all `ignore_index`. - # remove those from the loss before computing the sum. Use sum instead of mean because - # it is easier to compute - return loss[~torch.isinf(loss)].sum() - - -@add_start_docstrings( - """ - Longformer Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. - for Named-Entity-Recognition (NER) tasks. 
- """, - LONGFORMER_START_DOCSTRING, -) -class LongformerForTokenClassification(LongformerPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r"pooler"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.longformer = LongformerModel(config, add_pooling_layer=False) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(LONGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="brad1141/Longformer-finetuned-norm", - output_type=LongformerTokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - expected_output=( - "['Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence', 'Evidence'," - " 'Evidence', 'Evidence', 'Evidence', 'Evidence']" - ), - expected_loss=0.63, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerTokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.longformer( - input_ids, - attention_mask=attention_mask, - global_attention_mask=global_attention_mask, - head_mask=head_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return LongformerTokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) - - -@add_start_docstrings( - """ - Longformer Model with a multiple choice classification head on top (a linear layer on top of the pooled output and - a softmax) e.g. for RocStories/SWAG tasks. 
- """, - LONGFORMER_START_DOCSTRING, -) -class LongformerForMultipleChoice(LongformerPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.longformer = LongformerModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - LONGFORMER_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") - ) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=LongformerMultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - global_attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, LongformerMultipleChoiceModelOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., - num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See - `input_ids` above) - """ - num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # set global attention on question tokens - if global_attention_mask is None and input_ids is not None: - logger.info("Initializing global attention on multiple choice...") - # put global attention on all tokens after `config.sep_token_id` - global_attention_mask = torch.stack( - [ - _compute_global_attention_mask(input_ids[:, i], self.config.sep_token_id, before_sep_token=False) - for i in range(num_choices) - ], - dim=1, - ) - - flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - flat_global_attention_mask = ( - global_attention_mask.view(-1, global_attention_mask.size(-1)) - if global_attention_mask is not None - else None - ) - flat_inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - outputs = self.longformer( - flat_input_ids, - position_ids=flat_position_ids, - token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, - global_attention_mask=flat_global_attention_mask, - head_mask=head_mask, - inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss 
= loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return LongformerMultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - global_attentions=outputs.global_attentions, - ) diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/requirements.txt b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/requirements.txt deleted file mode 100644 index 84310f9ea50..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -accelerate -datasets -transformers -torch==2.3.0 -neural-compressor==2.0 diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/run_qa_no_trainer.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/run_qa_no_trainer.py deleted file mode 100644 index a0ff5e1e30a..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/run_qa_no_trainer.py +++ /dev/null @@ -1,1305 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -# Apache v2 license -# Copyright (C) 2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Fine-tuning the library models for question answering. -""" -# You can also adapt this script on your own question answering task. Pointers for this are left as comments. -""" -This script is based on HuggingFace/transformers example: https://github.com/huggingface/transformers/blob/v4.6.1/examples/pytorch/question-answering/run_qa.py -Changes made to the script: - 1. Added pruning capabilities - 2. Added model distillation capabilities - 3. Added learning rate rewinding option - 4. Added methods to save all hyper-parameters used - 5. Added quantization capabilities -""" - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional -from collections import defaultdict -from tqdm.auto import tqdm -import math - -import torch -import datasets -from datasets import load_dataset, load_metric - -import transformers -from trainer_qa import QuestionAnsweringTrainer -from transformers import ( - AutoConfig, - AutoModelForQuestionAnswering, - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - HfArgumentParser, - PreTrainedTokenizerFast, - TrainingArguments, - default_data_collator, - set_seed, - get_scheduler, - CONFIG_MAPPING, - MODEL_MAPPING, - SchedulerType -) -from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version -from transformers.file_utils import get_full_repo_name - -from utils_qa import postprocess_qa_predictions - -from huggingface_hub import Repository - -from functools import partial -from accelerate import Accelerator -from torch.utils.data import DataLoader -import argparse -from accelerate.logging import get_logger -import numpy as np -import utils_qa -import json -from neural_compressor.training import Pruning, prepare_compression -from neural_compressor.training import WeightPruningConfig - -os.environ["WANDB_DISABLED"] = "true" -os.environ["HTTP_PROXY"] = "" - -# Will error if the minimal version of Transformers is not 
installed. Remove at your own risks. -check_min_version("4.6.0") - -logger = get_logger(__name__) -# You should update this to your particular problem to have better documentation of `model_type` -MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - -# (['loss', 'start_logits', 'end_logits']) -# batch(['attention_mask', 'end_positions', 'input_ids', 'start_positions', 'token_type_ids'] -def get_loss_one_logit(student_logit, teacher_logit): - t = 2.0 - from torch.nn import functional as F - return F.kl_div( - input=F.log_softmax(student_logit / t, dim=-1), - target=F.softmax(teacher_logit / t, dim=-1), - reduction="batchmean" - ) * (t ** 2) - -def save_prefixed_metrics(results, output_dir, file_name: str = "all_results.json", metric_key_prefix: str = "eval"): - """ - Save results while prefixing metric names. - Args: - results: (:obj:`dict`): - A dictionary of results. - output_dir: (:obj:`str`): - An output directory. - file_name: (:obj:`str`, `optional`, defaults to :obj:`all_results.json`): - An output file name. - metric_key_prefix: (:obj:`str`, `optional`, defaults to :obj:`eval`): - A metric name prefix. 
- """ - # Prefix all keys with metric_key_prefix + '_' - for key in list(results.keys()): - if not key.startswith(f"{metric_key_prefix}_"): - results[f"{metric_key_prefix}_{key}"] = results.pop(key) - - with open(os.path.join(output_dir, file_name), "w") as f: - json.dump(results, f, indent=4) - -def parse_args(): - parser = argparse.ArgumentParser(description="Finetune a transformers model on a Question Answering task") - parser.add_argument( - "--dataset_name", - type=str, - default=None, - help="The name of the dataset to use (via the datasets library).", - ) - parser.add_argument( - "--dataset_config_name", - type=str, - default=None, - help="The configuration name of the dataset to use (via the datasets library).", - ) - parser.add_argument( - "--train_file", - type=str, - default=None, - help="A csv or a json file containing the training data." - ) - parser.add_argument( - "--preprocessing_num_workers", - type=int, default=10, - help="A csv or a json file containing the training data." - ) - - parser.add_argument( - "--do_predict", - action="store_true", - help="To do prediction on the question answering model" - ) - parser.add_argument( - "--validation_file", - type=str, - default=None, - help="A csv or a json file containing the validation data." - ) - parser.add_argument( - "--test_file", - type=str, - default=None, - help="A csv or a json file containing the Prediction data." - ) - parser.add_argument( - "--max_seq_length", - type=int, - default=384, - help=( - "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated," - " sequences shorter will be padded if `--pad_to_max_lengh` is passed." - ), - ) - parser.add_argument( - "--pad_to_max_length", - action="store_true", - help="If passed, pad all samples to `max_seq_length`. Otherwise, dynamic padding is used.", - ) - parser.add_argument( - "--model_name_or_path", - type=str, - help="Path to pretrained model or model identifier from huggingface.co/models." 
- ) - parser.add_argument( - "--teacher_model_name_or_path", - type=str, - default=None, - help="Path to pretrained model or model identifier from huggingface.co/models.", - required=False - ) - parser.add_argument( - "--config_name", - type=str, - default=None, - help="Pretrained config name or path if not the same as model_name", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--use_slow_tokenizer", - action="store_true", - help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", - ) - parser.add_argument( - "--per_device_train_batch_size", - type=int, - default=8, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument( - "--distill_loss_weight", - type=float, - default=0.0, - help="distiller loss weight" - ) - parser.add_argument( - "--per_device_eval_batch_size", - type=int, - default=8, - help="Batch size (per device) for the evaluation dataloader.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-5, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--weight_decay", - type=float, - default=0.0, - help="Weight decay to use." - ) - parser.add_argument( - "--num_train_epochs", - type=int, - default=3, - help="Total number of training epochs to perform." - ) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. 
If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--lr_scheduler_type", - type=SchedulerType, - default="linear", - help="The scheduler type to use.", - choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], - ) - - parser.add_argument( - "--warm_epochs", - type=int, - default=0, - help="Number of epochs the network not be purned" - ) - parser.add_argument( - "--num_warmup_steps", - type=int, - default=0, - help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--output_dir", - type=str, - default=None, - help="Where to store the final model." - ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training." - ) - parser.add_argument( - "--doc_stride", - type=int, - default=128, - help="When splitting up a long document into chunks how much stride to take between chunks.", - ) - parser.add_argument( - "--n_best_size", - type=int, - default=20, - help="The total number of n-best predictions to generate when looking for an answer.", - ) - parser.add_argument( - "--null_score_diff_threshold", - type=float, - default=0.0, - help=( - "The threshold used to select the null answer: if the best answer has a score that is less than " - "the score of the null answer minus this threshold, the null answer is selected for this example. " - "Only useful when `version_2_with_negative=True`." - ), - ) - parser.add_argument( - "--version_2_with_negative", - action="store_true", - help="If true, some of the examples do not have an answer.", - ) - parser.add_argument( - "--max_answer_length", - type=int, - default=30, - help=( - "The maximum length of an answer that can be generated. 
This is needed because the start " - "and end predictions are not conditioned on one another." - ), - ) - parser.add_argument( - "--max_train_samples", - type=int, - default=None, - help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ), - ) - parser.add_argument( - "--max_eval_samples", - type=int, - default=None, - help=( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ), - ) - parser.add_argument( - "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets" - ) - parser.add_argument( - "--max_predict_samples", - type=int, - default=None, - help="For debugging purposes or quicker training, truncate the number of prediction examples to this", - ) - parser.add_argument( - "--model_type", - type=str, - default=None, - help="Model type to use if training from scratch.", - choices=MODEL_TYPES, - ) - parser.add_argument( - "--cooldown_epochs", - type=int, default=0, - help="Cooling epochs after pruning." - ) - parser.add_argument( - "--do_prune", action="store_true", - help="Whether or not to prune the model" - ) - parser.add_argument( - "--pruning_scope", - type=str, default="global", - help="pruning scope, we support global and local." - ) - parser.add_argument( - "--pruning_pattern", - type=str, default="4x1", - help="pruning pattern type, we support NxM and N:M." - ) - parser.add_argument( - "--target_sparsity", - type=float, default=0.8, - help="Target sparsity of the model." - ) - parser.add_argument( - "--pruning_frequency", - type=int, default=-1, - help="Sparse step frequency for iterative pruning, default to a quarter of pruning steps." 
- ) - - parser.add_argument( - "--keep_conf", action="store_true", - help="Whether or not to keep the prune config infos" - ) - parser.add_argument( - "--pruning_config", - type=str, - help="pruning_config" - ) - - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the model to the Hub." - ) - parser.add_argument( - "--hub_model_id", - type=str, - help="The name of the repository to keep in sync with the local `output_dir`." - ) - parser.add_argument( - "--hub_token", - type=str, - help="The token to use to push to the Model Hub." - ) - parser.add_argument( - "--checkpointing_steps", - type=str, - default=None, - help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", - ) - parser.add_argument( - "--resume_from_checkpoint", - type=str, - default=None, - help="If the training should continue from a checkpoint folder.", - ) - - parser.add_argument( - "--with_tracking", - action="store_true", - help="Whether to enable experiment trackers for logging.", - ) - parser.add_argument( - "--report_to", - type=str, - default="all", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' - ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' - "Only applicable when `--with_tracking` is passed." 
- ), - ) - - parser.add_argument( - "--cache_dir", - type=str, - default=None, - help="Path to directory to store the pretrained models downloaded from huggingface.co", - ) - - parser.add_argument( - "--model_revision", - type=str, - default="main", - help="The specific model version to use (can be a branch name, tag name or commit id).", - ) - - parser.add_argument( - "--use_auth_token", - type=bool, - default=False, - help="Will use the token generated when running `transformers-cli login` (necessary to use this script with private models).", - ) - - parser.add_argument( - "--do_train", - action="store_true", - help="Whether to run training.", - ) - - parser.add_argument( - "--do_eval", - action="store_true", - help="Whether to run eval on the dev set.", - ) - - args = parser.parse_args() - - # Sanity checks - if ( - args.dataset_name is None - and args.train_file is None - and args.validation_file is None - and args.test_file is None - ): - raise ValueError("Need either a dataset name or a training/validation/test file.") - else: - if args.train_file is not None: - extension = args.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if args.validation_file is not None: - extension = args.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if args.test_file is not None: - extension = args.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." - - if args.push_to_hub: - assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." - - return args - -def main(): - - args = parse_args() - - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. 
- # send_example_telemetry("run_qa_no_trainer", args) - - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers - # in the environment - - accelerator = ( - Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() - ) - - ''' - accelerator_log_kwargs = {} - if args.with_tracking: - accelerator_log_kwargs["log_with"] = args.report_to - accelerator_log_kwargs["logging_dir"] = args.output_dir - accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) - ''' - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state, main_process_only=False) - if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. 
- if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if accelerator.is_main_process: - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - repo = Repository(args.output_dir, clone_from=repo_name) - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - elif args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - accelerator.wait_for_everyone() - - script_path = os.path.split(os.path.abspath(__file__))[0] - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if args.dataset_name is not None: - # Downloading and loading a dataset from the hub. 
- raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir) - else: - data_files = {} - if args.train_file is not None: - data_files["train"] = args.train_file - extension = args.train_file.split(".")[-1] - - if args.validation_file is not None: - data_files["dev"] = args.validation_file - extension = args.validation_file.split(".")[-1] - if args.test_file is not None: - data_files["test"] = args.test_file - extension = args.test_file.split(".")[-1] - # datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) - raw_datasets = load_dataset(os.path.join(script_path, "squad.py"), data_files=data_files, cache_dir=args.cache_dir) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir, - revision=args.model_revision, - use_auth_token=True if args.use_auth_token else None, - ) - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - cache_dir=args.cache_dir, - use_fast=True, - revision=args.model_revision, - use_auth_token=True if args.use_auth_token else None, - ) - - # local py module - from modeling_longformer import LongformerForQuestionAnswering - model_class = LongformerForQuestionAnswering - - if args.distill_loss_weight > 0: - teacher_path = args.teacher_model_name_or_path - if teacher_path is None: - teacher_path = args.model_name_or_path - teacher_model = model_class.from_pretrained( - teacher_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - ) - - if args.model_name_or_path: - model = model_class.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir, - revision=args.model_revision, - use_auth_token=True if args.use_auth_token else None, - ) - else: - logger.info("Training new model from scratch") - model = model_class.from_config(config) - - # Preprocessing the datasets. - # Preprocessing is slightly different for training and evaluation. - if args.do_train: - column_names = raw_datasets["train"].column_names - elif args.do_eval: - column_names = raw_datasets["validation"].column_names - else: - column_names = raw_datasets["test"].column_names - question_column_name = "question" if "question" in column_names else column_names[0] - context_column_name = "context" if "context" in column_names else column_names[1] - answer_column_name = "answers" if "answers" in column_names else column_names[2] - - # Padding side determines if we do (question|context) or (context|question). 
- # pad_on_right = tokenizer.padding_side == "right" - # max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - max_seq_length = args.max_seq_length - - # preprocess context and answers - def preprocess_context(examples): - new_examples = {} - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - def pre_tokenize(p): - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in p: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - return ' '.join(doc_tokens), char_to_word_offset - - new_examples[context_column_name] = [] - new_examples["answer_spans"] = [] - for i, p in enumerate(examples[context_column_name]): - tokenized_p, char_to_word_offset = pre_tokenize(p) - new_examples[context_column_name].append(tokenized_p) - - answer_spans = [] - for orig_answer_text, answer_offset in zip(examples[answer_column_name][i]['text'], examples[answer_column_name][i]['answer_start']): - answer_length = len(orig_answer_text) - try: - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - token_ids = tokenizer.encode(orig_answer_text) - except RuntimeError: - logger.info(f'Reading example {idx} failed') - start_position = 0 - end_position = 0 - answer_spans.append({'start': start_position, 'end': end_position, - 'text': orig_answer_text, 'token_ids': token_ids}) - new_examples["answer_spans"].append(answer_spans) - - for key in examples: - if key != context_column_name: - new_examples[key] = examples[key] - return new_examples - - # preprocessing - def prepare_features(examples, max_question_len=55, max_doc_len=4096, max_num_answers=64, ignore_seq_with_no_answers=False, mode="eval"): - - tokenized_examples = {} - 
tokenized_examples["input_ids"] = [] - tokenized_examples["attention_mask"] = [] - if mode == "train": - tokenized_examples["start_positions"] = [] - tokenized_examples["end_positions"] = [] - elif mode == "eval": - tokenized_examples["example_id"] = [] - else: - raise NotImplementedError("not implemented yet.") - - # not use for roberta - #tokenized_examples["token_type_ids"] = [] - - # Some of the questions have lots of whitespace on the left, which is not useful and will make the - # truncation of the context fail (the tokenized question will take a lots of space). So we remove that - # left whitespace - examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] - - for example_index in range(len(examples[question_column_name])): - question_text = examples[question_column_name][example_index] - query_tokens = tokenizer.tokenize(question_text) - query_tokens = query_tokens[:max_question_len] - doc_tokens = examples[context_column_name][example_index].split(" ") - answer_spans = examples["answer_spans"][example_index] - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(f'. 
{token}')[1:] if i > 0 else tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - all_doc_tokens = all_doc_tokens[:max_doc_len] - # The -3 accounts for , and - max_tokens_per_doc_slice = max_seq_length - len(query_tokens) - 3 - assert max_tokens_per_doc_slice > 0 - - if args.doc_stride < 0: - # negative doc_stride indicates no sliding window, but using first slice - args.doc_stride = -100 * len(all_doc_tokens) # large -ve value for the next loop to execute once - - input_ids_list = [] - input_mask_list = [] - segment_ids_list = [] - start_positions_list = [] - end_positions_list = [] - answer_token_ids_list = [] - - for slice_start in range(0, len(all_doc_tokens), max_tokens_per_doc_slice - args.doc_stride): - slice_end = min(slice_start + max_tokens_per_doc_slice, len(all_doc_tokens)) - doc_slice_tokens = all_doc_tokens[slice_start:slice_end] - tokens = [tokenizer.cls_token] + query_tokens + [tokenizer.sep_token] \ - + doc_slice_tokens + [tokenizer.sep_token] - - # but don't use for roberta - segment_ids = [0] * (len(query_tokens) + 2) + [1] * (len(doc_slice_tokens) + 1) - assert len(segment_ids) == len(tokens) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - input_mask = [1] * len(input_ids) - - #if data_args.pad_to_max_length: # no need to pad if document is not strided - if False: - # Zero-pad up to the sequence length. 
- padding_len = max_seq_length - len(input_ids) - input_ids.extend([tokenizer.pad_token_id] * padding_len) - input_mask.extend([0] * padding_len) - segment_ids.extend([0] * padding_len) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - doc_offset = len(query_tokens) + 2 - slice_start - - start_positions = [] - end_positions = [] - answer_token_ids = [] - for answer_span in answer_spans: - start_position = answer_span['start'] - end_position = answer_span['end'] - tok_start_position_in_doc = orig_to_tok_index[start_position] - not_end_of_doc = int(end_position + 1 < len(orig_to_tok_index)) - tok_end_position_in_doc = orig_to_tok_index[end_position + not_end_of_doc] - not_end_of_doc - if tok_start_position_in_doc < slice_start or tok_end_position_in_doc > slice_end: - # this answer is outside the current slice - continue - - start_positions.append(tok_start_position_in_doc + doc_offset) - end_positions.append(tok_end_position_in_doc + doc_offset) - answer_token_ids.append(answer_span['token_ids']) - - assert len(start_positions) == len(end_positions) - if ignore_seq_with_no_answers and len(start_positions) == 0: - continue - - # answers from start_positions and end_positions if > self.max_num_answers - start_positions = start_positions[:max_num_answers] - end_positions = end_positions[:max_num_answers] - answer_token_ids = answer_token_ids[:max_num_answers] - - # -1 padding up to self.max_num_answers - # -1 means empty answer in last token, while normal squad in [CLS] token - padding_len = max_num_answers - len(start_positions) - start_positions.extend([-1] * padding_len) - end_positions.extend([-1] * padding_len) - answer_token_ids.extend([[]] * padding_len) - - # replace duplicate start/end positions with `-1` because duplicates can result into -ve loss values - found_start_positions = set() - found_end_positions = set() - found_answer_token_ids = set() - for i, (start_position, 
end_position, answer_tokens) in enumerate( - zip(start_positions, end_positions, answer_token_ids) - ): - if start_position in found_start_positions: - start_positions[i] = -1 - if end_position in found_end_positions: - end_positions[i] = -1 - answer_tokens_as_str = ','.join([str(x) for x in answer_tokens]) - if answer_tokens_as_str in found_answer_token_ids: - answer_token_ids[i] = [] - - found_start_positions.add(start_position) - found_end_positions.add(end_position) - found_answer_token_ids.add(answer_tokens_as_str) - - input_ids_list.append(input_ids) - input_mask_list.append(input_mask) - segment_ids_list.append(segment_ids) - start_positions_list.append(start_positions) - end_positions_list.append(end_positions) - answer_token_ids_list.append(answer_token_ids) - - # pad answers in answer_token_ids_list to the longest answer - max_answer_len = max([len(item) for sublist in answer_token_ids_list for item in sublist]) # flat list - if max_answer_len == 0: - max_answer_len = 2 - for answers_of_one_slice in answer_token_ids_list: - for answer_tokens in answers_of_one_slice: - if len(answer_tokens) == 0: - # TODO: or ? 
- padding_len = max_answer_len - len(answer_tokens) - 2 - answer_tokens.extend([tokenizer.bos_token_id, tokenizer.eos_token_id] + - ([tokenizer.pad_token_id] * padding_len)) - else: - padding_len = max_answer_len - len(answer_tokens) - answer_tokens.extend([tokenizer.pad_token_id] * padding_len) - - - tokenized_examples["input_ids"].extend(input_ids_list) - tokenized_examples["attention_mask"].extend(input_mask_list) - - if mode == "train": - # only one answer used for training - #tokenized_examples["start_positions"].extend([each[0] for each in start_positions_list]) - #tokenized_examples["end_positions"].extend([each[0] for each in end_positions_list]) - tokenized_examples["start_positions"].append(start_positions_list[0]) - tokenized_examples["end_positions"].append(end_positions_list[0]) - elif mode == "eval": - tokenized_examples["example_id"].append(examples["id"][example_index]) - - return tokenized_examples - - prepare_train_features = partial(prepare_features, mode="train") - if args.do_train: - if "train" not in raw_datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = raw_datasets["train"] - if args.max_train_samples is not None: - # We will select sample from whole data if augment is specified - train_dataset = train_dataset.select(range(args.max_train_samples)) - with accelerator.main_process_first(): - # preprocess - train_dataset = train_dataset.map( - preprocess_context, - batched=True, - num_proc=args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not args.overwrite_cache, - ) - - # Create train feature from dataset - train_dataset = train_dataset.map( - prepare_train_features, - batched=True, - num_proc=args.preprocessing_num_workers, - remove_columns=column_names + ["answer_spans"], - load_from_cache_file=not args.overwrite_cache, - ) - if args.max_train_samples is not None: - # Number of samples might increase during Feature Creation, We select only specified max samples - 
train_dataset = train_dataset.select(range(args.max_train_samples)) - - prepare_validation_features = partial(prepare_features, mode="eval") - - if args.do_eval: - if "validation" not in raw_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_examples = raw_datasets["validation"] - if args.max_eval_samples is not None: - # We will select sample from whole data - eval_examples = eval_examples.select(range(args.max_eval_samples)) - with accelerator.main_process_first(): - # preprocess - eval_examples = eval_examples.map( - preprocess_context, - batched=True, - num_proc=args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not args.overwrite_cache, - ) - # Validation Feature Creation - eval_dataset = eval_examples.map( - prepare_validation_features, - batched=True, - num_proc=args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not args.overwrite_cache, - ) - - if args.max_eval_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - eval_dataset = eval_dataset.select(range(args.max_eval_samples)) - - - # DataLoaders creation: - if args.pad_to_max_length: - # If padding was already done ot max length, we use the default data collator that will just convert everything - # to tensors. - data_collator = default_data_collator - else: - # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of - # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple - # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). 
- data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) - - train_dataloader = DataLoader( - train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size - ) - - eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "answer_spans"]) - eval_dataloader = DataLoader( - eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size - ) - - # Post-processing: - def post_processing_function(examples, features, predictions, stage="eval"): - # Post-processing: we match the start logits and end logits to answers in the original context. - predictions = postprocess_qa_predictions( - examples=examples, - features=features, - predictions=predictions, - tokenizer=tokenizer, - version_2_with_negative=args.version_2_with_negative, - n_best_size=args.n_best_size, - max_answer_length=args.max_answer_length, - null_score_diff_threshold=args.null_score_diff_threshold, - output_dir=args.output_dir, - prefix=stage, - ) - # Format the result to the format the metric expects. - if args.version_2_with_negative: - formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() - ] - else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] - - references = [{"id": ex["id"], "answers": ex[answer_column_name], "aliases": ex["aliases"]} for ex in examples] - - return EvalPrediction(predictions=predictions, label_ids=references) - - # Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor - def create_and_fill_np_array(start_or_end_logits, dataset, max_len): - """ - Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor - Args: - start_or_end_logits(:obj:`tensor`): - This is the output predictions of the model. We can only enter either start or end logits. 
- eval_dataset: Evaluation dataset - max_len(:obj:`int`): - The maximum length of the output tensor. ( See the model.eval() part for more details ) - """ - - step = 0 - # create a numpy array and fill it with -100. - logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64) - # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather - for i, output_logit in enumerate(start_or_end_logits): # populate columns - # We have to fill it such that we have to take the whole tensor and replace it on the newly created array - # And after every iteration we have to change the step - - batch_size = output_logit.shape[0] - cols = output_logit.shape[1] - - if step + batch_size < len(dataset): - logits_concat[step: step + batch_size, :cols] = output_logit - else: - logits_concat[step:, :cols] = output_logit[: len(dataset) - step] - - step += batch_size - - return logits_concat - - # Optimizer - # Split weights in two groups, one with weight decay and the other not. - no_decay = ["bias", "LayerNorm.weight"] - no_decay_outputs = ["bias", "LayerNorm.weight", "qa_outputs"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, - ] - if args.do_prune: - optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, betas=[0.9, 0.9]) - else: - optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - - # Scheduler and math around the number of training steps. 
- overrode_max_train_steps = False - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - overrode_max_train_steps = True - - lr_scheduler = get_scheduler( - name=args.lr_scheduler_type, - optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, - ) - - if args.distill_loss_weight > 0: - teacher_model, model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( - teacher_model, model, optimizer, train_dataloader, eval_dataloader, lr_scheduler - ) - teacher_model.eval() - else: - # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( - model, optimizer, train_dataloader, eval_dataloader, lr_scheduler - ) - # We need to recalculate our total training steps as the size of the training dataloader may have changed. - num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - if overrode_max_train_steps: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - # Figure out how many steps we should save the Accelerator states - if hasattr(args.checkpointing_steps, "isdigit"): - checkpointing_steps = args.checkpointing_steps - if args.checkpointing_steps.isdigit(): - checkpointing_steps = int(args.checkpointing_steps) - else: - checkpointing_steps = None - - # We need to initialize the trackers we use, and also store our configuration. - # We initialize the trackers only on main process because `accelerator.log` - # only logs on main process and we don't want empty logs/runs on other processes. 
- if args.with_tracking: - if accelerator.is_main_process: - experiment_config = vars(args) - # TensorBoard cannot log Enums, need the raw value - experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value - accelerator.init_trackers("qa_no_trainer", experiment_config) - - # Train! - total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") - logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {args.max_train_steps}") - - # Only show the progress bar once on each machine. - progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) - completed_steps = 0 - starting_epoch = 0 - - # Potentially load in the weights and states from a previous save - if args.resume_from_checkpoint: - if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": - accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") - accelerator.load_state(args.resume_from_checkpoint) - path = os.path.basename(args.resume_from_checkpoint) - else: - # Get the most recent checkpoint - dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] - dirs.sort(key=os.path.getctime) - path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - # Extract `epoch_{i}` or `step_{i}` - training_difference = os.path.splitext(path)[0] - - if "epoch" in training_difference: - starting_epoch = int(training_difference.replace("epoch_", "")) + 1 - resume_step = None - else: - resume_step = 
int(training_difference.replace("step_", "")) - starting_epoch = resume_step // len(train_dataloader) - resume_step -= starting_epoch * len(train_dataloader) - - # Pruning preparation - num_iterations = len(train_dataset) / total_batch_size - num_warm = int(args.warm_epochs * num_iterations) + args.num_warmup_steps - total_iterations = int(num_iterations * (args.num_train_epochs - args.cooldown_epochs)) - frequency = int((total_iterations - num_warm + 1) / 40) if args.pruning_frequency == -1 \ - else args.pruning_frequency - - pruning_start = num_warm - pruning_end = total_iterations - if not args.do_prune: - pruning_start = num_iterations * args.num_train_epochs + 1 - pruning_end = pruning_start - - pruning_configs=[ - { - "pruning_type": "snip_momentum", - "pruning_scope": "global", - "sparsity_decay_type": "exp", - "excluded_op_names": ["qa_outputs", "pooler", ".*embeddings*"], - "pruning_op_types": ["Linear"], - "max_sparsity_ratio_per_op": 0.98 - } - ] - - configs = WeightPruningConfig( - pruning_configs, - pruning_scope=args.pruning_scope, - target_sparsity=args.target_sparsity, - pattern=args.pruning_pattern, - pruning_frequency=frequency, - start_step=pruning_start, - end_step=pruning_end - ) - - compression_manager = prepare_compression(model=model, confs=configs) - compression_manager.callbacks.on_train_begin() - model = compression_manager.model - - - for epoch in range(starting_epoch, args.num_train_epochs): - model.train() - if epoch >= args.warm_epochs: - if args.with_tracking: - total_loss = 0 - for step, batch in enumerate(train_dataloader): - compression_manager.callbacks.on_step_begin(step) - - outputs = model(**batch) - loss = outputs.loss - # We keep track of the loss at each epoch - if args.with_tracking: - total_loss += loss.detach().float() - if args.distill_loss_weight > 0: - distill_loss_weight = args.distill_loss_weight - with torch.no_grad(): - teacher_outputs = teacher_model(**batch) - loss = (distill_loss_weight) / 2 * 
get_loss_one_logit(outputs['start_logits'], - teacher_outputs['start_logits']) \ - + (distill_loss_weight) / 2 * get_loss_one_logit(outputs['end_logits'], - teacher_outputs['end_logits']) - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: - compression_manager.callbacks.on_before_optimizer_step() - optimizer.step() - compression_manager.callbacks.on_after_optimizer_step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) - completed_steps += 1 - - - if isinstance(checkpointing_steps, int): - if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" - if args.output_dir is not None: - output_dir = os.path.join(args.output_dir, output_dir) - accelerator.save_state(output_dir) - - if completed_steps >= args.max_train_steps: - break - else: - for step, batch in enumerate(train_dataloader): - outputs = model(**batch) - loss = outputs.loss - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) - completed_steps += 1 - - if completed_steps >= args.max_train_steps: - break - - if args.checkpointing_steps == "epoch": - output_dir = f"epoch_{epoch}" - if args.output_dir is not None: - output_dir = os.path.join(args.output_dir, output_dir) - accelerator.save_state(output_dir) - - if args.push_to_hub and epoch < args.num_train_epochs - 1: - accelerator.wait_for_everyone() - unwrapped_model = accelerator.unwrap_model(model) - unwrapped_model.save_pretrained( - args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save - ) - if accelerator.is_main_process: - tokenizer.save_pretrained(args.output_dir) - repo.push_to_hub( - commit_message=f"Training in progress epoch {epoch}", 
blocking=False, auto_lfs_prune=True - ) - - # eval each epoch - logger.info(f"***** Running Evaluation*****") - all_start_logits = [] - all_end_logits = [] - - # pruner.on_before_eval() - model.eval() - for step, batch in enumerate(eval_dataloader): - with torch.no_grad(): - outputs = model(**batch) - start_logits = outputs.start_logits - end_logits = outputs.end_logits - - if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered - start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100) - end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100) - - all_start_logits.append(accelerator.gather(start_logits).cpu().numpy()) - all_end_logits.append(accelerator.gather(end_logits).cpu().numpy()) - - max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor - # pruner.on_after_eval() - - # concatenate the numpy array - start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len) - end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len) - - # delete the list of numpy arrays - del all_start_logits - del all_end_logits - - outputs_numpy = (start_logits_concat, end_logits_concat) - eval_preds = post_processing_function(eval_examples, eval_dataset, outputs_numpy) - - metrics = utils_qa.evaluate_triviaqa(eval_preds.label_ids, eval_preds.predictions) - logger.info(metrics) - - - if args.output_dir is not None: - accelerator.wait_for_everyone() - unwrapped_model = accelerator.unwrap_model(model.model) - unwrapped_model.save_pretrained( - args.output_dir + f"eph{args.num_train_epochs}_lr{args.learning_rate}_bs{total_batch_size}", - is_main_process=accelerator.is_main_process, save_function=accelerator.save - ) - if accelerator.is_main_process: - tokenizer.save_pretrained(args.output_dir) - if args.push_to_hub: - repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) - - 
logger.info(json.dumps(metrics, indent=4)) - save_prefixed_metrics(metrics, args.output_dir) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/download_data_and_convert.sh b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/download_data_and_convert.sh deleted file mode 100644 index f0d7d0f3fa4..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/download_data_and_convert.sh +++ /dev/null @@ -1,19 +0,0 @@ -# from http://nlp.cs.washington.edu/triviaqa/ and https://github.com/mandarjoshi90/triviaqa -wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz - -tar -xvzf triviaqa-rc.tar.gz - -# the blow codes from the original paper code: https://github.com/allenai/longformer -python -m utils.convert_to_squad_format \ - --triviaqa_file ./qa/wikipedia-train.json \ - --wikipedia_dir ./evidence/wikipedia/ \ - --web_dir ./evidence/web/ \ - --max_num_tokens 4096 \ - --squad_file squad-wikipedia-train-4096.json - -python utils.convert_to_squad_format \ - --triviaqa_file ./qa/wikipedia-dev.json \ - --wikipedia_dir ./evidence/wikipedia/ \ - --web_dir ./evidence/web/ \ - --max_num_tokens 4096 \ - --squad_file squad-wikipedia-dev-4096.json diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_dense_fintune.sh b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_dense_fintune.sh deleted file mode 100644 index ce21e329c16..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_dense_fintune.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -set -x - -train_file=./squad-wikipedia-train-4096.json -validation_file=./squad-wikipedia-dev-4096.json 
-pretrained_model=allenai/longformer-base-4096 - -accelerate launch --main_process_port 29245 run_qa_no_trainer.py \ - --model_name_or_path $pretrained_model \ - --do_train \ - --do_eval \ - --train_file $train_file \ - --validation_file $validation_file \ - --cache_dir ./tmp_cached \ - --max_seq_length 4096 \ - --doc_stride -1 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --per_device_eval_batch_size 1 \ - --num_warmup_steps 1000 \ - --learning_rate 3.5e-5 \ - --num_train_epochs 4 \ - --output_dir longformer-base-4096-dense-baseline diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_sparse_global_4x1_pruning.sh b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_sparse_global_4x1_pruning.sh deleted file mode 100644 index 3c08207aa62..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/scripts/longformer_base_sparse_global_4x1_pruning.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -set -x - -train_file=./squad-wikipedia-train-4096.json -validation_file=./squad-wikipedia-dev-4096.json -teacher_model=Intel/longformer-base-4096-finetuned-triviaqa - -accelerate launch --main_process_port 29745 run_qa_no_trainer.py \ - --model_name_or_path $teacher_model \ - --do_train \ - --do_eval \ - --train_file $train_file \ - --validation_file $validation_file \ - --cache_dir ./tmp_cached \ - --max_seq_length 4096 \ - --doc_stride -1 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 8 \ - --per_device_eval_batch_size 1 \ - --num_warmup_steps 1000 \ - --do_prune \ - --target_sparsity 0.8 \ - --pruning_scope "global" \ - --pruning_pattern "4x1" \ - --pruning_frequency 1000 \ - --cooldown_epochs 10 \ - --learning_rate 1e-4 \ - --num_train_epochs 18 \ - --weight_decay 0.01 \ - --output_dir longformer-base-4096-pruned-global-sparse80 \ - --teacher_model_name_or_path $teacher_model \ - 
--distill_loss_weight 3 diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/squad.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/squad.py deleted file mode 100644 index b9a2847449d..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/squad.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Lint as: python3 -"""SQUAD: The Stanford Question Answering Dataset.""" - - -import json - -import datasets -from datasets.tasks import QuestionAnsweringExtractive - - -logger = datasets.logging.get_logger(__name__) - - -_CITATION = """\ -@article{2016arXiv160605250R, - author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev}, - Konstantin and {Liang}, Percy}, - title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}", - journal = {arXiv e-prints}, - year = 2016, - eid = {arXiv:1606.05250}, - pages = {arXiv:1606.05250}, -archivePrefix = {arXiv}, - eprint = {1606.05250}, -} -""" - -_DESCRIPTION = """\ -Stanford Question Answering Dataset (SQuAD) is a reading comprehension \ -dataset, consisting of questions posed by crowdworkers on a set of Wikipedia \ -articles, where the answer to every question is a segment of text, or span, \ -from the corresponding reading passage, or the question might be unanswerable. 
-""" - -_URL = "https://rajpurkar.github.io/SQuAD-explorer/dataset/" -_URLS = { - "train": _URL + "train-v1.1.json", - "dev": _URL + "dev-v1.1.json", -} - - -class SquadConfig(datasets.BuilderConfig): - """BuilderConfig for SQUAD.""" - - def __init__(self, **kwargs): - """BuilderConfig for SQUAD. - Args: - **kwargs: keyword arguments forwarded to super. - """ - super(SquadConfig, self).__init__(**kwargs) - - -class Squad(datasets.GeneratorBasedBuilder): - """SQUAD: The Stanford Question Answering Dataset. Version 1.1.""" - - BUILDER_CONFIGS = [ - SquadConfig( - name="plain_text", - version=datasets.Version("1.0.0", ""), - description="Plain text", - ), - ] - print(BUILDER_CONFIGS) - - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "id": datasets.Value("string"), - "title": datasets.Value("string"), - "context": datasets.Value("string"), - "question": datasets.Value("string"), - "answers": datasets.features.Sequence( - { - "text": datasets.Value("string"), - "answer_start": datasets.Value("int32"), - } - ), - "aliases": datasets.features.Sequence(datasets.Value("string")), - } - ), - # No default supervised_keys (as we have to pass both question - # and context as input). 
- supervised_keys=None, - homepage="https://rajpurkar.github.io/SQuAD-explorer/", - citation=_CITATION, - task_templates=[ - QuestionAnsweringExtractive( - question_column="question", context_column="context", answers_column="answers" - ) - ], - ) - - def _split_generators(self, dl_manager): - #downloaded_files = dl_manager.download_and_extract(_URLS) - downloaded_files = self.config.data_files - return [ - datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"][0]}), - datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"][0]}), - ] - - def _generate_examples(self, filepath): - """This function returns the examples in the raw (text) form.""" - logger.info("generating examples from = %s", filepath) - key = 0 - with open(filepath, encoding="utf-8") as f: - squad = json.load(f) - for article in squad["data"]: - title = article.get("title", "") - for paragraph in article["paragraphs"]: - context = paragraph["context"] # do not strip leading blank spaces GH-2585 - for qa in paragraph["qas"]: - answer_starts = [answer["answer_start"] for answer in qa["answers"]] - answers = [answer["text"] for answer in qa["answers"]] - # Features currently used are "context", "question", and "answers". - # Others are extracted here for the ease of future expansions. 
- yield key, { - "title": title, - "context": context, - "question": qa["question"], - "id": qa["id"].split('--')[0], - "answers": { - "answer_start": answer_starts, - "text": answers, - }, - "aliases": qa["aliases"] if qa.get("aliases") is not None else [], - } - key += 1 diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/trainer_qa.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/trainer_qa.py deleted file mode 100644 index af237521e8a..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/trainer_qa.py +++ /dev/null @@ -1,150 +0,0 @@ -# coding=utf-8 - -# Apache v2 license -# Copyright (C) 2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -A subclass of `Trainer` specific to Question-Answering tasks -""" -""" -This script is based on HuggingFace/transformers example: https://github.com/huggingface/transformers/blob/v4.6.1/examples/pytorch/question-answering/trainer_qa.py -""" - -from transformers import Trainer, is_torch_tpu_available -from transformers.trainer_utils import PredictionOutput -import utils_qa -import collections -from collections import defaultdict -import numpy as np -import torch -import json - - -if is_torch_tpu_available(): - import torch_xla.core.xla_model as xm - import torch_xla.debug.metrics as met - - -class QuestionAnsweringTrainer(Trainer): - def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): - super().__init__(*args, **kwargs) - self.eval_examples = eval_examples - self.post_process_function = post_process_function - - def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None): - eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset - eval_dataloader = self.get_eval_dataloader(eval_dataset) - eval_examples = self.eval_examples if eval_examples is None else eval_examples - - # Temporarily disable metric computation, we will do it in the loop here. 
- compute_metrics = self.compute_metrics - self.compute_metrics = None - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - try: - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=None, - ignore_keys=ignore_keys, - ) - finally: - self.compute_metrics = compute_metrics - - if self.post_process_function is not None and self.compute_metrics is None: - eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) - metrics = utils_qa.evaluate_triviaqa(eval_preds.label_ids, eval_preds.predictions) - #metrics = self.compute_metrics(eval_preds) - - #self.log(metrics) - else: - metrics = {} - - #if self.args.tpu_metrics_debug or self.args.debug: - # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) - # xm.master_print(met.metrics_report()) - - #self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) - return metrics - - def predict(self, predict_dataset, predict_examples, ignore_keys=None, n_best_size=20, max_answer_length=30): - predict_dataloader = self.get_test_dataloader(predict_dataset) - - # Temporarily disable metric computation, we will do it in the loop here. 
- compute_metrics = self.compute_metrics - self.compute_metrics = None - eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop - output = eval_loop( - predict_dataloader, - description="Prediction", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=None, - ignore_keys=ignore_keys, - ) - - all_start_logits, all_end_logits = output.predictions - - all_predictions = collections.OrderedDict() - - qa_with_duplicates = defaultdict(list) - - for example_index, example in enumerate(predict_examples): - input_ids = torch.tensor([predict_dataset[example_index]["input_ids"]]) - qid = predict_dataset[example_index]["example_id"] - - eos_token_indices = (input_ids == self.tokenizer.eos_token_id).nonzero() - question_end_index = eos_token_indices.view(input_ids.size(0), 2, 2)[:, 0, 1] - start_logits = all_start_logits[example_index] - end_logits = all_end_logits[example_index] - start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() - end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() - potential_answers = [] - for start_index in start_indexes: - for end_index in end_indexes: - if start_index <= question_end_index[0]: - continue - if end_index <= question_end_index[0]: - continue - if start_index > end_index: - continue - answer_len = end_index - start_index + 1 - if answer_len > max_answer_length: - continue - potential_answers.append({'start': start_index, 'end': end_index, - 'start_logit': start_logits[start_index].item(), - 'end_logit': end_logits[end_index].item()}) - sorted_answers = sorted(potential_answers, key=lambda x: (x['start_logit'] + x['end_logit']), reverse=True) - if len(sorted_answers) == 0: - answer = {'text': 'NoAnswerFound', 'score': -1000000} - else: - answer = sorted_answers[0] - answer_token_ids = input_ids[0, answer['start']: answer['end'] + 1] - answer_tokens = 
self.tokenizer.convert_ids_to_tokens(answer_token_ids.tolist()) - text = self.tokenizer.convert_tokens_to_string(answer_tokens) - score = answer['start_logit'] + answer['end_logit'] - answer = {'text': text, 'score': score} - qa_with_duplicates[qid].append({'answer_score': answer['score'], 'answer_text': answer['text'], }) - - qid_to_answer_text = {} - for qid, answer_metrics in qa_with_duplicates.items(): - top_answer = sorted(answer_metrics, key=lambda x: x['answer_score'], reverse=True)[0] - qid_to_answer_text[qid] = top_answer['answer_text'] - - with open('predictions.json', 'w') as f: - f.write(json.dumps(qid_to_answer_text, indent=4) + "\n") diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/__init__.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/convert_to_squad_format.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/convert_to_squad_format.py deleted file mode 100644 index 6279320e045..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/convert_to_squad_format.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . 
import file_utils -from . import dataset_utils -import os -from tqdm import tqdm -import random -import nltk -import argparse - - -def get_text(qad, domain): - local_file = os.path.join(args.web_dir, qad['Filename']) if domain == 'SearchResults' else os.path.join(args.wikipedia_dir, qad['Filename']) - return file_utils.get_file_contents(local_file, encoding='utf-8') - - -def select_relevant_portion(text): - paras = text.split('\n') - selected = [] - done = False - for para in paras: - # nltk is slow, but we have to use its word tokenizer for the distant supervision matching to work - # TODO: try both see which one works better - # words = para.split() - # extra_words = args.max_num_tokens - len(selected) - # selected.extend(words[:extra_words]) - # if len(selected) >= args.max_num_tokens: - # break - sents = sent_tokenize.tokenize(para) - for sent in sents: - words = nltk.word_tokenize(sent) - for word in words: - selected.append(word) - if len(selected) >= args.max_num_tokens: - done = True - break - if done: - break - if done: - break - selected.append('\n') - st = ' '.join(selected).strip() - return st - - -def add_triple_data(datum, page, domain): - qad = {'Source': domain} - for key in ['QuestionId', 'Question', 'Answer']: - if key == 'Answer' and key not in datum: - qad[key] = {'NormalizedAliases': []} - qid = datum['QuestionId'] - print(f'qid: {qid} does not have an answer.') - else: - qad[key] = datum[key] - for key in page: - qad[key] = page[key] - return qad - - -def get_qad_triples(data): - qad_triples = [] - for datum in data['Data']: - for key in ['EntityPages', 'SearchResults']: - for page in datum.get(key, []): - qad = add_triple_data(datum, page, key) - qad_triples.append(qad) - return qad_triples - - -def convert_to_squad_format(qa_json_file, squad_file): - qa_json = dataset_utils.read_triviaqa_data(qa_json_file) - qad_triples = get_qad_triples(qa_json) - random.seed(args.seed) - random.shuffle(qad_triples) - - data = [] - for qad in 
tqdm(qad_triples): - qid = qad['QuestionId'] - - text = get_text(qad, qad['Source']) - selected_text = select_relevant_portion(text) - - question = qad['Question'] - para = {'context': selected_text, 'qas': [{'question': question, 'answers': []}]} - data.append({'paragraphs': [para]}) - qa = para['qas'][0] - qa['id'] = dataset_utils.get_question_doc_string(qid, qad['Filename']) - qa['qid'] = qid - - answers_in_doc = dataset_utils.answer_index_in_document(qad['Answer'], selected_text) - qa['answers'] = answers_in_doc - # We want all answers in the document, not just the first answer - # if index == -1: - # if qa_json['Split'] == 'train': - # continue - # else: - # qa['answers'].append({'text': ans_string, 'answer_start': index}) - - # This doesn't fit the squad format, but we need it for evaluation - qa['aliases'] = qad['Answer']['NormalizedAliases'] - - if qa_json['Split'] == 'train' and len(data) >= args.sample_size and qa_json['Domain'] == 'Web': - break - - if len(data) >= args.sample_size: - break - - squad = {'data': data, 'version': qa_json['Version']} - file_utils.write_json_to_file(squad, squad_file) - print('Added', len(data)) - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--triviaqa_file', help='Triviaqa file') - parser.add_argument('--squad_file', help='Squad file') - parser.add_argument('--wikipedia_dir', help='Wikipedia doc dir') - parser.add_argument('--web_dir', help='Web doc dir') - - parser.add_argument('--seed', default=10, type=int, help='Random seed') - parser.add_argument('--max_num_tokens', default=800, type=int, help='Maximum number of tokens from a document') - parser.add_argument('--sample_size', default=8000000000000, type=int, help='Random seed') - parser.add_argument('--tokenizer', default='tokenizers/punkt/english.pickle', help='Sentence tokenizer') - args = parser.parse_args() - return args - - -if __name__ == '__main__': - args = get_args() - sent_tokenize = nltk.data.load(args.tokenizer) - 
convert_to_squad_format(args.triviaqa_file, args.squad_file) diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/dataset_utils.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/dataset_utils.py deleted file mode 100644 index dd42c6cac2a..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/dataset_utils.py +++ /dev/null @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import file_utils -import re - - -# Key for wikipedia eval is question-id. 
Key for web eval is the (question_id, filename) tuple -def get_key_to_ground_truth(data): - if data['Domain'] == 'Wikipedia': - return {datum['QuestionId']: datum['Answer'] for datum in data['Data']} - else: - return get_qd_to_answer(data) - - -def get_question_doc_string(qid, doc_name): - return '{}--{}'.format(qid, doc_name) - -def get_qd_to_answer(data): - key_to_answer = {} - for datum in data['Data']: - for page in datum.get('EntityPages', []) + datum.get('SearchResults', []): - qd_tuple = get_question_doc_string(datum['QuestionId'], page['Filename']) - key_to_answer[qd_tuple] = datum['Answer'] - return key_to_answer - - -def read_clean_part(datum): - for key in ['EntityPages', 'SearchResults']: - new_page_list = [] - for page in datum.get(key, []): - if page['DocPartOfVerifiedEval']: - new_page_list.append(page) - datum[key] = new_page_list - assert len(datum['EntityPages']) + len(datum['SearchResults']) > 0 - return datum - - -def read_triviaqa_data(qajson): - data = file_utils.read_json(qajson) - # read only documents and questions that are a part of clean data set - if data['VerifiedEval']: - clean_data = [] - for datum in data['Data']: - if datum['QuestionPartOfVerifiedEval']: - if data['Domain'] == 'Web': - datum = read_clean_part(datum) - clean_data.append(datum) - data['Data'] = clean_data - return data - - -def answer_index_in_document(answer, document): - answer_list = answer['NormalizedAliases'] - answers_in_doc = [] - for answer_string_in_doc in answer_list: - indices = [m.start() for m in re.finditer(answer_string_in_doc, document, flags=re.IGNORECASE)] - for index in indices: - answers_in_doc.append({ - 'text': answer_string_in_doc, - 'answer_start': index - }) - return answers_in_doc diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/file_utils.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/file_utils.py deleted file mode 100644 index ad165c545e4..00000000000 --- 
a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils/file_utils.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json - - -def write_json_to_file(json_object, json_file, mode='w', encoding='utf-8'): - with open(json_file, mode, encoding=encoding) as outfile: - json.dump(json_object, outfile, indent=4, sort_keys=True, ensure_ascii=False) - - -def get_file_contents(filename, encoding='utf-8'): - with open(filename, encoding=encoding) as f: - content = f.read() - return content - - -def read_json(filename, encoding='utf-8'): - contents = get_file_contents(filename, encoding=encoding) - return json.loads(contents) - - -def get_file_contents_as_list(file_path, encoding='utf-8', ignore_blanks=True): - contents = get_file_contents(file_path, encoding=encoding) - lines = contents.split('\n') - lines = [line for line in lines if line != ''] if ignore_blanks else lines - return lines diff --git a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils_qa.py b/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils_qa.py deleted file mode 100644 index 53924013612..00000000000 --- a/examples/huggingface/pytorch/question-answering/pruning/longformer_triviaqa/utils_qa.py +++ /dev/null @@ -1,451 +0,0 @@ -# coding=utf-8 - -# Apache v2 license -# Copyright (C) 2021 Intel Corporation -# 
SPDX-License-Identifier: Apache-2.0 - -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Post-processing utilities for question answering. -""" -""" -This script is based on HuggingFace/transformers examples: https://github.com/huggingface/transformers/blob/v4.6.1/examples/pytorch/question-answering/utils_qa.py -""" -import collections -import json -import logging -import os -from typing import Optional, Tuple - -import numpy as np -from tqdm.auto import tqdm -import sys -from collections import Counter -import string -import re -from collections import defaultdict -import torch - - -logger = logging.getLogger(__name__) - - -def postprocess_qa_predictions( - examples, - features, - predictions: Tuple[np.ndarray, np.ndarray], - tokenizer=None, - version_2_with_negative: bool = False, - n_best_size: int = 20, - max_answer_length: int = 30, - null_score_diff_threshold: float = 0.0, - output_dir: Optional[str] = None, - prefix: Optional[str] = None, - is_world_process_zero: bool = True, -): - """ - Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the - original contexts. This is the base postprocessing functions for models that only return start and end logits. - Args: - examples: The non-preprocessed dataset (see the main script for more information). - features: The processed dataset (see the main script for more information). 
- predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): - The predictions of the model: two arrays containing the start logits and the end logits respectively. Its - first dimension must match the number of elements of :obj:`features`. - version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the underlying dataset contains examples with no answers. - n_best_size (:obj:`int`, `optional`, defaults to 20): - The total number of n-best predictions to generate when looking for an answer. - max_answer_length (:obj:`int`, `optional`, defaults to 30): - The maximum length of an answer that can be generated. This is needed because the start and end predictions - are not conditioned on one another. - null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): - The threshold used to select the null answer: if the best answer has a score that is less than the score of - the null answer minus this threshold, the null answer is selected for this example (note that the score of - the null answer for an example giving several features is the minimum of the scores for the null answer on - each feature: all features must be aligned on the fact they `want` to predict a null answer). - Only useful when :obj:`version_2_with_negative` is :obj:`True`. - output_dir (:obj:`str`, `optional`): - If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if - :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null - answers, are saved in `output_dir`. - prefix (:obj:`str`, `optional`): - If provided, the dictionaries mentioned above are saved with `prefix` added to their names. - is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this process is the main process or not (used to determine if logging/saves should be done). 
- """ - assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)." - all_start_logits, all_end_logits = predictions - - assert len(predictions[0]) == len(features), f"Got {len(predictions[0])} predictions and {len(features)} features." - - # Build a map example to its corresponding features. - example_id_to_index = {} - index = 0 - for qid in examples["id"]: - if qid in example_id_to_index: - continue - example_id_to_index[qid] = index - index += 1 - - features_per_example = collections.defaultdict(list) - for i, feature in enumerate(features): - features_per_example[example_id_to_index[feature["example_id"]]].append(i) - - # Logging. - logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN) - logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") - - qa_with_duplicates = defaultdict(list) - - for qid in tqdm(example_id_to_index): - - feature_indices = features_per_example[example_id_to_index[qid]] - - # Looping through all the features associated to the current example. - for feature_index in feature_indices: - potential_answers = [] - # We grab the predictions of the model for this feature. - start_logits = all_start_logits[feature_index] - end_logits = all_end_logits[feature_index] - - input_ids = torch.tensor([features[feature_index]["input_ids"]]) - - # Go through all possibilities for the `n_best_size` greater start and end logits. 
- start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() - end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() - - eos_token_indices = (input_ids == tokenizer.eos_token_id).nonzero() - question_end_index = eos_token_indices.view(input_ids.size(0), 2, 2)[:, 0, 1] - doc_end_index = eos_token_indices.view(input_ids.size(0), 2, 2)[:, 1, 1] - for start_index in start_indexes: - for end_index in end_indexes: - if start_index >= doc_end_index[0]: - continue - if end_index >= doc_end_index[0]: - continue - if start_index <= question_end_index[0]: - continue - if end_index <= question_end_index[0]: - continue - if start_index > end_index: - continue - answer_len = end_index - start_index + 1 - if answer_len > max_answer_length: - continue - potential_answers.append({'start': start_index, 'end': end_index, - 'start_logit': start_logits[start_index].item(), - 'end_logit': end_logits[end_index].item()}) - sorted_answers = sorted(potential_answers, key=lambda x: (x['start_logit'] + x['end_logit']), reverse=True) - - if len(sorted_answers) == 0: - answer = {'text': 'NoAnswerFound', 'score': -1000000} - else: - answer = sorted_answers[0] - answer_token_ids = input_ids[0, answer['start']: answer['end'] + 1] - answer_tokens = tokenizer.convert_ids_to_tokens(answer_token_ids.tolist()) - text = tokenizer.convert_tokens_to_string(answer_tokens) - score = answer['start_logit'] + answer['end_logit'] - answer = {'text': text, 'score': score} - - qa_with_duplicates[qid].append({'answer_score': answer['score'], 'answer_text': answer['text'], }) - - qid_to_answer_text = {} - for qid, answer_metrics in qa_with_duplicates.items(): - top_answer = sorted(answer_metrics, key=lambda x: x['answer_score'], reverse=True)[0] - qid_to_answer_text[qid] = top_answer['answer_text'] - - - # If we have an output_dir, let's save all those dicts. - if output_dir is not None: - assert os.path.isdir(output_dir), f"{output_dir} is not a directory." 
- - prediction_file = os.path.join( - output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" - ) - logger.info(f"Saving predictions to {prediction_file}.") - with open(prediction_file, "w") as writer: - writer.write(json.dumps(qid_to_answer_text, indent=4) + "\n") - return qid_to_answer_text - - -def postprocess_qa_predictions_with_beam_search( - examples, - features, - predictions: Tuple[np.ndarray, np.ndarray], - version_2_with_negative: bool = False, - n_best_size: int = 20, - max_answer_length: int = 30, - start_n_top: int = 5, - end_n_top: int = 5, - output_dir: Optional[str] = None, - prefix: Optional[str] = None, - is_world_process_zero: bool = True, -): - """ - Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the - original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as - cls token predictions. - Args: - examples: The non-preprocessed dataset (see the main script for more information). - features: The processed dataset (see the main script for more information). - predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): - The predictions of the model: two arrays containing the start logits and the end logits respectively. Its - first dimension must match the number of elements of :obj:`features`. - version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the underlying dataset contains examples with no answers. - n_best_size (:obj:`int`, `optional`, defaults to 20): - The total number of n-best predictions to generate when looking for an answer. - max_answer_length (:obj:`int`, `optional`, defaults to 30): - The maximum length of an answer that can be generated. This is needed because the start and end predictions - are not conditioned on one another. 
- start_n_top (:obj:`int`, `optional`, defaults to 5): - The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. - end_n_top (:obj:`int`, `optional`, defaults to 5): - The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. - output_dir (:obj:`str`, `optional`): - If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if - :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null - answers, are saved in `output_dir`. - prefix (:obj:`str`, `optional`): - If provided, the dictionaries mentioned above are saved with `prefix` added to their names. - is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this process is the main process or not (used to determine if logging/saves should be done). - """ - assert len(predictions) == 5, "`predictions` should be a tuple with five elements." - start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions - - assert len(predictions[0]) == len( - features - ), f"Got {len(predictions[0])} predicitions and {len(features)} features." - - # Build a map example to its corresponding features. - example_id_to_index = {k: i for i, k in enumerate(examples["id"])} - features_per_example = collections.defaultdict(list) - for i, feature in enumerate(features): - features_per_example[example_id_to_index[feature["example_id"]]].append(i) - - # The dictionaries we have to fill. - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() if version_2_with_negative else None - - # Logging. - logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN) - logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") - - # Let's loop over all the examples! 
- for example_index, example in enumerate(tqdm(examples)): - # Those are the indices of the features associated to the current example. - feature_indices = features_per_example[example_index] - - min_null_score = None - prelim_predictions = [] - - # Looping through all the features associated to the current example. - for feature_index in feature_indices: - # We grab the predictions of the model for this feature. - start_log_prob = start_top_log_probs[feature_index] - start_indexes = start_top_index[feature_index] - end_log_prob = end_top_log_probs[feature_index] - end_indexes = end_top_index[feature_index] - feature_null_score = cls_logits[feature_index] - # This is what will allow us to map some the positions in our logits to span of texts in the original - # context. - offset_mapping = features[feature_index]["offset_mapping"] - # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context - # available in the current feature. - token_is_max_context = features[feature_index].get("token_is_max_context", None) - - # Update minimum null prediction - if min_null_score is None or feature_null_score < min_null_score: - min_null_score = feature_null_score - - # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. - for i in range(start_n_top): - for j in range(end_n_top): - start_index = int(start_indexes[i]) - j_index = i * end_n_top + j - end_index = int(end_indexes[j_index]) - # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the - # p_mask but let's not take any risk) - if ( - start_index >= len(offset_mapping) - or end_index >= len(offset_mapping) - or offset_mapping[start_index] is None - or offset_mapping[end_index] is None - ): - continue - # Don't consider answers with a length negative or > max_answer_length. 
- if end_index < start_index or end_index - start_index + 1 > max_answer_length: - continue - # Don't consider answer that don't have the maximum context available (if such information is - # provided). - if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): - continue - prelim_predictions.append( - { - "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), - "score": start_log_prob[i] + end_log_prob[j_index], - "start_log_prob": start_log_prob[i], - "end_log_prob": end_log_prob[j_index], - } - ) - - # Only keep the best `n_best_size` predictions. - predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] - - # Use the offsets to gather the answer text in the original context. - context = example["context"] - for pred in predictions: - offsets = pred.pop("offsets") - pred["text"] = context[offsets[0] : offsets[1]] - - # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid - # failure. - if len(predictions) == 0: - predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) - - # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using - # the LogSumExp trick). - scores = np.array([pred.pop("score") for pred in predictions]) - exp_scores = np.exp(scores - np.max(scores)) - probs = exp_scores / exp_scores.sum() - - # Include the probabilities in our predictions. - for prob, pred in zip(probs, predictions): - pred["probability"] = prob - - # Pick the best prediction and set the probability for the null answer. - all_predictions[example["id"]] = predictions[0]["text"] - if version_2_with_negative: - scores_diff_json[example["id"]] = float(min_null_score) - - # Make `predictions` JSON-serializable by casting np.float back to float. 
- all_nbest_json[example["id"]] = [ - {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} - for pred in predictions - ] - - # If we have an output_dir, let's save all those dicts. - if output_dir is not None: - assert os.path.isdir(output_dir), f"{output_dir} is not a directory." - - prediction_file = os.path.join( - output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" - ) - nbest_file = os.path.join( - output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" - ) - if version_2_with_negative: - null_odds_file = os.path.join( - output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" - ) - - print(f"Saving predictions to {prediction_file}.") - with open(prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - print(f"Saving nbest_preds to {nbest_file}.") - with open(nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - if version_2_with_negative: - print(f"Saving null_odds to {null_odds_file}.") - with open(null_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions, scores_diff_json - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) - - def white_space_fix(text): - return ' '.join(text.split()) - - def handle_punc(text): - exclude = set(string.punctuation + "".join([u"‘", u"’", u"´", u"`"])) - return ''.join(ch if ch not in exclude else ' ' for ch in text) - - def lower(text): - return text.lower() - - def replace_underscore(text): - return text.replace('_', ' ') - - return white_space_fix(remove_articles(handle_punc(lower(replace_underscore(s))))).strip() - -def f1_score(prediction, ground_truth): - prediction_tokens = normalize_answer(prediction).split() - 
ground_truth_tokens = normalize_answer(ground_truth).split() - common = Counter(prediction_tokens) & Counter(ground_truth_tokens) - num_same = sum(common.values()) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(prediction_tokens) - recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - -def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): - scores_for_ground_truths = [] - for ground_truth in ground_truths: - score = metric_fn(prediction, ground_truth) - scores_for_ground_truths.append(score) - return max(scores_for_ground_truths) - - -def is_exact_match(answer_object, prediction): - ground_truths = get_ground_truths(answer_object) - for ground_truth in ground_truths: - if exact_match_score(prediction, ground_truth): - return True - return False - - -def has_exact_match(ground_truths, candidates): - for ground_truth in ground_truths: - if ground_truth in candidates: - return True - return False - -def exact_match_score(prediction, ground_truth): - return int(normalize_answer(prediction) == normalize_answer(ground_truth)) - -def evaluate_triviaqa(references, predictions): - f1 = exact_match = common = total = 0 - for qa in references: - total += 1 - if qa["id"] not in predictions: - message = "Unanswered question " + qa["id"] + " will receive score 0." 
- print(message, file=sys.stderr) - continue - common += 1 - prediction = predictions[qa["id"]] - ground_truths = qa["answers"]["text"] + qa["aliases"] - em_for_this_question = metric_max_over_ground_truths( - exact_match_score, prediction, ground_truths) - - exact_match += em_for_this_question - - f1_for_this_question = metric_max_over_ground_truths( - f1_score, prediction, ground_truths) - f1 += f1_for_this_question - exact_match = 100.0 * exact_match / total - f1 = 100.0 * f1 / total - - return {"exact_match": exact_match, "f1": f1} diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/README.md b/examples/huggingface/pytorch/question-answering/pruning/magnitude/README.md similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/README.md rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/README.md diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/requirements.txt b/examples/huggingface/pytorch/question-answering/pruning/magnitude/requirements.txt similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/requirements.txt rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/requirements.txt diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_benchmark.sh b/examples/huggingface/pytorch/question-answering/pruning/magnitude/run_benchmark.sh similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_benchmark.sh rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/run_benchmark.sh diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_qa.py b/examples/huggingface/pytorch/question-answering/pruning/magnitude/run_qa.py similarity index 97% rename from 
examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_qa.py rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/run_qa.py index 35f572aa50c..7bc9b835440 100644 --- a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_qa.py +++ b/examples/huggingface/pytorch/question-answering/pruning/magnitude/run_qa.py @@ -26,7 +26,8 @@ import transformers from dataclasses import dataclass, field from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import metrics, OptimizedModel, PrunerConfig, PruningConfig, PruningMode +from intel_extension_for_transformers.transformers import metrics, OptimizedModel +from neural_compressor.config import WeightPruningConfig from trainer_qa import QuestionAnsweringTrainer from transformers import ( AutoConfig, @@ -210,8 +211,8 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply prune."}, ) pruning_approach: Optional[str] = field( - default="BasicMagnitude", - metadata={"help": "Pruning approach. Supported approach is basic_magnite."}, + default="magnitude", + metadata={"help": "Pruning approach. 
Supported approach is magnite."}, ) target_sparsity_ratio: Optional[float] = field( default=None, @@ -631,12 +632,15 @@ def compute_metrics(p: EvalPrediction): raise ValueError("do_train must be set to True for pruning.") tune_metric = metrics.Metric(name=metric_name) - prune_type = 'BasicMagnitude' if optim_args.pruning_approach else optim_args.pruning_approach + prune_type = optim_args.pruning_approach \ + if optim_args.pruning_approach else 'pattern_lock' target_sparsity_ratio = optim_args.target_sparsity_ratio \ if optim_args.target_sparsity_ratio else None - pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig(pruner_config=pruner_config, metrics=tune_metric) - + trainer.metrics = tune_metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=target_sparsity_ratio, + pruning_scope="local", + pruning_type=prune_type) model = trainer.prune(pruning_config=pruning_conf) trainer.save_model(training_args.output_dir) @@ -653,7 +657,7 @@ def compute_metrics(p: EvalPrediction): max_eval_samples = data_args.max_eval_samples \ if data_args.max_eval_samples is not None else len(eval_dataset) eval_samples = min(max_eval_samples, len(eval_dataset)) - samples = eval_samples - (eval_samples % batch_size) \ + samples = eval_samples - (eval_samples % optim_args.batch_size) \ if training_args.dataloader_drop_last else eval_samples logger.info("metrics keys: {}".format(results.keys())) bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation', diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_tuning.sh b/examples/huggingface/pytorch/question-answering/pruning/magnitude/run_tuning.sh similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/run_tuning.sh rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/run_tuning.sh diff --git 
a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/trainer_qa.py b/examples/huggingface/pytorch/question-answering/pruning/magnitude/trainer_qa.py similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/trainer_qa.py rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/trainer_qa.py diff --git a/examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/utils_qa.py b/examples/huggingface/pytorch/question-answering/pruning/magnitude/utils_qa.py similarity index 100% rename from examples/huggingface/pytorch/question-answering/pruning/basic_magnitude/utils_qa.py rename to examples/huggingface/pytorch/question-answering/pruning/magnitude/utils_qa.py diff --git a/examples/huggingface/pytorch/question-answering/quantization/README.md b/examples/huggingface/pytorch/question-answering/quantization/README.md index 0e7132666ff..4d1707dc6f7 100644 --- a/examples/huggingface/pytorch/question-answering/quantization/README.md +++ b/examples/huggingface/pytorch/question-answering/quantization/README.md @@ -1,6 +1,6 @@ Step-by-Step​ ============ -The script `run_qa.py` provides three quantization approaches (PostTrainingStatic, PostTrainingStatic and QuantizationAwareTraining) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). +The script `run_qa.py` provides three quantization approaches (dynamic, static and qat) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). # Prerequisite​ ## 1. Create Environment​ @@ -8,9 +8,8 @@ Recommended python 3.9 or higher version. ```shell pip install intel-extension-for-transformers pip install -r requirements.txt -pip install transformers==4.34.1 + ``` ->**Note**: Please use transformers no higher than 4.34.1 # Run ## 1. 
Quantization @@ -22,7 +21,7 @@ python run_qa.py \ --model_name_or_path distilbert-base-uncased-distilled-squad \ --dataset_name squad \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --do_train \ --do_eval \ --output_dir ./tmp/squad_output \ @@ -36,7 +35,7 @@ python run_qa.py \ --model_name_or_path bert-large-uncased-whole-word-masking-finetuned-squad \ --dataset_name squad \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --do_train \ --do_eval \ --output_dir ./tmp/squad_output \ @@ -65,7 +64,7 @@ python -m torch.distributed.launch --master_addr= --nproc_per_no --model_name_or_path bert-large-uncased-whole-word-masking-finetuned-squad \ --dataset_name squad \ --tune \ - --quantization_approach QuantizationAwareTraining \ + --quantization_approach qat \ --do_train \ --do_eval \ --output_dir ./tmp/squad_output \ @@ -75,7 +74,7 @@ python -m torch.distributed.launch --master_addr= --nproc_per_no ## 3. Validated Model List ### Stock PyTorch Validated model list -|Dataset|Pretrained model|PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining +|Dataset|Pretrained model|dynamic | static | qat |---|------------------------------------|---|---|--- |squad|distilbert-base-uncased-distilled-squad| ✅| ✅| ✅ |squad|valhalla/longformer-base-4096-finetuned-squadv1| ✅| ✅| N/A diff --git a/examples/huggingface/pytorch/question-answering/quantization/run_qa.py b/examples/huggingface/pytorch/question-answering/quantization/run_qa.py index ac4b0237bf9..6f7bebfaa76 100644 --- a/examples/huggingface/pytorch/question-answering/quantization/run_qa.py +++ b/examples/huggingface/pytorch/question-answering/quantization/run_qa.py @@ -26,7 +26,13 @@ import transformers from dataclasses import dataclass, field from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import metrics , OptimizedModel, QuantizationConfig +from 
intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from trainer_qa import QuestionAnsweringTrainer from transformers import ( AutoConfig, @@ -215,13 +221,13 @@ class OptimizationArguments: metadata={"help": "Tuning strategy. Supported strategies are basic, bayesian, mse, mse_v2."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. Supported approach are static, " + "dynamic and qat."}, ) framework: Optional[str] = field( - default="pytorch", - metadata={"help": "Deep learning framework. Supported framework are pytorch, ipex"}, + default="default", + metadata={"help": "Deep learning framework. Supported framework are default, ipex"}, ) metric_name: Optional[str] = field( default="eval_f1", @@ -320,10 +326,12 @@ def main(): # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. + if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, + trust_remote_code=True ) else: data_files = {} @@ -337,7 +345,8 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir, + trust_remote_code=True) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. @@ -627,7 +636,7 @@ def post_processing_function(examples, features, predictions, stage="eval"): references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) - metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") + metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad", trust_remote_code=True) def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) @@ -655,31 +664,46 @@ def compute_metrics(p: EvalPrediction): if not training_args.do_eval: raise ValueError("do_eval must be set to True for quantization.") - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." 
) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=200, - metrics=[tune_metric], - sampling_size = len(train_dataset)//20 - ) - if optim_args.strategy == "mse_v2": - quantization_config.strategy = "mse_v2" - if optim_args.framework == "ipex": - quantization_config.framework = "pytorch_ipex" - trainer.calib_dataloader = calib_dataloader + trainer.metrics = tune_metric + if optim_args.quantization_approach != "qat": + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. + ) + quantization_config = PostTrainingQuantConfig( + backend = optim_args.framework, + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + if optim_args.framework == "ipex": + trainer.calib_dataloader = calib_dataloader + else: + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
+ ) + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) + model = trainer.quantize(quant_config=quantization_config) if optim_args.benchmark_only: diff --git a/examples/huggingface/pytorch/question-answering/quantization/run_tuning.sh b/examples/huggingface/pytorch/question-answering/quantization/run_tuning.sh index 02dc6a88075..d4d1d24fe36 100644 --- a/examples/huggingface/pytorch/question-answering/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/question-answering/quantization/run_tuning.sh @@ -17,7 +17,7 @@ function init_params { extra_cmd="" batch_size=8 MAX_SEQ_LENGTH=384 - approach="PostTrainingStatic" + approach="static" for var in "$@" do case $var in @@ -52,15 +52,15 @@ function run_tuning { if [ "${topology}" = "distilbert_base_squad_static" ]; then DATASET_NAME="squad" model_name_or_path="distilbert-base-uncased-distilled-squad" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "distilbert_base_squad_dynamic" ]; then DATASET_NAME="squad" model_name_or_path="distilbert-base-uncased-distilled-squad" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "distilbert_base_squad_qat" ]; then DATASET_NAME="squad" model_name_or_path="distilbert-base-uncased-distilled-squad" - approach="QuantizationAwareTraining" + approach="qat" extra_cmd=$extra_cmd" --learning_rate 1e-5 \ --num_train_epochs 6 \ --eval_steps 100 \ @@ -69,35 +69,36 @@ function run_tuning { --load_best_model_at_end True \ --evaluation_strategy steps \ --save_strategy steps \ - --save_total_limit 1" + --save_total_limit 1 \ + --save_safetensors False" elif [ "${topology}" = "bert_large_SQuAD_static" ]; then DATASET_NAME="squad" 
model_name_or_path="bert-large-uncased-whole-word-masking-finetuned-squad" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "roberta_base_SQuAD2_static" ]; then DATASET_NAME="squad" model_name_or_path="deepset/roberta-base-squad2" - approach="PostTrainingStatic" + approach="static" # extra_cmd=$extra_cmd" --version_2_with_negative" elif [ "${topology}" = "longformer_base_squad_static" ]; then DATASET_NAME="squad" model_name_or_path="valhalla/longformer-base-4096-finetuned-squadv1" - approach="PostTrainingStatic" + approach="static" extra_cmd=$extra_cmd" --strategy mse_v2" elif [ "${topology}" = "longformer_base_squad_dynamic" ]; then DATASET_NAME="squad" model_name_or_path="valhalla/longformer-base-4096-finetuned-squadv1" - approach="PostTrainingDynamic" + approach="dynamic" extra_cmd=$extra_cmd" --strategy mse_v2" elif [ "${topology}" = "distilbert_base_squad_ipex" ]; then DATASET_NAME="squad" model_name_or_path="distilbert-base-uncased-distilled-squad" extra_cmd=$extra_cmd" --perf_tol 0.02" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_large_squad_ipex" ]; then DATASET_NAME="squad" model_name_or_path="bert-large-uncased-whole-word-masking-finetuned-squad" - approach="PostTrainingStatic" + approach="static" fi python -u ./run_qa.py \ diff --git a/examples/huggingface/pytorch/summarization/quantization/README.md b/examples/huggingface/pytorch/summarization/quantization/README.md index c48c2a83704..f8f64520f8a 100644 --- a/examples/huggingface/pytorch/summarization/quantization/README.md +++ b/examples/huggingface/pytorch/summarization/quantization/README.md @@ -49,18 +49,16 @@ python examples/pytorch/summarization/run_summarization.py \ --dataset_name samsum \ --do_train \ --do_eval \ - --train_file path_to_csv_or_jsonlines_file \ - --validation_file path_to_csv_or_jsonlines_file \ --output_dir /tmp/tst-summarization \ --overwrite_output_dir \ - --per_device_train_batch_size=8 \ - 
--per_device_eval_batch_size=8 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ --tune \ --predict_with_generate \ --perf_tol 0.03 ``` ### 2. Validated Model List -|Dataset|Pretrained model|PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining +|Dataset|Pretrained model|dynamic | static | qat |---|------------------------------------|---|---|--- |samsum|pegasus_samsum| ✅| N/A | N/A |cnn_dailymail|t5_base_cnn| ✅| N/A | N/A diff --git a/examples/huggingface/pytorch/summarization/quantization/run_benchmark.sh b/examples/huggingface/pytorch/summarization/quantization/run_benchmark.sh index 4fe01c952dd..d3b375060aa 100644 --- a/examples/huggingface/pytorch/summarization/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/summarization/quantization/run_benchmark.sh @@ -78,11 +78,11 @@ function run_benchmark { elif [ "${topology}" == "flan_t5_large_samsum_dynamic" ]; then DATASET_NAME="samsum" model_name_or_path="stacked-summaries/flan-t5-large-stacked-samsum-1024" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" == "flan_t5_large_samsum_static" ]; then DATASET_NAME="samsum" model_name_or_path="stacked-summaries/flan-t5-large-stacked-samsum-1024" - approach="PostTrainingStatic" + approach="static" else echo "unsupported topology: ${topology}" exit 1 diff --git a/examples/huggingface/pytorch/summarization/quantization/run_summarization.py b/examples/huggingface/pytorch/summarization/quantization/run_summarization.py index 9d9cc794d38..3844e45173f 100755 --- a/examples/huggingface/pytorch/summarization/quantization/run_summarization.py +++ b/examples/huggingface/pytorch/summarization/quantization/run_summarization.py @@ -30,8 +30,13 @@ from datasets import load_dataset, load_metric from filelock import FileLock -from intel_extension_for_transformers.transformers import OptimizedModel, QuantizationConfig -from intel_extension_for_transformers.transformers import metrics as nlp_metrics +from 
intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPSeq2SeqTrainer from transformers import ( AutoConfig, @@ -267,9 +272,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_rougeLsum", @@ -700,26 +705,42 @@ def compute_metrics(eval_preds): trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." 
) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - - tune_metric = nlp_metrics.Metric( + tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=200, - metrics=[tune_metric], - sampling_size = len(train_dataset)//20 - ) + trainer.metrics = tune_metric + if optim_args.quantization_approach != "qat": + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. + ) + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
+ ) + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) trainer.max_length = max_length trainer.num_beams = num_beams model = trainer.quantize(quant_config=quantization_config) diff --git a/examples/huggingface/pytorch/summarization/quantization/run_tuning.sh b/examples/huggingface/pytorch/summarization/quantization/run_tuning.sh index c42d6045c89..e9f714d2bfb 100644 --- a/examples/huggingface/pytorch/summarization/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/summarization/quantization/run_tuning.sh @@ -15,7 +15,7 @@ function init_params { DATASET_NAME="xsum" extra_cmd="" batch_size=8 - approach="PostTrainingStatic" + approach="static" for var in "$@" do case $var in @@ -45,24 +45,24 @@ function run_tuning { if [ "${topology}" == "pegasus_samsum_dynamic" ]; then DATASET_NAME="samsum" model_name_or_path="lvwerra/pegasus-samsum" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" == "t5_base_cnn_dynamic" ]; then DATASET_NAME="cnn_dailymail" model_name_or_path="flax-community/t5-base-cnn-dm" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" == "t5_large_cnn_dynamic" ]; then DATASET_NAME="cnn_dailymail" model_name_or_path="sysresearch101/t5-large-finetuned-xsum-cnn" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" == "flan_t5_large_samsum_dynamic" ]; then DATASET_NAME="samsum" model_name_or_path="stacked-summaries/flan-t5-large-stacked-samsum-1024" - approach="PostTrainingDynamic" + approach="dynamic" extra_cmd=$extra_cmd" --perf_tol 0.03" elif [ "${topology}" == "flan_t5_large_samsum_static" ]; then DATASET_NAME="samsum" model_name_or_path="stacked-summaries/flan-t5-large-stacked-samsum-1024" - 
approach="PostTrainingStatic" + approach="static" extra_cmd=$extra_cmd" --perf_tol 0.03" else echo "unsupported topology: ${topology}" diff --git a/examples/huggingface/pytorch/text-classification/distillation_for_quantization/run_glue.py b/examples/huggingface/pytorch/text-classification/distillation_for_quantization/run_glue.py index 831ecb734ea..2524d5e1ced 100644 --- a/examples/huggingface/pytorch/text-classification/distillation_for_quantization/run_glue.py +++ b/examples/huggingface/pytorch/text-classification/distillation_for_quantization/run_glue.py @@ -31,15 +31,15 @@ from datasets import load_dataset, load_metric from intel_extension_for_transformers.transformers import ( metrics, - PrunerConfig, - PruningConfig, - DistillationConfig, - QuantizationConfig, OptimizedModel, objectives ) +from neural_compressor.config import ( + DistillationConfig, + IntermediateLayersKnowledgeDistillationLossConfig, + QuantizationAwareTrainingConfig, +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer -from intel_extension_for_transformers.transformers.distillation import Criterion from torch.utils.data import DataLoader from tqdm.auto import tqdm from transformers import ( @@ -543,29 +543,21 @@ def compute_metrics(p: EvalPrediction): tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) + trainer.metrics = tune_metric + layer_mappings = [[[f"bert.encoder.layer.{i}", "0"]] for i in range(12)] +\ [[[f"bert.encoder.layer.{i}.attention", "1"]] for i in range(12)] +\ [[["classifier"]]] - distillation_conf = DistillationConfig( - framework="pytorch_fx", metrics=tune_metric, - criterion=Criterion( - name="IntermediateLayersLoss", + criterion_conf = IntermediateLayersKnowledgeDistillationLossConfig( layer_mappings=layer_mappings, loss_types=["MSE"] * len(layer_mappings), loss_weight_ratio=[1.0 / len(layer_mappings)] * len(layer_mappings), add_origin_loss=True - ) - ) - - objective = 
objectives.performance - quantization_conf = QuantizationConfig( - approach="QuantizationAwareTraining", - max_trials=600, - metrics=[tune_metric], - objectives=[objective] ) + distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=criterion_conf) + quantization_conf = QuantizationAwareTrainingConfig() conf_list = [distillation_conf, quantization_conf] - model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model) + model = trainer.orchestrate_optimizations(config_list=conf_list) if optim_args.benchmark or optim_args.accuracy_only: # Load the model obtained after Intel Neural Compressor (INC) quantization diff --git a/examples/huggingface/pytorch/text-classification/orchestrate_optimizations/README.md b/examples/huggingface/pytorch/text-classification/orchestrate_optimizations/README.md index afd199d00c4..461e84d3dd7 100644 --- a/examples/huggingface/pytorch/text-classification/orchestrate_optimizations/README.md +++ b/examples/huggingface/pytorch/text-classification/orchestrate_optimizations/README.md @@ -17,7 +17,7 @@ python run_glue.py \ --model_name_or_path Intel/distilbert-base-uncased-sparse-90-unstructured-pruneofa \ --teacher_model distilbert-base-uncased-finetuned-sst-2-english \ --task_name sst2 \ - --quantization_approach QuantizationAwareTraining \ + --quantization_approach qat \ --do_train \ --do_eval \ --orchestrate_optimization \ @@ -37,7 +37,7 @@ python -m torch.distributed.launch --master_addr= --nproc_per_no --model_name_or_path Intel/distilbert-base-uncased-sparse-90-unstructured-pruneofa \ --teacher_model distilbert-base-uncased-finetuned-sst-2-english \ --task_name sst2 \ - --quantization_approach QuantizationAwareTraining \ + --quantization_approach qat \ --do_train \ --do_eval \ --orchestrate_optimization \ diff --git a/examples/huggingface/pytorch/text-classification/orchestrate_optimizations/run_glue.py 
b/examples/huggingface/pytorch/text-classification/orchestrate_optimizations/run_glue.py index 26f3cd535b7..89c0f104064 100644 --- a/examples/huggingface/pytorch/text-classification/orchestrate_optimizations/run_glue.py +++ b/examples/huggingface/pytorch/text-classification/orchestrate_optimizations/run_glue.py @@ -30,13 +30,15 @@ from datasets import load_dataset, load_metric from intel_extension_for_transformers.transformers import ( metrics, - PrunerConfig, - PruningConfig, - DistillationConfig, - QuantizationConfig, OptimizedModel, objectives ) +from neural_compressor.config import ( + WeightPruningConfig, + DistillationConfig, + KnowledgeDistillationLossConfig, + QuantizationAwareTrainingConfig, +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from torch.utils.data import DataLoader from tqdm.auto import tqdm @@ -211,7 +213,7 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply prune."}, ) pruning_approach: Optional[str] = field( - default="BasicMagnitude", + default="magnitude", metadata={"help": "Pruning approach. Supported approach is basic_magnite."}, ) target_sparsity_ratio: Optional[float] = field( @@ -231,9 +233,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. 
Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default=None, @@ -637,10 +639,6 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset): logger.info("***** Number of student model parameters: {:.2f}M *****".format(\ para_counter(model)/10**6)) - # Trace model - from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace - model = symbolic_trace(model, optim_args.quantization_approach=="QuantizationAwareTraining") - # Initialize our Trainer trainer = NLPTrainer( model=model, @@ -673,23 +671,20 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset): tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - prune_type = 'PatternLock' \ + prune_type = 'pattern_lock' \ if optim_args.pruning_approach else optim_args.pruning_approach target_sparsity_ratio = optim_args.target_sparsity_ratio \ if optim_args.target_sparsity_ratio else None - pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig(framework="pytorch_fx",pruner_config=[pruner_config], metrics=tune_metric) - distillation_conf = DistillationConfig(framework="pytorch_fx", metrics=tune_metric) - - objective = objectives.performance - quantization_conf = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective] - ) + trainer.metrics = tune_metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=target_sparsity_ratio, + pruning_scope="local", + pruning_type=prune_type) + distillation_criterion = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"]) + distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion) + quantization_conf = QuantizationAwareTrainingConfig() conf_list = [pruning_conf, distillation_conf, quantization_conf] - 
model = trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=teacher_model) + model = trainer.orchestrate_optimizations(config_list=conf_list) if optim_args.benchmark or optim_args.accuracy_only: # Load the model obtained after Intel Neural Compressor (INC) quantization diff --git a/examples/huggingface/pytorch/text-classification/pruning/run_glue.py b/examples/huggingface/pytorch/text-classification/pruning/run_glue.py index 9b0f5edd087..bee4f20a070 100644 --- a/examples/huggingface/pytorch/text-classification/pruning/run_glue.py +++ b/examples/huggingface/pytorch/text-classification/pruning/run_glue.py @@ -29,9 +29,8 @@ from intel_extension_for_transformers.transformers import ( metrics, OptimizedModel, - PrunerConfig, - PruningConfig, ) +from neural_compressor.config import WeightPruningConfig from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( AutoConfig, @@ -47,7 +46,6 @@ ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version -from transformers.utils.fx import symbolic_trace from typing import Optional @@ -204,8 +202,8 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply prune."}, ) pruning_approach: Optional[str] = field( - default="BasicMagnitude", - metadata={"help": "Pruning approach. Supported approach is basic_magnite."}, + default="magnitude", + metadata={"help": "Pruning approach. 
Supported approach is magnite."}, ) target_sparsity_ratio: Optional[float] = field( default=None, @@ -521,13 +519,15 @@ def compute_metrics(p: EvalPrediction): raise ValueError("do_train must be set to True for pruning.") tune_metric = metrics.Metric(name=metric_name) - prune_type = 'BasicMagnitude' \ - if optim_args.pruning_approach else optim_args.pruning_approach + prune_type = optim_args.pruning_approach \ + if optim_args.pruning_approach else 'pattern_lock' target_sparsity_ratio = optim_args.target_sparsity_ratio \ if optim_args.target_sparsity_ratio else None - pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig(pruner_config=pruner_config, metrics=tune_metric) - + trainer.metrics = tune_metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=target_sparsity_ratio, + pruning_scope="local", + pruning_type=prune_type) model = trainer.prune(pruning_config=pruning_conf) trainer.save_model(training_args.output_dir) diff --git a/examples/huggingface/pytorch/text-classification/quantization/README.md b/examples/huggingface/pytorch/text-classification/quantization/README.md index d864de0263e..c851b4c165b 100644 --- a/examples/huggingface/pytorch/text-classification/quantization/README.md +++ b/examples/huggingface/pytorch/text-classification/quantization/README.md @@ -1,6 +1,6 @@ Step-by-Step​ ============ -The script `run_glue.py` provides three quantization approaches (PostTrainingStatic, PostTrainingStatic and QuantizationAwareTraining) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). +The script `run_glue.py` provides three quantization approaches (dynamic, static and qat) based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). # Prerequisite​ ## 1. Create Environment​ @@ -8,9 +8,7 @@ Recommend python 3.9 or higher version. 
```shell pip install intel-extension-for-transformers pip install -r requirements.txt -pip install transformers==4.34.1 ``` ->**Note**: Please use transformers no higher than 4.34.1 # Run @@ -23,7 +21,7 @@ python run_glue.py \ --model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \ --task_name sst2 \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --do_train \ --do_eval \ --output_dir ./saved_result \ @@ -43,7 +41,7 @@ python run_glue.py \ ``` **Notes**: - - Choice of `quantization_approach` can be `PostTrainingDynamic`, `PostTrainingStatic`, and `QuantizationAwareTraining`. + - Choice of `quantization_approach` can be `dynamic`, `static`, and `qat`. - Choice of `task_name` can be `cola`, `sst2`, `mrpc`, `stsb`, `qqp`, `mnli`, `qnli`, `rte`, and `wnli`. ## 2. Distributed Data Parallel Support @@ -66,16 +64,17 @@ python -m torch.distributed.launch --master_addr= --nproc_per_no --model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \ --task_name sst2 \ --tune \ - --quantization_approach QuantizationAwareTraining \ + --quantization_approach qat \ --do_train \ --do_eval \ --output_dir ./saved_result \ - --overwrite_output_dir + --overwrite_output_dir \ + --save_safetensors False ``` ## 3. 
Validated model list -|Task|Pretrained model|PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining +|Task|Pretrained model| dynamic| static | qat |---|------------------------------------|---|---|--- |MRPC|textattack/bert-base-uncased-MRPC| ✅| ✅| ✅ |MRPC|textattack/albert-base-v2-MRPC| ✅| ✅| N/A @@ -107,7 +106,7 @@ python -m torch.distributed.launch --master_addr= --nproc_per_no bash run_benchmark.sh --topology=[topology] --config=./saved_int8 --mode=benchmark --int8=true ``` -### QuantizationAwareTraining +### qat - Topology: - BERT-MRPC: bert_base_mrpc diff --git a/examples/huggingface/pytorch/text-classification/quantization/ptq/run_tuning.sh b/examples/huggingface/pytorch/text-classification/quantization/ptq/run_tuning.sh index cc6a24cf325..6719dc50825 100755 --- a/examples/huggingface/pytorch/text-classification/quantization/ptq/run_tuning.sh +++ b/examples/huggingface/pytorch/text-classification/quantization/ptq/run_tuning.sh @@ -16,7 +16,7 @@ function init_params { batch_size=8 MAX_SEQ_LENGTH=128 model_type="bert" - approach="PostTrainingStatic" + approach="static" script="../run_glue.py" for var in "$@" do @@ -46,88 +46,88 @@ function init_params { function run_tuning { if [ "${topology}" = "bert_base_mrpc_static" ]; then model_name_or_path="textattack/bert-base-uncased-MRPC" - approach="PostTrainingStatic" + approach="static" extra_cmd=$extra_cmd" --task_name mrpc" elif [ "${topology}" = "bert_base_mrpc_dynamic" ]; then extra_cmd=$extra_cmd" --task_name mrpc" model_name_or_path="textattack/bert-base-uncased-MRPC" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "bert_base_SST-2_static" ]; then extra_cmd=$extra_cmd" --task_name sst2" model_name_or_path="echarlaix/bert-base-uncased-sst2-acc91.1-d37-hybrid" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_base_SST-2_dynamic" ]; then extra_cmd=$extra_cmd" --task_name sst2" 
model_name_or_path="echarlaix/bert-base-uncased-sst2-acc91.1-d37-hybrid" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "bert_base_CoLA_static" ]; then extra_cmd=$extra_cmd" --task_name cola" model_name_or_path="textattack/bert-base-uncased-CoLA" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_base_STS-B_static" ]; then extra_cmd=$extra_cmd" --task_name stsb" model_name_or_path="Contrastive-Tension/BERT-Base-CT-STSb" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_base_RTE_static" ]; then extra_cmd=$extra_cmd" --task_name rte" model_name_or_path="textattack/bert-base-uncased-RTE" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_large_RTE_static" ]; then extra_cmd=$extra_cmd" --task_name rte" model_name_or_path="yoshitomo-matsubara/bert-large-uncased-rte" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_large_CoLA_static" ]; then extra_cmd=$extra_cmd" --task_name cola" model_name_or_path="yoshitomo-matsubara/bert-large-uncased-cola" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_large_MRPC_static" ]; then extra_cmd=$extra_cmd" --task_name mrpc" model_name_or_path="yoshitomo-matsubara/bert-large-uncased-mrpc" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "bert_large_QNLI_static" ]; then extra_cmd=$extra_cmd" --task_name qnli" model_name_or_path="yoshitomo-matsubara/bert-large-uncased-qnli" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "camembert_base_XNLI_dynamic" ]; then model_name_or_path="BaptisteDoyen/camembert-base-xnli" - approach="PostTrainingDynamic" + approach="dynamic" extra_cmd=$extra_cmd" --dataset_name xnli --dataset_config_name fr" elif [ "${topology}" = "xlnet_base_SST-2_static" ]; then extra_cmd=$extra_cmd" --task_name sst2" model_name_or_path="textattack/xlnet-base-cased-SST-2" - 
approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "funnel_small_MRPC_static" ]; then extra_cmd=$extra_cmd" --task_name mrpc" model_name_or_path="funnel-transformer/small-base" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "roberta_base_SST-2_dynamic" ]; then extra_cmd=$extra_cmd" --task_name sst2" model_name_or_path="textattack/roberta-base-SST-2" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "distillbert_base_SST-2_static" ]; then extra_cmd=$extra_cmd" --task_name sst2" model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "distillbert_base_SST-2_dynamic" ]; then extra_cmd=$extra_cmd" --task_name sst2" model_name_or_path="distilbert-base-uncased-finetuned-sst-2-english" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "albert_base_MRPC_static" ]; then extra_cmd=$extra_cmd" --task_name mrpc" model_name_or_path="textattack/albert-base-v2-MRPC" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "albert_base_MRPC_dynamic" ]; then extra_cmd=$extra_cmd" --task_name mrpc" model_name_or_path="textattack/albert-base-v2-MRPC" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "xlm_roberta_large_XNLI_dynamic" ]; then extra_cmd=$extra_cmd" --dataset_name xnli --dataset_config_name en" model_name_or_path="joeddav/xlm-roberta-large-xnli" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "bert_base_SST-2_static_no_trainer" ]; then extra_cmd=" --task_name sst2" model_name_or_path="echarlaix/bert-base-uncased-sst2-acc91.1-d37-hybrid" - approach="PostTrainingStatic" + approach="static" script="../run_glue_no_trainer.py" fi diff --git a/examples/huggingface/pytorch/text-classification/quantization/qat/run_tuning.sh b/examples/huggingface/pytorch/text-classification/quantization/qat/run_tuning.sh index 
3d5e71b0212..28aba1b27b7 100644 --- a/examples/huggingface/pytorch/text-classification/quantization/qat/run_tuning.sh +++ b/examples/huggingface/pytorch/text-classification/quantization/qat/run_tuning.sh @@ -18,7 +18,7 @@ function init_params { batch_size=8 MAX_SEQ_LENGTH=128 model_type="bert" - approach="PostTrainingStatic" + approach="static" for var in "$@" do case $var in @@ -49,7 +49,7 @@ function run_tuning { TASK_NAME="mrpc" model_name_or_path="bert-base-uncased" model_type="bert" - approach="QuantizationAwareTraining" + approach="qat" extra_cmd=$extra_cmd" --learning_rate 1e-5 \ --num_train_epochs 6 \ --eval_steps 100 \ @@ -59,7 +59,8 @@ function run_tuning { --evaluation_strategy steps \ --save_strategy steps \ --metric_for_best_model accuracy \ - --save_total_limit 1" + --save_total_limit 1 \ + --save_safetensors False" fi diff --git a/examples/huggingface/pytorch/text-classification/quantization/run_glue.py b/examples/huggingface/pytorch/text-classification/quantization/run_glue.py index 0c7df87abea..7a915a56a8a 100644 --- a/examples/huggingface/pytorch/text-classification/quantization/run_glue.py +++ b/examples/huggingface/pytorch/text-classification/quantization/run_glue.py @@ -26,8 +26,14 @@ import transformers from dataclasses import dataclass, field from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import OptimizedModel, metrics, objectives from intel_extension_for_transformers.transformers.trainer import NLPTrainer +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from transformers import ( AutoConfig, AutoModelForSequenceClassification, @@ -198,9 +204,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - 
default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default=None, @@ -534,28 +540,37 @@ def compute_metrics(p: EvalPrediction): if not training_args.do_eval: raise ValueError("do_eval must be set to True for quantization.") - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) + trainer.metrics = tune_metric objective = objectives.performance - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective], - sampling_size = len(train_dataset)//20 + tuning_criterion = TuningCriterion(max_trials=600, objective=[objective.name]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
) + if optim_args.quantization_approach != "qat": + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = trainer.quantize(quant_config=quantization_config) if optim_args.benchmark_only: diff --git a/examples/huggingface/pytorch/text-classification/quantization/run_glue_no_trainer.py b/examples/huggingface/pytorch/text-classification/quantization/run_glue_no_trainer.py deleted file mode 100644 index 00006ac0526..00000000000 --- a/examples/huggingface/pytorch/text-classification/quantization/run_glue_no_trainer.py +++ /dev/null @@ -1,563 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Finetuning a 🤗 Transformers model for sequence classification on GLUE.""" -import argparse -import logging -import math -import os -import random -import time -from pathlib import Path - -import pandas as pd # to read in different data - -import datasets -from datasets import load_dataset, load_metric -from torch.utils.data import DataLoader - -import transformers -from accelerate import Accelerator -from huggingface_hub import Repository -from intel_extension_for_transformers.transformers import (metrics, NoTrainerOptimizer, objectives, OptimizedModel, - QuantizationConfig) -from transformers import ( - AdamW, - AutoConfig, - AutoModelForSequenceClassification, - AutoTokenizer, - DataCollatorWithPadding, - PretrainedConfig, - SchedulerType, - default_data_collator, - get_scheduler, - set_seed, -) -from transformers.file_utils import get_full_repo_name -from transformers.utils.versions import require_version - -logger = logging.getLogger(__name__) - -require_version("datasets>=1.8.0", - "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") - -task_to_keys = { - "cola": ("sentence", None), - "mnli": ("premise", "hypothesis"), - "mrpc": ("sentence1", "sentence2"), - "qnli": ("question", "sentence"), - "qqp": ("question1", "question2"), - "rte": ("sentence1", "sentence2"), - "sst2": ("sentence", None), - "stsb": ("sentence1", "sentence2"), - "wnli": ("sentence1", "sentence2"), -} - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Finetune a transformers model on a text classification task") - parser.add_argument( - "--task_name", - type=str, - default=None, - help="The name of the glue task to train on.", - choices=list(task_to_keys.keys()), - ) - parser.add_argument("--train_file", - type=str, - default=None, - help="A csv or a json file containing the training data.") - parser.add_argument("--validation_file", - type=str, - default=None, - help="A csv or a json file containing the validation data.") - 
parser.add_argument( - "--max_length", - type=int, - default=128, - help= - ("The maximum total input sequence length after tokenization. Sequences longer than this will be truncated," - " sequences shorter will be padded if `--pad_to_max_lengh` is passed."), - ) - parser.add_argument( - "--pad_to_max_length", - action="store_true", - help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.", - ) - parser.add_argument( - "--model_name_or_path", - type=str, - help="Path to pretrained model or model identifier from huggingface.co/models.", - required=True, - ) - parser.add_argument( - "--use_slow_tokenizer", - action="store_true", - help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", - ) - parser.add_argument( - "--per_device_train_batch_size", - type=int, - default=8, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument( - "--per_device_eval_batch_size", - type=int, - default=8, - help="Batch size (per device) for the evaluation dataloader.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-5, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") - parser.add_argument("--num_train_epochs", - type=int, - default=3, - help="Total number of training epochs to perform.") - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. 
If provided, overrides num_train_epochs.", - ) - parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", - ) - parser.add_argument( - "--lr_scheduler_type", - type=SchedulerType, - default="linear", - help="The scheduler type to use.", - choices=[ - "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", - "constant_with_warmup" - ], - ) - parser.add_argument("--num_warmup_steps", - type=int, - default=0, - help="Number of steps for the warmup in the lr scheduler.") - parser.add_argument("--output_dir", - type=str, - default=None, - help="Where to store the final model.") - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument("--push_to_hub", - action="store_true", - help="Whether or not to push the model to the Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - help="The name of the repository to keep in sync with the local `output_dir`.") - parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") - parser.add_argument("--tune", action="store_true", help="tune a best model with Intel Extension for Transformers.") - parser.add_argument("--quantization_approach", - type=str, - default="PostTrainingStatic", - help="Quantization approach. 
Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining.") - parser.add_argument("--metric_name", - type=str, - default=None, - help="Metric name used for the tuning strategy.") - parser.add_argument("--is_relative", - type=bool, - default=True, - help="Metric tolerance model, expected to be relative or absolute.") - parser.add_argument("--perf_tol", - type=float, - default=0.01, - help="Performance tolerance when optimizing the model.") - parser.add_argument("--benchmark", action="store_true", help="run benchmark.") - parser.add_argument("--int8", action="store_true", help="run benchmark with int8 model.") - parser.add_argument("--accuracy_only", - action="store_true", - help="Whether to only test accuracy for model tuned by Neural Compressor.") - parser.add_argument('-i', "--iter", default=0, type=int, help='For accuracy measurement only.') - parser.add_argument('-w', - "--warmup_iter", - default=1, - type=int, - help='For benchmark measurement only.') - args = parser.parse_args() - - # Sanity checks - if args.task_name is None and args.train_file is None and args.validation_file is None: - raise ValueError("Need either a task name or a training/validation file.") - else: - if args.train_file is not None: - extension = args.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if args.validation_file is not None: - extension = args.validation_file.split(".")[-1] - assert extension in ["csv", - "json"], "`validation_file` should be a csv or a json file." - - if args.push_to_hub: - assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." 
- - return args - - -def eval_func(args, model, accelerator, eval_dataloader, metric): - # Evaluation - batch_time = AverageMeter('Time', ':6.3f') - is_regression = args.task_name == "stsb" - model.eval() - for step, batch in enumerate(eval_dataloader): - if step >= args.warmup_iter: - start = time.time() - # soft labels - outputs = model(**batch) - # measure elapsed time - if step >= args.warmup_iter: - batch_time.update(time.time() - start) - predictions = outputs.logits.argmax( - dim=-1) if not is_regression else outputs.logits.squeeze() - metric.add_batch( - predictions=accelerator.gather(predictions), - references=accelerator.gather(batch["labels"]), - ) - eval_metric = metric.compute() - batch_size = args.per_device_eval_batch_size - print('Batch size = {}'.format(batch_size)) - print('Latency: %.3f ms' % (batch_time.avg / batch_size * 1000)) - print('Throughput: %.3f images/sec' % (batch_size / batch_time.avg)) - logger.info(f"{eval_metric}") - return eval_metric - - -def main(): - # read in the arguments - args = parse_args() - - # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. - accelerator = Accelerator() - # Make one log on every process with the configuration for debugging. - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - ) - logger.info(accelerator.state) - - # Setup logging, we only want one process per machine to log things on the screen. - # accelerator.is_local_main_process is only True for one process per machine. - logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) - if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() - transformers.utils.logging.set_verbosity_info() - else: - datasets.utils.logging.set_verbosity_error() - transformers.utils.logging.set_verbosity_error() - - # If passed along, set the training seed now. 
- if args.seed is not None: - set_seed(args.seed) - - # Handle the repository creation - if accelerator.is_main_process: - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - repo = Repository(args.output_dir, clone_from=repo_name) - elif args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - accelerator.wait_for_everyone() - - # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) - # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). - - # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the - # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named - # label if at least two columns are provided. - - # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this - # single column. You can easily tweak this behavior (see below) - - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if args.task_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset("glue", args.task_name) - ''' - 06/25/2022 - Distilled-sparse training for bert_mini, on sst2 - pre-load an augmented dataset - ''' - else: - # Loading the dataset from local csv or json file. 
- data_files = {} - if args.train_file is not None: - data_files["train"] = args.train_file - if args.validation_file is not None: - data_files["validation"] = args.validation_file - extension = (args.train_file - if args.train_file is not None else args.valid_file).split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files) - # See more about loading any type of standard or custom dataset at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - if args.task_name is not None: - is_regression = args.task_name == "stsb" - if not is_regression: - label_list = raw_datasets["train"].features["label"].names - num_labels = len(label_list) - else: - num_labels = 1 - else: - # Trying to have good defaults here, don't hesitate to tweak to your needs. - is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] - if is_regression: - num_labels = 1 - else: - # A useful fast method: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique - label_list = raw_datasets["train"].unique("label") - label_list.sort() # Let's sort it for determinism - num_labels = len(label_list) - - # Load pretrained model and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained(args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name) - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, - use_fast=not args.use_slow_tokenizer) - if args.int8: - # Load the model obtained after Intel Neural Compressor (INC) quantization - model = OptimizedModel.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config - ) - else: - model = AutoModelForSequenceClassification.from_pretrained( - args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config - ) - - # Preprocessing the datasets - if args.task_name is not None: - sentence1_key, sentence2_key = task_to_keys[args.task_name] - else: - # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. - non_label_column_names = [ - name for name in raw_datasets["train"].column_names if name != "label" - ] - if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: - sentence1_key, sentence2_key = "sentence1", "sentence2" - else: - if len(non_label_column_names) >= 2: - sentence1_key, sentence2_key = non_label_column_names[:2] - else: - sentence1_key, sentence2_key = non_label_column_names[0], None - - # Some models have set the order of the labels to use, so let's make sure we do use it. - label_to_id = None - if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id - and args.task_name is not None and not is_regression): - # Some have all caps in their config, some don't. - label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} - if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): - logger.info( - f"The configuration of the model provided the following label correspondence: {label_name_to_id}. 
" - "Using it!") - else: - logger.warning( - "Your model seems to have been trained with labels, but they don't match the dataset: ", - f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." - "\nIgnoring the model labels as a result.", - ) - elif args.task_name is None: - label_to_id = {v: i for i, v in enumerate(label_list)} - - if label_to_id is not None: - model.config.label2id = label_to_id - model.config.id2label = {id: label for label, id in config.label2id.items()} - elif args.task_name is not None and not is_regression: - model.config.label2id = {l: i for i, l in enumerate(label_list)} - model.config.id2label = {id: label for label, id in config.label2id.items()} - - padding = "max_length" if args.pad_to_max_length else False - - def preprocess_function(examples): - # Tokenize the texts - texts = ((examples[sentence1_key], ) if sentence2_key is None else - (examples[sentence1_key], examples[sentence2_key])) - result = tokenizer(*texts, padding=padding, max_length=args.max_length, truncation=True) - - if "label" in examples: - if label_to_id is not None: - # Map labels to IDs (not necessary for GLUE tasks) - result["labels"] = [label_to_id[l] for l in examples["label"]] - else: - # In all cases, rename the column to labels because the model will expect that. 
- result["labels"] = examples["label"] - - return result - - with accelerator.main_process_first(): - - # original process - processed_datasets = raw_datasets.map( - preprocess_function, - batched=True, - remove_columns=raw_datasets["train"].column_names, - desc="Running tokenizer on dataset", - ) - train_dataset = processed_datasets["train"] - eval_dataset = processed_datasets["validation_matched" if args.task_name == - "mnli" else "validation"] - #if use_augmented: - # test_dataset = processed_datasets["test"] - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - - # DataLoaders creation: - if args.pad_to_max_length: - # If padding was already done ot max length, we use the default data collator that will just convert everything - # to tensors. - data_collator = default_data_collator - else: - # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of - # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple - # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorWithPadding( - tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) - - train_dataloader = DataLoader(train_dataset, - shuffle=True, - collate_fn=data_collator, - batch_size=args.per_device_train_batch_size) - eval_dataloader = DataLoader(eval_dataset, - collate_fn=data_collator, - batch_size=args.per_device_eval_batch_size) - - # Optimizer - # Split weights in two groups, one with weight decay and the other not. 
- no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": - [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) - - # Prepare everything with our `accelerator`. - model, optimizer = accelerator.prepare(model, optimizer) - - # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be - # shorter in multiprocess) - - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - lr_scheduler = get_scheduler( - name=args.lr_scheduler_type, - optimizer=optimizer, - num_warmup_steps=args.num_warmup_steps, - num_training_steps=args.max_train_steps, - ) - - # Get the metric function - if args.task_name is not None: - metric = load_metric("glue", args.task_name) - else: - metric = load_metric("accuracy") - - metric_name = (args.metric_name if args.metric_name is not None else - ("pearson" if args.task_name == "stsb" else - "matthews_correlation" if args.task_name == "cola" else "accuracy")) - - def eval_func_nc(model): - ret = eval_func(args, model, accelerator, eval_dataloader, metric) - return ret[metric_name] - - # Train! 
- if args.tune: - if accelerator.is_main_process: - tokenizer.save_pretrained(args.output_dir) - config.save_pretrained(args.output_dir, - is_main_process=accelerator.is_main_process, - save_function=accelerator.save) - tune_metric = metrics.Metric(name=metric_name, - is_relative=args.is_relative, - criterion=args.perf_tol) - objective = objectives.performance - q_config = QuantizationConfig(approach=args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective]) - quantizer = NoTrainerOptimizer(model, args.output_dir) - model = quantizer.quantize(q_config, - eval_func=eval_func_nc, - calib_dataloader=train_dataloader) - - if args.benchmark or args.accuracy_only: - results = eval_func(args, model, accelerator, eval_dataloader, metric) - print("Finally Eval {} Accuracy: {:.5f}".format(metric_name, results[metric_name])) - - -class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - -if __name__ == "__main__": - main() diff --git a/examples/huggingface/pytorch/text-generation/inference/run_generation_with_deepspeed.py b/examples/huggingface/pytorch/text-generation/inference/run_generation_with_deepspeed.py index ee629d0fd17..f705e275d8f 100644 --- a/examples/huggingface/pytorch/text-generation/inference/run_generation_with_deepspeed.py +++ b/examples/huggingface/pytorch/text-generation/inference/run_generation_with_deepspeed.py @@ -79,6 +79,7 @@ parser.add_argument("--token-latency", action="store_true") parser.add_argument("--throughput", action="store_true") 
parser.add_argument("--accuracy-only", action="store_true") +parser.add_argument("--optimum-intel", action="store_true", help="Use IPEXModel in optimum-intel to optimize the model") parser.add_argument( "--acc-tasks", nargs="+", @@ -143,7 +144,7 @@ def get_repo_root(model_name_or_path): model_name_or_path, local_files_only=is_offline_mode(), cache_dir=os.getenv("TRANSFORMERS_CACHE", None), - ignore_patterns=["*.safetensors", "*.msgpack", "*.h5"], + ignore_patterns=["*.safetensors", "*.msgpack", "*.h5", "training_args.bin"], resume_download=True, ) @@ -153,7 +154,7 @@ def get_repo_root(model_name_or_path): model_name_or_path, local_files_only=is_offline_mode(), cache_dir=os.getenv("TRANSFORMERS_CACHE", None), - ignore_patterns=["*.safetensors", "*.msgpack", "*.h5"], + ignore_patterns=["*.safetensors", "*.msgpack", "*.h5", "training_args.bin"], resume_download=True, ) @@ -201,7 +202,7 @@ def print_mem_usage(msg): if args.benchmark: print_mem_usage("pre-from-pretrained") -is_meta_support = not model_type in ["falcon"] +is_meta_support = model_type not in ["falcon"] # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load with deepspeed.OnDevice(dtype=load_dtype, device="meta", enabled=is_meta_support): @@ -226,7 +227,8 @@ def print_mem_usage(msg): def write_checkpoints_json(): checkpoint_files = get_checkpoint_files(model_name) if local_rank == 0: - data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0} + type = "BLOOM" if model.config.model_type == "bloom" else "ds_model" + data = {"type": type, "checkpoints": checkpoint_files, "version": 1.0} json.dump(data, open(checkpoints_json, "w")) @@ -255,6 +257,7 @@ def write_checkpoints_json(): checkpoint=checkpoints_json if is_meta_support else None, **kwargs, ) +model = model.module if args.benchmark: print_mem_usage("post-ds-inference-init") @@ -264,7 +267,11 @@ def write_checkpoints_json(): # to ipex if args.ipex: - model = 
ipex.optimize_transformers(model.eval().to("xpu"), dtype=infer_dtype) + if args.optimum_intel and args.device == "cpu" and model.config.model_type == "llama": + from optimum.intel import IPEXModelForCausalLM + model = IPEXModelForCausalLM(model.eval(), config) + else: + model = ipex.optimize_transformers(model.eval().to(model.device), dtype=infer_dtype) # bypass assertion for beam4 if isinstance(model, deepspeed.InferenceEngine): @@ -378,7 +385,7 @@ def _model_generate(self, context, max_length, eos_token_id): config=config, model=model, tokenizer=tokenizer, - device="xpu", + device=model.device, num_beams=args.num_beams, batch_size=args.batch_size, dtype=args.dtype, @@ -480,7 +487,7 @@ def generate(): with torch.inference_mode(): # latency for i in range(cycles): - with torch.autograd.profiler_legacy.profile(enabled=do_profiling, use_xpu=True, record_shapes=True) as prof: + with torch.autograd.profiler_legacy.profile(enabled=do_profiling, use_xpu=True if model.device.type=="cpu" else False, record_shapes=True) as prof: t0 = time.time() gen_ids, outputs = generate() if args.cuda: @@ -488,7 +495,7 @@ def generate(): t1 = time.time() if do_profiling: - torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), "./profile_{}.pt".format(local_rank)) + torch.save(prof.key_averages().table(sort_by=f"self_{args.device}_time_total"), "./profile_{}.pt".format(local_rank)) torch.save(prof.table(sort_by="id", row_limit=-1),'./profile_{}_id.pt'.format(local_rank)) torch.save(prof.key_averages(group_by_input_shape=True).table(), "./profile_{}_detail.pt".format(local_rank)) prof.export_chrome_trace("./trace.json") diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index d2e77761da2..db3a06767c9 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -16,7 
+16,7 @@ function init_params { model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd="" batch_size=8 - approach="PostTrainingStatic" + approach="static" script="run_generation_sq.py" alpha=0.5 weight_dtype="int4" diff --git a/examples/huggingface/pytorch/text-to-image/quantization/ptq/README.md b/examples/huggingface/pytorch/text-to-image/quantization/ptq/README.md index 345ccaba5c3..d74432313d2 100644 --- a/examples/huggingface/pytorch/text-to-image/quantization/ptq/README.md +++ b/examples/huggingface/pytorch/text-to-image/quantization/ptq/README.md @@ -22,7 +22,7 @@ pip install -r requirements.txt python run_diffusion.py \ --model_name_or_path lambdalabs/sd-pokemon-diffusers \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --perf_tol 0.02 \ --output_dir /tmp/diffusion_output \ --base_images base_images \ diff --git a/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_diffusion.py b/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_diffusion.py index 5e2e2e62669..3f4ace9720d 100644 --- a/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_diffusion.py +++ b/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_diffusion.py @@ -30,7 +30,13 @@ from accelerate.utils import set_seed from diffusers import StableDiffusionPipeline -from intel_extension_for_transformers.transformers import metrics , NoTrainerOptimizer, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers import metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + TuningCriterion, + AccuracyCriterion +) +from neural_compressor.quantization import fit from intel_extension_for_transformers.transformers.config import WEIGHTS_NAME from pytorch_fid import fid_score @@ -100,9 +106,9 @@ def parse_args(): parser.add_argument( "--quantization_approach", type=str, - default="PostTrainingStatic", - help="Quantization approach. 
Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining.", + default="static", + help="Quantization approach. Supported approach are static, " + "dynamic and qat.", ) parser.add_argument( "--framework", @@ -301,18 +307,28 @@ def eval_func(model): criterion=args.perf_tol, greater_is_better=False ) - quantization_config = QuantizationConfig( - approach=args.quantization_approach, - max_trials=200, - metrics=[tune_metric], - ) + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=False, # optional. + criterion="relative" if args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=args.perf_tol, # optional. + ) + quantization_config = PostTrainingQuantConfig( + approach=args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) os.makedirs(args.output_dir, exist_ok=True) - quantizer = NoTrainerOptimizer(model, args.output_dir) - model = quantizer.quantize(quantization_config, + model = fit(model, + quantization_config, eval_func=eval_func, calib_func=calibration_func, calib_dataloader=DataLoader(CalibDataset(), batch_size=1), - ) + ) + + weights_file = os.path.join(os.path.abspath( + os.path.expanduser(args.output_dir)), WEIGHTS_NAME) + torch.save(model.quantized_state_dict(), weights_file) setattr(pipe, name, model) logger.info(f"Optimized model {name} saved to: {args.output_dir}.") diff --git a/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_tuning.sh b/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_tuning.sh index 117762d8940..c288b195d27 100644 --- a/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_tuning.sh +++ b/examples/huggingface/pytorch/text-to-image/quantization/ptq/run_tuning.sh @@ -15,7 +15,7 @@ function init_params { model_name_or_path="lambdalabs/sd-pokemon-diffusers" 
extra_cmd="" batch_size=8 - approach="PostTrainingStatic" + approach="static" for var in "$@" do case $var in @@ -45,10 +45,10 @@ function run_tuning { if [ "${topology}" = "sd_pokemon_diffusers_static" ]; then model_name_or_path="lambdalabs/sd-pokemon-diffusers" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "sd_pokemon_diffusers_dynamic" ]; then model_name_or_path="lambdalabs/sd-pokemon-diffusers" - approach="PostTrainingDynamic" + approach="dynamic" fi python -u ./run_diffusion.py \ diff --git a/examples/huggingface/pytorch/text2text-generation/run_tuning.sh b/examples/huggingface/pytorch/text2text-generation/run_tuning.sh index 3d35086f578..826469b1b3c 100644 --- a/examples/huggingface/pytorch/text2text-generation/run_tuning.sh +++ b/examples/huggingface/pytorch/text2text-generation/run_tuning.sh @@ -16,7 +16,7 @@ function init_params { model_name_or_path="google/flan-t5-large" extra_cmd="" batch_size=8 - approach="PostTrainingStatic" + approach="static" alpha=0.7 for var in "$@" do diff --git a/examples/huggingface/pytorch/textual-inversion/distillation_for_quantization/textual_inversion.py b/examples/huggingface/pytorch/textual-inversion/distillation_for_quantization/textual_inversion.py index 128c9248341..832da5ceb52 100644 --- a/examples/huggingface/pytorch/textual-inversion/distillation_for_quantization/textual_inversion.py +++ b/examples/huggingface/pytorch/textual-inversion/distillation_for_quantization/textual_inversion.py @@ -19,9 +19,10 @@ from diffusers.optimization import get_scheduler from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from huggingface_hub import HfFolder, Repository, whoami -from intel_extension_for_transformers.transformers.config import ( +from neural_compressor.config import ( DistillationConfig, - QuantizationConfig, + IntermediateLayersKnowledgeDistillationLossConfig, + QuantizationAwareTrainingConfig, ) from intel_extension_for_transformers.transformers.utils import 
metrics, objectives from intel_extension_for_transformers.transformers.trainer import NLPTrainer @@ -769,12 +770,7 @@ def train_func(model): tune_metric = metrics.Metric(name="") if args.do_quantization: objective = objectives.performance - quantization_conf = QuantizationConfig( - approach="QuantizationAwareTraining", - max_trials=600, - metrics=[tune_metric], - objectives=[objective] - ) + quantization_conf = QuantizationAwareTrainingConfig() conf_list.append(quantization_conf) if args.do_distillation: @@ -828,17 +824,13 @@ def train_func(model): [['mid_block.resnets.1', ]], [['conv_out', ]], ] - - distillation_conf = DistillationConfig( - framework="pytorch_fx", metrics=tune_metric, - criterion=Criterion( - name="IntermediateLayersLoss", - layer_mappings=layer_mappings, - loss_types=["MSE"] * len(layer_mappings), - loss_weight_ratio=[1.0 / len(layer_mappings)] * len(layer_mappings), - add_origin_loss=True - ) + criterion_conf = IntermediateLayersKnowledgeDistillationLossConfig( + layer_mappings=layer_mappings, + loss_types=["MSE"] * len(layer_mappings), + loss_weight_ratio=[1.0 / len(layer_mappings)] * len(layer_mappings), + add_origin_loss=True ) + distillation_conf = DistillationConfig(teacher_model=teacher_model, criterion=criterion_conf) conf_list.append(distillation_conf) # Initialize our Trainer @@ -846,10 +838,10 @@ def train_func(model): model=model, args=TrainingArguments(output_dir=args.output_dir), ) + trainer.metrics = tune_metric model = trainer.orchestrate_optimizations( config_list=conf_list, - teacher_model=teacher_model, eval_func=lambda model:1, train_func=train_func) diff --git a/examples/huggingface/pytorch/token-classification/quantization/README.md b/examples/huggingface/pytorch/token-classification/quantization/README.md index 0cc950c742f..a718fceb82a 100644 --- a/examples/huggingface/pytorch/token-classification/quantization/README.md +++ b/examples/huggingface/pytorch/token-classification/quantization/README.md @@ -8,9 +8,7 @@ Token 
classification assigns a label to individual tokens in a sentence. One of ## 1. Environment ``` pip install -r requirements.txt -pip install transformers==4.34.1 ``` ->**Note**: Please use transformers no higher than 4.34.1 # Run @@ -22,7 +20,7 @@ pip install transformers==4.34.1 --model_name_or_path elastic/distilbert-base-uncased-finetuned-conll03-english \ --dataset_name conll2003 \ --tune \ - --quantization_approach PostTrainingStatic \ + --quantization_approach static \ --do_train \ --do_eval \ --pad_to_max_length \ @@ -32,7 +30,7 @@ pip install transformers==4.34.1 # Performance Data -|Dataset|Pretrained model|PostTrainingDynamic | PostTrainingStatic | QuantizationAwareTraining +|Dataset|Pretrained model|dynamic | static | qat |---|------------------------------------|---|---|--- |NER|elastic/distilbert-base-uncased-finetuned-conll03-english| ✅| ✅| ✅ diff --git a/examples/huggingface/pytorch/token-classification/quantization/run_ner.py b/examples/huggingface/pytorch/token-classification/quantization/run_ner.py index 7076b1bd5fe..d83ae181630 100644 --- a/examples/huggingface/pytorch/token-classification/quantization/run_ner.py +++ b/examples/huggingface/pytorch/token-classification/quantization/run_ner.py @@ -27,10 +27,12 @@ import transformers from dataclasses import dataclass, field from datasets import ClassLabel, load_dataset, load_metric -from intel_extension_for_transformers.transformers import( - metrics, - OptimizedModel, - QuantizationConfig, +from intel_extension_for_transformers.transformers import OptimizedModel, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion ) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( @@ -200,9 +202,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", 
- metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_f1", @@ -299,7 +301,8 @@ def main(): if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, + trust_remote_code=True ) else: data_files = {} @@ -310,7 +313,8 @@ def main(): if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.train_file.split(".")[-1] - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, + trust_remote_code=True) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -536,7 +540,7 @@ def tokenize_and_align_labels(examples): data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) # Metrics - metric = load_metric("seqeval") + metric = load_metric("seqeval", trust_remote_code=True) def compute_metrics(p): predictions, labels = p @@ -573,6 +577,7 @@ def compute_metrics(p): metric_name = optim_args.metric_name training_args.metric_for_best_model = metric_name + # Initialize our Trainer trainer = NLPTrainer( model=model, @@ -590,25 +595,42 @@ def compute_metrics(p): raise ValueError("do_eval must be set to True for quantization.") trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - metrics=[tune_metric], - sampling_size = len(train_dataset)//20 - ) + trainer.metrics = tune_metric + if optim_args.quantization_approach != "qat": + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
+ ) + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + tuning_criterion = TuningCriterion(max_trials=600, objective=["performance"]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. + ) + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = trainer.quantize(quantization_config) if optim_args.benchmark_only: diff --git a/examples/huggingface/pytorch/token-classification/quantization/run_tuning.sh b/examples/huggingface/pytorch/token-classification/quantization/run_tuning.sh index 35616849055..ce43f4c35b9 100644 --- a/examples/huggingface/pytorch/token-classification/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/token-classification/quantization/run_tuning.sh @@ -18,7 +18,7 @@ function init_params { batch_size=8 MAX_SEQ_LENGTH=384 model_type="bert" - approach="PostTrainingStatic" + approach="static" for var in "$@" do case $var in @@ -49,17 +49,17 @@ function run_tuning { DATASET_NAME="conll2003" model_name_or_path="elastic/distilbert-base-uncased-finetuned-conll03-english " model_type="bert" - approach="PostTrainingStatic" + approach="static" elif [ "${topology}" = "distilbert_base_ner_dynamic" ]; then DATASET_NAME="conll2003" model_name_or_path="elastic/distilbert-base-uncased-finetuned-conll03-english " model_type="bert" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "distilbert_base_ner_qat" ]; then 
DATASET_NAME="conll2003" model_name_or_path="elastic/distilbert-base-uncased-finetuned-conll03-english " model_type="bert" - approach="QuantizationAwareTraining" + approach="qat" extra_cmd=$extra_cmd" --learning_rate 1e-5 \ --num_train_epochs 6 \ --eval_steps 100 \ @@ -68,7 +68,8 @@ function run_tuning { --load_best_model_at_end True \ --evaluation_strategy steps \ --save_strategy steps \ - --save_total_limit 1" + --save_total_limit 1 \ + --save_safetensors False" fi python -u ./run_ner.py \ diff --git a/examples/huggingface/pytorch/translation/quantization/README.md b/examples/huggingface/pytorch/translation/quantization/README.md index 3eecf1d4316..8fd4c4844fe 100644 --- a/examples/huggingface/pytorch/translation/quantization/README.md +++ b/examples/huggingface/pytorch/translation/quantization/README.md @@ -9,9 +9,7 @@ This directory contains the example for quantization models on translation tasks ``` pip install intel-extension-for-transformers pip install -r requirements.txt -pip install transformers==4.34.1 ``` ->**Note**: Please use transformers no higher than 4.34.1 # Run @@ -30,8 +28,8 @@ python examples/pytorch/translation/run_translation.py \ --dataset_name wmt16 \ --dataset_config_name ro-en \ --output_dir /tmp/tst-translation \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ --overwrite_output_dir \ --tune \ --predict_with_generate @@ -51,8 +49,8 @@ python examples/pytorch/translation/run_translation.py \ --dataset_name wmt16 \ --dataset_config_name ro-en \ --output_dir /tmp/tst-translation \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ --overwrite_output_dir \ --tune \ --predict_with_generate diff --git a/examples/huggingface/pytorch/translation/quantization/run_translation.py b/examples/huggingface/pytorch/translation/quantization/run_translation.py index 
f5554905dcc..4d02698d21a 100755 --- a/examples/huggingface/pytorch/translation/quantization/run_translation.py +++ b/examples/huggingface/pytorch/translation/quantization/run_translation.py @@ -28,8 +28,13 @@ import numpy as np from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import OptimizedModel, QuantizationConfig -from intel_extension_for_transformers.transformers import metrics as nlp_metrics +from intel_extension_for_transformers.transformers import OptimizedModel, objectives, metrics +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPSeq2SeqTrainer import transformers from transformers import ( @@ -244,9 +249,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default="eval_bleu", @@ -362,6 +367,7 @@ def main(): data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, + trust_remote_code=True, ) else: data_files = {} @@ -379,6 +385,7 @@ def main(): data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, + trust_remote_code=True, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
@@ -565,7 +572,7 @@ def preprocess_function(examples): ) # Metric - metric = load_metric("sacrebleu") + metric = load_metric("sacrebleu", trust_remote_code=True) def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] @@ -620,26 +627,43 @@ def compute_metrics(eval_preds): raise ValueError("do_eval must be set to True for quantization.") trainer.save_model(training_args.output_dir) - if optim_args.quantization_approach != "PostTrainingDynamic": + if optim_args.quantization_approach != "dynamic": if not training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." ) - if optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - early_stopping_threshold)) - - tune_metric = nlp_metrics.Metric( + objective = objectives.performance + tune_metric = metrics.Metric( name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol ) - quantization_config = QuantizationConfig( - approach=optim_args.quantization_approach, - max_trials=200, - metrics=[tune_metric], - sampling_size = len(train_dataset)//20 - ) + trainer.metrics = tune_metric + if optim_args.quantization_approach != "qat": + tuning_criterion = TuningCriterion(max_trials=600, objective=[objective.name]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. 
+ ) + quantization_config = PostTrainingQuantConfig( + approach=optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + tuning_criterion = TuningCriterion(max_trials=600, objective=[objective.name]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=optim_args.perf_tol, # optional. + ) + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) trainer.max_length = max_length trainer.num_beams = num_beams model = trainer.quantize(quant_config=quantization_config) diff --git a/examples/huggingface/pytorch/translation/quantization/run_tuning.sh b/examples/huggingface/pytorch/translation/quantization/run_tuning.sh index 14fef28308b..ef294a54587 100644 --- a/examples/huggingface/pytorch/translation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/translation/quantization/run_tuning.sh @@ -15,7 +15,7 @@ function init_params { DATASET_NAME="xsum" extra_cmd="" batch_size=8 - approach="PostTrainingStatic" + approach="static" for var in "$@" do case $var in @@ -45,11 +45,11 @@ function run_tuning { if [ "${topology}" = "t5-small_dynamic" ]; then model_name_or_path="t5-small" extra_cmd=$extra_cmd" --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config_name ro-en" - approach="PostTrainingDynamic" + approach="dynamic" elif [ "${topology}" = "marianmt_WMT_en_ro_dynamic" ]; then model_name_or_path='Helsinki-NLP/opus-mt-en-ro' extra_cmd=$extra_cmd" --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config_name ro-en" - 
approach="PostTrainingDynamic" + approach="dynamic" else echo "unsupported topology: ${topology}" exit 1 diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/README.md b/examples/huggingface/tensorflow/language-modeling/quantization/README.md deleted file mode 100644 index 883689068bc..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/README.md +++ /dev/null @@ -1,63 +0,0 @@ -Step-by-Step -========= - -This document describes the step-by-step instructions for reproducing the quantization on models for the Language Modeling tasks. - -There are mainly two kinds of language modeling tasks: Causal Language Modeling (CLM) and Masked Language Modeling (MLM). Two scripts `run_clm.py` and `run_mlm.py` provide quantization examples on the above two kinds of models based on [Intel® Neural Compressor](https://github.com/intel/neural-compressor). Users can easily run the quantization with `run_tuning.sh` and the benchmarking with `run_benchmark.sh`. - -Please note that language modeling tasks use `loss` as the evaluation metric so the loss will appear where the accuracy should be in the final tune result statistics, and the `greater_is_better=False` should be set in the Python scripts. - -Users can also change the `--max_training_samples`, `--max_eval_samples`, and `--max_seq_length` in the scripts for quicker debugging and to avoid potential lack of memory. - -# Prerequisite -## 1. Installation - -Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example: - -```shell -pip install intel-extension-for-transformers -cd ptq -pip install -r requirements.txt -``` - -# Run - -## 1. Run Command for the CLM task (Shell) - -- Topology: - - distilgpt2_clm - -* To get the int8 model - -``` -cd ptq -bash run_tuning.sh --topology=[topology] -``` - -* To benchmark the int8 model - - -``` -cd ptq -bash run_benchmark.sh --topology=[topology] --mode=benchmark --int8=true -``` - -## 2. 
Run Command for the MLM task (Shell) - -- Topology: - - distilbert_mlm - - distilroberta_mlm - -* To get the int8 model - -``` -cd ptq -bash run_tuning.sh --topology=[topology] -``` - -* To benchmark the int8 model - -``` -cd ptq -bash run_benchmark.sh --topology=[topology] --mode=benchmark --int8=true -``` \ No newline at end of file diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/requirements.txt b/examples/huggingface/tensorflow/language-modeling/quantization/ptq/requirements.txt deleted file mode 100644 index 62aa53701f4..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -datasets >= 1.17 -sentencepiece != 0.1.92 -protobuf -intel-tensorflow -transformers -scikit-learn -accelerate \ No newline at end of file diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_benchmark.sh b/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_benchmark.sh deleted file mode 100644 index e3cb8c3c55a..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_benchmark.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - topology="distilgpt2_clm" - iters=100 - batch_size=16 - tuned_checkpoint=saved_results - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - 
--task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "distilgpt2_clm" ]; then - script="run_clm.py" - dataset_name="wikitext" - model_name_or_path="distilgpt2" - dataset_config_name="wikitext-2-raw-v1" - # remove following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 196 --block_size 128" - elif [ "${topology}" = "distilbert_mlm" ]; then - script="run_mlm.py" - dataset_name="wikitext" - model_name_or_path="distilbert-base-cased" - dataset_config_name="wikitext-2-raw-v1" - # remove following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 196 --max_seq_length 128" - elif [ "${topology}" = "distilroberta_mlm" ]; then - script="run_mlm.py" - dataset_name="wikitext" - model_name_or_path="Rocketknight1/distilroberta-base-finetuned-wikitext2" - dataset_config_name="wikitext-2-raw-v1" - # remove following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 196 --max_seq_length 128" - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - python -u ../${script} \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --dataset_config_name ${dataset_config_name} \ - --do_eval \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - ${mode_cmd} \ - ${extra_cmd} -} - -main "$@" diff --git 
a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_tuning.sh b/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_tuning.sh deleted file mode 100644 index 6ffa270911f..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/ptq/run_tuning.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - topology="distilgpt2_clm" - tuned_checkpoint="saved_results" - extra_cmd="" - batch_size=8 - MAX_SEQ_LENGTH=128 - model_type="bert" - approach="PostTrainingStatic" - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - if [ "${topology}" = "distilgpt2_clm" ]; then - script="run_clm.py" - model_name_or_path="distilgpt2" - dataset_name="wikitext" - approach="PostTrainingStatic" - dataset_config_name="wikitext-2-raw-v1" - # remove or change following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 96 --block_size 128 --perf_tol 0.08" - elif [ "${topology}" = "distilbert_mlm" ]; then - script="run_mlm.py" - model_name_or_path="distilbert-base-cased" - dataset_name="wikitext" - approach="PostTrainingStatic" - dataset_config_name="wikitext-2-raw-v1" - # remove or change following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 96 --max_seq_length 128 --perf_tol 0.08" - elif [ "${topology}" = 
"distilroberta_mlm" ]; then - script="run_mlm.py" - model_name_or_path="Rocketknight1/distilroberta-base-finetuned-wikitext2" - dataset_name="wikitext" - approach="PostTrainingStatic" - dataset_config_name="wikitext-2-raw-v1" - # remove or change following two parameters if you have enough memory - extra_cmd=$extra_cmd" --max_eval_samples 96 --max_seq_length 128 --perf_tol 0.08" - fi - - if [ "${worker}" = "" ] - then - python -u ../${script} \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --dataset_config_name ${dataset_config_name} \ - --do_eval \ - --output_dir ${tuned_checkpoint} \ - --quantization_approach ${approach} \ - --do_train \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --tune \ - ${extra_cmd} - else - python -u ../${script} \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --dataset_config_name ${dataset_config_name} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --output_dir ${tuned_checkpoint} \ - --quantization_approach ${approach} \ - --do_train \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --tune \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/run_clm.py b/examples/huggingface/tensorflow/language-modeling/quantization/run_clm.py deleted file mode 100644 index 1b82d1ccf0f..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/run_clm.py +++ /dev/null @@ -1,814 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for causal language modeling (GPT-2, GPT-Neo...) -on a text file or a dataset without using HuggingFace Trainer. -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=text-generation -""" -# You can also adapt this script on your own clm task. Pointers for this are left as comments. - -import json - -# region Imports -import logging -import math -import os -import random -import sys -from dataclasses import dataclass, field -from itertools import chain -from pathlib import Path -from typing import Optional -import time - -import numpy as np -import datasets -import tensorflow as tf -from datasets import load_dataset, load_metric -from sklearn.model_selection import train_test_split -from transformers.trainer_utils import get_last_checkpoint, is_main_process - -import transformers -from transformers import ( - CONFIG_MAPPING, - CONFIG_NAME, - TF2_WEIGHTS_NAME, - TF_MODEL_FOR_CAUSAL_LM_MAPPING, - AutoConfig, - AutoTokenizer, - HfArgumentParser, - TFAutoModelForCausalLM, - TFTrainingArguments, - create_optimizer, - set_seed, -) -from transformers.utils.versions import require_version - -logger = logging.getLogger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/language-modeling/requirements.txt") -MODEL_CONFIG_CLASSES = list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) -# endregion - -# region Command-line arguments -@dataclass -class 
ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." - ) - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." 
- ) - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_seq_length: int = field( - default=128, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - block_size: Optional[int] = field( - default=None, - metadata={ - "help": ( - "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. 
" - "Default to the model max input length for single sentence inputs (take into account special tokens)." - ) - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - line_by_line: bool = field( - default=False, - metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. 
Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - is_relative: Optional[bool] = field( - default=True, - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "run benchmark."}) - int8: bool = field( - default=False, - metadata={"help":"Whether to use the quantized int8 model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) - - -# endregion - -def main(): - # region Argument Parsing - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - # region Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # Sanity checks - if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if data_args.train_file is not None: - extension = data_args.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." - if data_args.validation_file is not None: - extension = data_args.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." 
- - if training_args.output_dir is not None: - training_args.output_dir = Path(training_args.output_dir) - os.makedirs(training_args.output_dir, exist_ok=True) - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Checkpoints - # Detecting last checkpoint. - checkpoint = None - if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir and not training_args.do_eval: - config_path = training_args.output_dir / CONFIG_NAME - weights_path = training_args.output_dir / TF2_WEIGHTS_NAME - if config_path.is_file() and weights_path.is_file(): - checkpoint = training_args.output_dir - logger.info( - f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - else: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to continue regardless." - ) - - # endregion - - # If passed along, set the training seed now. 
- if training_args.seed is not None: - set_seed(training_args.seed) - - # region Load datasets - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - 
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - # endregion - # region Load pretrained model and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - # endregion - - - # region Dataset preprocessing - # First we tokenize all the texts. - column_names = raw_datasets["train"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - def tokenize_function(examples): - return tokenizer(examples[text_column_name], return_token_type_ids=True) - - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - - if data_args.block_size is None: - block_size = tokenizer.model_max_length - if block_size > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." - ) - block_size = 1024 - else: - if data_args.block_size > tokenizer.model_max_length: - logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 
- ) - block_size = min(data_args.block_size, tokenizer.model_max_length) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - ) - - train_dataset = tokenized_datasets["train"] - if data_args.validation_file is not None: - eval_dataset = tokenized_datasets["validation"] - else: - logger.info( - f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation" - " as provided in data_args" - ) - train_indices, val_indices = train_test_split( - list(range(len(train_dataset))), test_size=data_args.validation_split_percentage / 100 - ) - - eval_dataset = train_dataset.select(val_indices) - train_dataset = train_dataset.select(train_indices) - - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - # endregion - - with strategy.scope(): - # region Prepare model - if checkpoint is not None: - model = TFAutoModelForCausalLM.from_pretrained(checkpoint, config=config, cache_dir=model_args.cache_dir,) - elif model_args.model_name_or_path: - model = TFAutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config, - cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token 
else None,) - else: - logger.info("Training new model from scratch") - model = TFAutoModelForCausalLM.from_config(config) - - model.resize_token_embeddings(len(tokenizer)) - # endregion - - # region TF Dataset preparation - num_replicas = (len(worker_list) if worker_list is not None else 1) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - - # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in - # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also - # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names - # yourself if you use this method, whereas they are automatically inferred from the model input names when - # using model.prepare_tf_dataset() - # For more info see the docs: - # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset - # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset - - if model_args.model_name_or_path == "distilgpt2": - train_dataset = train_dataset.remove_columns('token_type_ids') - eval_dataset = eval_dataset.remove_columns('token_type_ids') - - tf_train_dataset = model.prepare_tf_dataset( - train_dataset, - shuffle=True, - batch_size=num_replicas * training_args.per_device_train_batch_size, - ).with_options(options) - - tf_eval_dataset = model.prepare_tf_dataset( - eval_dataset, - shuffle=False, - batch_size=num_replicas * training_args.per_device_eval_batch_size, - drop_remainder=True, - ).with_options(options) - # endregion - - # region Optimizer and loss - num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs) - if training_args.warmup_steps > 0: - num_warmup_steps = training_args.warmup_steps - elif training_args.warmup_ratio > 0: - num_warmup_steps = 
int(num_train_steps * training_args.warmup_ratio) - else: - num_warmup_steps = 0 - - # Bias and layernorm weights are automatically excluded from the decay - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - adam_beta1=training_args.adam_beta1, - adam_beta2=training_args.adam_beta2, - adam_epsilon=training_args.adam_epsilon, - weight_decay_rate=training_args.weight_decay, - adam_global_clipnorm=training_args.max_grad_norm, - ) - # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer, jit_compile=training_args.xla) - - def compute_metrics(preds, labels): - preds = preds["logits"] - # preds have the same shape as the labels, after the argmax(-1) has been calculated - # by preprocess_logits_for_metrics but we need to shift the labels - labels = labels[:, 1:] - preds = preds[:, :-1] - return hf_compute_loss(labels, preds) - - # loss function for CLM model - def hf_compute_loss(labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) - - # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway - unmasked_loss = loss_fn(tf.nn.relu(labels), logits) - # make sure only labels that are not equal to -100 affect the loss - loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype) - masked_loss = unmasked_loss * loss_mask - reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) - return tf.reshape(reduced_masked_loss, (1,)) - - def eval_func_clm(model): - label_ids: np.ndarray = None - - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = 
{training_args.per_device_eval_batch_size}") - - preds: np.ndarray = None - infer = model.signatures["serving_default"] - - for idx, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - results = infer(**inputs) - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - test_predictions = {"logits": preds} - loss = compute_metrics(test_predictions, label_ids) - - return loss.numpy()[0] - - # region tuning - if optim_args.tune: - from intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_train_dataset, - eval_dataset=tf_eval_dataset, - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - - # use customized eval function - optimization.eval_func = eval_func_clm - - tune_metric = metrics.Metric( - name="loss", greater_is_better=False, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - # endregion - - # region Training and validation - if training_args.do_train: - logger.info("***** Running training *****") - logger.info(f" Num examples = 
{len(train_dataset)}") - logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") - logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}") - - # For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints - # to the Hugging Face Hub rather than just pushing the finished model. - # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback - history = model.fit( - tf_train_dataset, - validation_data=tf_eval_dataset, - epochs=int(training_args.num_train_epochs), - ) - train_loss = history.history["loss"][-1] - try: - train_perplexity = math.exp(train_loss) - except OverflowError: - train_perplexity = math.inf - logger.info(f" Final train loss: {train_loss:.3f}") - logger.info(f" Final train perplexity: {train_perplexity:.3f}") - validation_loss = history.history["val_loss"][-1] - try: - validation_perplexity = math.exp(validation_loss) - except OverflowError: - validation_perplexity = math.inf - logger.info(f" Final validation loss: {validation_loss:.3f}") - logger.info(f" Final validation perplexity: {validation_perplexity:.3f}") - - if training_args.output_dir is not None: - output_eval_file = os.path.join(training_args.output_dir, "all_results.json") - results_dict = dict() - results_dict["train_loss"] = train_loss - results_dict["train_perplexity"] = train_perplexity - results_dict["eval_loss"] = validation_loss - results_dict["eval_perplexity"] = validation_perplexity - with open(output_eval_file, "w") as writer: - writer.write(json.dumps(results_dict)) - - if training_args.output_dir is not None and not training_args.push_to_hub: - # If we're not pushing to hub, at least save a local copy when we're done - model.save_pretrained(training_args.output_dir) - # endregion - - # region Evaluation - if training_args.do_eval: - 
num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures["serving_default"] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - - loss = compute_metrics({"logits": preds}, label_ids) - logger.info("\nEvaluation result: ") - logger.info("Accuracy: {}".format(loss.numpy()[0])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - #endregion - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/huggingface/tensorflow/language-modeling/quantization/run_mlm.py b/examples/huggingface/tensorflow/language-modeling/quantization/run_mlm.py deleted file mode 100644 index 
be683113ccf..00000000000 --- a/examples/huggingface/tensorflow/language-modeling/quantization/run_mlm.py +++ /dev/null @@ -1,848 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) -on a text file or a dataset without using HuggingFace Trainer. -Here is the full list of checkpoints on the hub that can be fine-tuned by this script: -https://huggingface.co/models?filter=fill-mask -""" -# You can also adapt this script on your own mlm task. Pointers for this are left as comments. 
- -import json -import logging -import math -import os -import random -import sys -from dataclasses import dataclass, field -from itertools import chain -from pathlib import Path -from typing import Optional -import time - -import datasets -import tensorflow as tf -from datasets import load_dataset -from sklearn.model_selection import train_test_split - -import numpy as np - -import transformers -from transformers import ( - CONFIG_MAPPING, - CONFIG_NAME, - TF2_WEIGHTS_NAME, - TF_MODEL_FOR_MASKED_LM_MAPPING, - AutoConfig, - AutoTokenizer, - DataCollatorForLanguageModeling, - HfArgumentParser, - PushToHubCallback, - TFAutoModelForMaskedLM, - TFTrainingArguments, - create_optimizer, - set_seed, -) - -from transformers.utils.versions import require_version -from transformers.trainer_utils import get_last_checkpoint, is_main_process - -logger = logging.getLogger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/language-modeling/requirements.txt") -MODEL_CONFIG_CLASSES = list(TF_MODEL_FOR_MASKED_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - -# region Command-line arguments -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. - """ - - model_name_or_path: Optional[str] = field( - default=None, - metadata={ - "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." - ) - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. 
Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" - ) - }, - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) - max_seq_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated." - ) - }, - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - mlm_probability: float = field( - default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"} - ) - line_by_line: bool = field( - default=False, - metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": ( - "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." 
- ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." - - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. 
Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - is_relative: Optional[bool] = field( - default=True, - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "run benchmark."}) - int8: bool = field( - default=False, - metadata={"help":"Whether to use the quantized int8 model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) - - -# endregion - - -def main(): - # region Argument Parsing - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - # region Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # Sanity checks - if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if data_args.train_file is not None: - extension = data_args.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." - if data_args.validation_file is not None: - extension = data_args.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." 
- - if training_args.output_dir is not None: - training_args.output_dir = Path(training_args.output_dir) - os.makedirs(training_args.output_dir, exist_ok=True) - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - - # region Checkpoints - # Detecting last checkpoint. - checkpoint = None - if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: - config_path = training_args.output_dir / CONFIG_NAME - weights_path = training_args.output_dir / TF2_WEIGHTS_NAME - if config_path.is_file() and weights_path.is_file(): - checkpoint = training_args.output_dir - logger.warning( - f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - else: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to continue regardless." - ) - - # endregion - - # If passed along, set the training seed now. 
- if training_args.seed is not None: - set_seed(training_args.seed) - - # region Load datasets - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - raw_datasets["train"] = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - data_files = {} - dataset_args = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = ( - data_args.train_file.split(".")[-1] - if data_args.train_file is not None - else data_args.validation_file.split(".")[-1] - ) - if extension == "txt": - extension = "text" - 
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - # endregion - - # region Load pretrained model and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - # endregion - - # region Dataset preprocessing - # First we tokenize all the texts. - column_names = raw_datasets["train"].column_names - text_column_name = "text" if "text" in column_names else column_names[0] - - if data_args.max_seq_length is None: - max_seq_length = tokenizer.model_max_length - if max_seq_length > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can reduce that default value by passing --max_seq_length xxx." - ) - max_seq_length = 1024 - else: - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - if data_args.line_by_line: - # When using line_by_line, we just tokenize each nonempty line. 
- padding = "max_length" if data_args.pad_to_max_length else False - - def tokenize_function(examples): - # Remove empty lines - examples[text_column_name] = [ - line for line in examples[text_column_name] if len(line) > 0 and not line.isspace() - ] - return tokenizer( - examples[text_column_name], - padding=padding, - truncation=True, - max_length=max_seq_length, - # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it - # receives the `special_tokens_mask`. - return_special_tokens_mask=True, - return_token_type_ids=True - ) - - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[text_column_name], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset line_by_line", - ) - else: - # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. - # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more - # efficient when it receives the `special_tokens_mask`. - def tokenize_function(examples): - return tokenizer(examples[text_column_name], return_special_tokens_mask=True, return_token_type_ids=True) - - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on every text in dataset", - ) - - # Main data processing function that will concatenate all texts from our dataset and generate chunks of - # max_seq_length. - def group_texts(examples): - # Concatenate all texts. 
- concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= max_seq_length: - total_length = (total_length // max_seq_length) * max_seq_length - # Split by chunks of max_len. - result = { - k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] - for k, t in concatenated_examples.items() - } - return result - - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a - # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value - # might be slower to preprocess. - # - # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - tokenized_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {max_seq_length}", - ) - - train_dataset = tokenized_datasets["train"] - - if data_args.validation_file is not None: - eval_dataset = tokenized_datasets["validation"] - else: - logger.info( - f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation" - " as provided in data_args" - ) - train_indices, val_indices = train_test_split( - list(range(len(train_dataset))), test_size=data_args.validation_split_percentage / 100 - ) - - eval_dataset = train_dataset.select(val_indices) - train_dataset = train_dataset.select(train_indices) - - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - 
train_dataset = train_dataset.select(range(max_train_samples)) - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - # endregion - - with strategy.scope(): - # region Prepare model - if checkpoint is not None: - model = TFAutoModelForMaskedLM.from_pretrained(checkpoint, config=config, cache_dir=model_args.cache_dir,) - elif model_args.model_name_or_path: - model = TFAutoModelForMaskedLM.from_pretrained(model_args.model_name_or_path, config=config, - cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None,) - else: - logger.info("Training new model from scratch") - model = TFAutoModelForMaskedLM.from_config(config) - - model.resize_token_embeddings(len(tokenizer)) - # endregion - - # region TF Dataset preparation - num_replicas = training_args.strategy.num_replicas_in_sync - data_collator = DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm_probability=data_args.mlm_probability, return_tensors="tf" - ) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - - # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in - # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also - # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names - # yourself if you use this method, whereas they are automatically inferred from the model input names when - # using model.prepare_tf_dataset() - # For more info see the docs: - # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset - # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset - - tf_train_dataset = model.prepare_tf_dataset( - train_dataset, - shuffle=True, - batch_size=num_replicas * training_args.per_device_train_batch_size, - collate_fn=data_collator, - ).with_options(options) - - tf_eval_dataset = model.prepare_tf_dataset( - eval_dataset, - # labels are passed as input, as we will use the model's internal loss - shuffle=False, - batch_size=num_replicas * training_args.per_device_eval_batch_size, - collate_fn=data_collator, - drop_remainder=True, - ).with_options(options) - # endregion - - # region Optimizer and loss - num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs) - if training_args.warmup_steps > 0: - num_warmup_steps = training_args.warmup_steps - elif training_args.warmup_ratio > 0: - num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) - else: - num_warmup_steps = 0 - - # Bias and layernorm weights are automatically excluded from the decay - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - adam_beta1=training_args.adam_beta1, - adam_beta2=training_args.adam_beta2, - adam_epsilon=training_args.adam_epsilon, - weight_decay_rate=training_args.weight_decay, - adam_global_clipnorm=training_args.max_grad_norm, - ) - - # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer, jit_compile=training_args.xla, run_eagerly=True) 
- # endregion - - def compute_metrics(preds, labels): - preds = preds["logits"] - return hf_compute_loss(labels, preds) - - # loss function for CLM model - def hf_compute_loss(labels, logits): - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.NONE - ) - - # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway - unmasked_loss = loss_fn(tf.nn.relu(labels), logits) - # make sure only labels that are not equal to -100 affect the loss - loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype) - masked_loss = unmasked_loss * loss_mask - reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) - return tf.reshape(reduced_masked_loss, (1,)) - - def eval_func_mlm(model): - label_ids: np.ndarray = None - - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = {training_args.per_device_eval_batch_size}") - - preds: np.ndarray = None - infer = model.signatures["serving_default"] - - for idx, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - results = infer(**inputs) - - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - test_predictions = {"logits": preds} - loss = compute_metrics(test_predictions, label_ids) - - return loss.numpy()[0] - - # region tuning - if optim_args.tune: - from 
intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_train_dataset, - eval_dataset=tf_eval_dataset, - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - - # use customized eval function - optimization.eval_func = eval_func_mlm - - tune_metric = metrics.Metric( - name="loss", greater_is_better=False, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - # endregion - - # region Training and validation - if training_args.do_train: - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") - logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}") - - # For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints - # to the Hugging Face Hub rather than just pushing the finished model. 
- # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback - - history = model.fit( - tf_train_dataset, - validation_data=tf_eval_dataset, - epochs=int(training_args.num_train_epochs), - callbacks=callbacks, - ) - train_loss = history.history["loss"][-1] - try: - train_perplexity = math.exp(train_loss) - except OverflowError: - train_perplexity = math.inf - logger.info(f" Final train loss: {train_loss:.3f}") - logger.info(f" Final train perplexity: {train_perplexity:.3f}") - - validation_loss = history.history["val_loss"][-1] - try: - validation_perplexity = math.exp(validation_loss) - except OverflowError: - validation_perplexity = math.inf - logger.info(f" Final validation loss: {validation_loss:.3f}") - logger.info(f" Final validation perplexity: {validation_perplexity:.3f}") - - if training_args.output_dir is not None: - output_eval_file = os.path.join(training_args.output_dir, "all_results.json") - results_dict = dict() - results_dict["train_loss"] = train_loss - results_dict["train_perplexity"] = train_perplexity - results_dict["eval_loss"] = validation_loss - results_dict["eval_perplexity"] = validation_perplexity - with open(output_eval_file, "w") as writer: - writer.write(json.dumps(results_dict)) - # endregion - - # region Evaluation - if training_args.do_eval: - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures["serving_default"] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in 
enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - - loss = compute_metrics({"logits": preds}, label_ids) - logger.info("\nEvaluation result: ") - logger.info("Accuracy: {}".format(loss.numpy()[0])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - #endregion - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/README.md b/examples/huggingface/tensorflow/multiple-choice/quantization/README.md deleted file mode 100644 index d204e4f0ae0..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/README.md +++ /dev/null @@ -1,34 +0,0 @@ -Step-by-Step -========= - -This document describes the step-by-step instructions for reproducing the quantization on models for the multiple choice tasks on the SWAG dataset. - -# Prerequisite -## 1. Installation - -Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example: - -```shell -pip install intel-extension-for-transformers -pip install -r requirements.txt -``` - -# Run - -## 1. 
Run Command (Shell) - -- Topology: - - distilbert_swag - -- To get the int8 model - -``` -bash run_tuning.sh --topology=[topology] -``` - -- To benchmark the int8 model - - -``` -bash run_benchmark.sh --topology=[topology] --mode=benchmark --int8=true -``` \ No newline at end of file diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/requirements.txt b/examples/huggingface/tensorflow/multiple-choice/quantization/requirements.txt deleted file mode 100644 index ffa62da04e1..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -datasets >= 1.17 -sentencepiece != 0.1.92 -protobuf -intel-tensorflow -transformers -accelerate \ No newline at end of file diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/run_benchmark.sh b/examples/huggingface/tensorflow/multiple-choice/quantization/run_benchmark.sh deleted file mode 100644 index d43bc97a53f..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/run_benchmark.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - topology="distilbert_swag" - iters=100 - batch_size=16 - tuned_checkpoint=saved_results - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var 
|cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "distilbert_swag" ]; then - script="run_swag.py" - model_name_or_path="Rocketknight1/bert-base-uncased-finetuned-swag" - # add following parameters for quicker debugging - extra_cmd=$extra_cmd" --max_eval_samples 512" - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - python -u ${script} \ - --model_name_or_path ${model_name_or_path} \ - --do_eval \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - ${mode_cmd} \ - ${extra_cmd} -} - -main "$@" diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/run_swag.py b/examples/huggingface/tensorflow/multiple-choice/quantization/run_swag.py deleted file mode 100644 index dff1ae14227..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/run_swag.py +++ /dev/null @@ -1,653 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for multiple choice. -""" -# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments. - -import json -import logging -import os -import sys -from dataclasses import dataclass, field -from itertools import chain -from pathlib import Path -from typing import Optional, Union -import numpy as np - -import datasets -import tensorflow as tf -from datasets import load_dataset - -import time - -import transformers -from transformers import ( - CONFIG_NAME, - TF2_WEIGHTS_NAME, - AutoConfig, - AutoTokenizer, - DefaultDataCollator, - HfArgumentParser, - PushToHubCallback, - TFAutoModelForMultipleChoice, - TFTrainingArguments, - create_optimizer, - set_seed, -) -from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from transformers.utils import PaddingStrategy, check_min_version, send_example_telemetry -from transformers.trainer_utils import is_main_process - -logger = logging.getLogger(__name__) - - -# region Helper classes and functions - - -@dataclass -class DataCollatorForMultipleChoice: - """ - Data collator that will dynamically pad the inputs for multiple choice received. - Args: - tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]): - The tokenizer used for encoding the data. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding index) - among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence - if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. 
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - pad_to_multiple_of (`int`, *optional*): - If set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). - """ - - tokenizer: PreTrainedTokenizerBase - padding: Union[bool, str, PaddingStrategy] = True - max_length: Optional[int] = None - pad_to_multiple_of: Optional[int] = None - - def __call__(self, features): - label_name = "label" if "label" in features[0].keys() else "labels" - labels = [feature.pop(label_name) for feature in features] - batch_size = len(features) - num_choices = len(features[0]["input_ids"]) - flattened_features = [ - [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features - ] - flattened_features = list(chain(*flattened_features)) - - batch = self.tokenizer.pad( - flattened_features, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors="tf", - ) - - # Un-flatten - batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} - # Add back labels - batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) - return batch - - -# endregion - -# region Arguments -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_seq_length: Optional[int] = field( - default=None, - metadata={ - "help": ( - "The maximum total input sequence length after tokenization. 
If passed, sequences longer " - "than this will be truncated, sequences shorter will be padded." - ) - }, - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": ( - "Whether to pad all samples to the maximum sentence length. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " - "efficient on GPU but very bad for TPU." - ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - - def __post_init__(self): - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - - - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. 
Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - is_relative: Optional[bool] = field( - default=True, - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "run benchmark."}) - int8: bool = field( - default=False, - metadata={"help":"Whether to use the quantized int8 model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) - -# endregion - - - -def main(): - # region Argument Parsing - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - output_dir = Path(training_args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - # endregion - - # region Logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Checkpoints - checkpoint = None - if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: - if (output_dir / CONFIG_NAME).is_file() and (output_dir / 
TF2_WEIGHTS_NAME).is_file(): - checkpoint = output_dir - logger.info( - f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" - " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - else: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to continue regardless." - ) - # endregion - - # Set seed before initializing model. - set_seed(training_args.seed) - - # region Load datasets - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.train_file is not None or data_args.validation_file is not None: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.train_file.split(".")[-1] - raw_datasets = load_dataset( - extension, - data_files=data_files, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - # Downloading and loading the swag dataset from the hub. 
- raw_datasets = load_dataset( - "swag", - "regular", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # When using your own dataset or a different dataset from swag, you will probably need to change this. - ending_names = [f"ending{i}" for i in range(4)] - context_name = "sent1" - question_header_name = "sent2" - # endregion - - # region Load model config and tokenizer - if checkpoint is not None: - config_path = training_args.output_dir - elif model_args.config_name: - config_path = model_args.config_name - else: - config_path = model_args.model_name_or_path - - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config = AutoConfig.from_pretrained( - config_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - # endregion - - # region Dataset preprocessing - if data_args.max_seq_length is None: - max_seq_length = tokenizer.model_max_length - if max_seq_length > 1024: - logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." 
- ) - max_seq_length = 1024 - else: - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - def preprocess_function(examples): - first_sentences = [[context] * 4 for context in examples[context_name]] - question_headers = examples[question_header_name] - second_sentences = [ - [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) - ] - - # Flatten out - first_sentences = list(chain(*first_sentences)) - second_sentences = list(chain(*second_sentences)) - - # Tokenize - tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length) - # Un-flatten - data = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} - return data - - - train_dataset = raw_datasets["train"] - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - train_dataset = train_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - ) - - - eval_dataset = raw_datasets["validation"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - eval_dataset = eval_dataset.map( - preprocess_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - ) - - if data_args.pad_to_max_length: - data_collator = DefaultDataCollator(return_tensors="tf") - else: - # custom class 
defined above, as HF has no data collator for multiple choice - data_collator = DataCollatorForMultipleChoice(tokenizer) - # endregion - - with strategy.scope(): - # region Build model - if checkpoint is None: - model_path = model_args.model_name_or_path - else: - model_path = checkpoint - model = TFAutoModelForMultipleChoice.from_pretrained( - model_path, - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - - num_replicas = training_args.strategy.num_replicas_in_sync - total_train_batch_size = training_args.per_device_train_batch_size * num_replicas - total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas - - num_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs) - if training_args.warmup_steps > 0: - num_warmup_steps = training_args.warmup_steps - elif training_args.warmup_ratio > 0: - num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) - else: - num_warmup_steps = 0 - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - adam_beta1=training_args.adam_beta1, - adam_beta2=training_args.adam_beta2, - adam_epsilon=training_args.adam_epsilon, - weight_decay_rate=training_args.weight_decay, - adam_global_clipnorm=training_args.max_grad_norm, - ) - - - dataset_options = tf.data.Options() - dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - - # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in - # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also - # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names - # yourself if you use this method, whereas they are automatically inferred from the model input names when - # using model.prepare_tf_dataset() - # For more info see the docs: - # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset - # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset - - tf_train_dataset = model.prepare_tf_dataset( - train_dataset, - shuffle=True, - batch_size=total_train_batch_size, - collate_fn=data_collator, - ).with_options(dataset_options) - - tf_eval_dataset = model.prepare_tf_dataset( - eval_dataset, - shuffle=False, - batch_size=total_eval_batch_size, - collate_fn=data_collator, - drop_remainder=True, - ).with_options(dataset_options) - - model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla) - # endregion - - def compute_metrics(preds, labels): - predictions = preds["logits"] - preds = np.argmax(predictions, axis=1) - return {"accuracy": (preds == labels).astype(np.float32).mean().item()} - - # region tuning - if optim_args.tune: - from intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_train_dataset, - eval_dataset=tf_eval_dataset, - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - - # use customized eval function - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - 
framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - # endregion - - # region Training - eval_metrics = None - if training_args.do_train: - history = model.fit( - tf_train_dataset, - validation_data=tf_eval_dataset, - epochs=int(training_args.num_train_epochs), - ) - model.save("finetuned_model") - eval_metrics = {key: val[-1] for key, val in history.history.items()} - # endregion - - # region Evaluation - if training_args.do_eval: - num_examples = sum(1 for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = {training_args.per_device_eval_batch_size}") - - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures["serving_default"] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - 
label_ids, - labels[0].numpy() if isinstance(labels, list) else labels.numpy(), - axis=0) - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - - test_predictions = {"logits": preds} - eval_metrics = compute_metrics(test_predictions, label_ids) - logger.info("\nEvaluation result: ") - logger.info("Accuracy: {}".format(eval_metrics["accuracy"])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/huggingface/tensorflow/multiple-choice/quantization/run_tuning.sh b/examples/huggingface/tensorflow/multiple-choice/quantization/run_tuning.sh deleted file mode 100644 index 79e6c5b7e87..00000000000 --- a/examples/huggingface/tensorflow/multiple-choice/quantization/run_tuning.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - topology="distilbert" - tuned_checkpoint="saved_results" - extra_cmd="" - batch_size=8 - MAX_SEQ_LENGTH=128 - model_type="bert" - approach="PostTrainingStatic" - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - if [ "${topology}" = "distilbert_swag" ]; then - script="run_swag.py" - 
model_name_or_path="Rocketknight1/bert-base-uncased-finetuned-swag" - approach="PostTrainingStatic" - # add following parameters for quicker debugging - extra_cmd=$extra_cmd" --max_train_samples 512 --max_eval_samples 1024 --perf_tol 0.035" - fi - - if [ "${worker}" = "" ] - then - python -u ${script} \ - --model_name_or_path ${model_name_or_path} \ - --output_dir ${tuned_checkpoint} \ - --quantization_approach ${approach} \ - --do_train \ - --tune \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - ${extra_cmd} - else - python -u ${script} \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --output_dir ${tuned_checkpoint} \ - --quantization_approach ${approach} \ - --do_train \ - --tune \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/pruning/README.md b/examples/huggingface/tensorflow/text-classification/pruning/README.md deleted file mode 100644 index b636b9de404..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/README.md +++ /dev/null @@ -1,86 +0,0 @@ -Step-by-Step -========= - -This document describes the step-by-step instructions for reproducing the pruning on models for the text classification (GLUE) tasks. - -# Prerequisite -## 1. Installation - -Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example: - -```shell -pip install intel-extension-for-transformers -pip install -r requirements.txt -pip install transformers==4.34.1 -``` ->**Note**: Please use transformers no higher than 4.34.1 - - -# Run - -## 1. Run Command (Shell) - -- Topology: - - distilbert_base_sst2 - -``` -bash run_tuning.sh --topology=[topology] -``` - -``` -bash run_benchmark.sh --topology=[topology] --mode=benchmark --use_pruned_model=true -``` - -## 2. 
Run Command (Python) - -``` -python run_glue.py \ - --model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \ - --task_name sst2 \ - --prune \ - --do_train \ - --do_eval \ - --output_dir ./tmp/sst2_output \ - --overwrite_output_dir -``` - -# Multi-node Usage - -We also supported Distributed Data Parallel training on multi nodes settings for pruning. - -The default strategy we used is `MultiWorkerMirroredStrategy` in Tensorflow, and with `task_type` set as "worker", we are expected to pass following extra parameters to the script: - -* `worker`: a string of your worker ip addresses which is separated by comma and there should not be space between each two of them - -* `task_index`: 0 should be set on the chief node (leader) and 1, 2, 3... should be set as the rank of other follower nodes - -## Multi-node Example - -* On leader node - -``` -bash run_tuning.sh --topology=distilbert_base_sst2 --worker="localhost:12345,localhost:23456" --task_index=0 -``` - -which is equal to - -``` -python run_glue.py \ - --model_name_or_path distilbert-base-uncased-finetuned-sst-2-english \ - --task_name sst2 \ - --prune \ - --do_train \ - --do_eval \ - --output_dir ./tmp/sst2_output \ - --overwrite_output_dir \ - --worker "localhost:12345,localhost:23456" \ - --task_index 0 -``` - -* On follower node - -``` -bash run_tuning.sh --topology=distilbert_base_sst2 --worker="localhost:12345,localhost:23456" --task_index=1 -``` - -Please replace the worker ip address list with your own. 
diff --git a/examples/huggingface/tensorflow/text-classification/pruning/requirements.txt b/examples/huggingface/tensorflow/text-classification/pruning/requirements.txt deleted file mode 100644 index 245a729ec94..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -accelerate -datasets >= 1.17 -sentencepiece != 0.1.92 -protobuf -intel-tensorflow -transformers diff --git a/examples/huggingface/tensorflow/text-classification/pruning/run_benchmark.sh b/examples/huggingface/tensorflow/text-classification/pruning/run_benchmark.sh deleted file mode 100644 index 76c9b07045f..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/run_benchmark.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - iters=100 - batch_size=64 - tuned_checkpoint=saved_results - topology="distilbert_base_sst2" - mode="benchmark" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --use_pruned_model=*) - use_pruned_model=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "distilbert_base_sst2" ]; then - TASK_NAME='sst2' - 
model_name_or_path=distilbert-base-uncased-finetuned-sst-2-english - fi - - if [[ ${use_pruned_model} == "true" ]]; then - extra_cmd=$extra_cmd" --use_pruned_model" - fi - - python -u ./run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_cache \ - ${mode_cmd} \ - ${extra_cmd} - -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/pruning/run_glue.py b/examples/huggingface/tensorflow/text-classification/pruning/run_glue.py deleted file mode 100644 index 67c35ff2471..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/run_glue.py +++ /dev/null @@ -1,689 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" -# You can also adapt this script on your own text classification task. Pointers for this are left as comments. 
- -import logging -import os -import sys -import numpy as np -import tensorflow as tf -import time -import transformers -from dataclasses import dataclass, field -from typing import Optional - -from datasets import load_dataset, load_metric - -from transformers import ( - AutoConfig, - AutoTokenizer, - DataCollatorWithPadding, - DefaultDataCollator, - HfArgumentParser, - PretrainedConfig, - TFAutoModelForSequenceClassification, - TFTrainingArguments, - set_seed, -) -from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version - - -# region Helper functions - - -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# endregion - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.17.0") - -task_to_keys = { - "cola": ("sentence", None), - "mnli": ("premise", "hypothesis"), - "mrpc": ("sentence1", "sentence2"), - "qnli": ("question", "sentence"), - "qqp": ("question1", "question2"), - "rte": ("sentence1", "sentence2"), - "sst2": ("sentence", None), - "stsb": ("sentence1", "sentence2"), - "wnli": ("sentence1", "sentence2"), -} - -logger = logging.getLogger(__name__) - - -# region Command-line arguments -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. 
- """ - - task_name: str = field( - metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, - ) - predict_file: str = field( - metadata={"help": "A file containing user-supplied examples to make predictions for"}, - default=None, - ) - max_seq_length: int = field( - default=128, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - }, - ) - - def __post_init__(self): - self.task_name = self.task_name.lower() - if self.task_name not in task_to_keys.keys(): - raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - prune: bool = field( - default=False, - metadata={"help": "Whether or not to apply prune."}, - ) - pruning_approach: Optional[str] = field( - default="BasicMagnitude", - metadata={"help": "Pruning approach. 
Supported approach is basic_magnite."}, - ) - target_sparsity_ratio: Optional[float] = field( - default=None, - metadata={"help": "Targeted sparsity when pruning the model."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - tolerance_mode: Optional[str] = field( - default="relative", - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "Run benchmark."}) - use_pruned_model: bool = field( - default=False, - metadata={"help":"Whether to use pretrained pruned model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) -# endregion - -def main(): - # region Argument parsing - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - if not (training_args.do_train or training_args.do_eval or training_args.do_predict): - exit("Must specify at least one of --do_train, --do_eval or --do_predict!") - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Checkpoints - checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - checkpoint = get_last_checkpoint(training_args.output_dir) - if checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
- ) - # endregion - - # region Logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # region Dataset and labels - # Set seed before initializing model. - set_seed(training_args.seed) - - # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee - # that only one local process can concurrently download the dataset. - datasets = load_dataset( - "glue", - data_args.task_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - # See more about loading any type of standard or custom dataset at - # https://huggingface.co/docs/datasets/loading_datasets.html. 
- - is_regression = data_args.task_name == "stsb" - if not is_regression: - label_list = datasets["train"].features["label"].names - num_labels = len(label_list) - else: - num_labels = 1 - - if data_args.predict_file is not None: - logger.info("Preparing user-supplied file for predictions...") - - data_files = {"data": data_args.predict_file} - - for key in data_files.keys(): - logger.info(f"Loading a local file for {key}: {data_files[key]}") - - if data_args.predict_file.endswith(".csv"): - # Loading a dataset from local csv files - user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) - else: - # Loading a dataset from local json files - user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) - needed_keys = task_to_keys[data_args.task_name] - for key in needed_keys: - assert key in user_dataset["data"].features, f"Your supplied predict_file is missing the {key} key!" - datasets["user_data"] = user_dataset["data"] - # endregion - - # region Load model config and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - num_labels=num_labels, - finetuning_task=data_args.task_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - # endregion - - # region Dataset preprocessing - sentence1_key, sentence2_key = task_to_keys[data_args.task_name] - non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] - - # Padding strategy - if data_args.pad_to_max_length: - padding = "max_length" - else: - # We will pad later, dynamically at batch creation, to the max sequence length in each batch - padding = False - - # Some models have set the order of the labels to use, so let's make sure we do use it. - label_to_id = None - if config.label2id != PretrainedConfig(num_labels=num_labels).label2id and not is_regression: - # Some have all caps in their config, some don't. - label_name_to_id = {k.lower(): v for k, v in config.label2id.items()} - if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): - label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} - else: - logger.warning( - "Your model seems to have been trained with labels, but they don't match the dataset: ", - f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
- "\nIgnoring the model labels as a result.", - ) - label_to_id = {label: i for i, label in enumerate(label_list)} - if label_to_id is not None: - config.label2id = label_to_id - config.id2label = {id: label for label, id in config.label2id.items()} - elif data_args.task_name is not None and not is_regression: - config.label2id = {l: i for i, l in enumerate(label_list)} - config.id2label = {id: label for label, id in config.label2id.items()} - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - def preprocess_function(examples): - # Tokenize the texts - args = ( - (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) - ) - result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) - - return result - - datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) - - if data_args.pad_to_max_length: - data_collator = DefaultDataCollator(return_tensors="tf") - else: - data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") - # endregion - - # region Metric function - metric = load_metric("glue", data_args.task_name) - - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - # endregion - - if distributed_args.worker is None: - strategy = training_args.strategy - - with strategy.scope(): - # region Load pretrained model - if checkpoint is None: - model_path = 
model_args.model_name_or_path - else: - model_path = checkpoint - model = TFAutoModelForSequenceClassification.from_pretrained( - model_path, - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - # endregion - - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - if is_regression: - loss_fn = tf.keras.losses.MeanSquaredError() - metrics = [] - else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.SUM - ) - metrics = ["accuracy"] - model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - # endregion - - # region Convert data to a tf.data.Dataset - tf_data = dict() - max_samples = { - "train": data_args.max_train_samples, - "validation": data_args.max_eval_samples, - "validation_matched": data_args.max_eval_samples, - "validation_mismatched": data_args.max_eval_samples, - "test": data_args.max_predict_samples, - "test_matched": data_args.max_predict_samples, - "test_mismatched": data_args.max_predict_samples, - "user_data": None, - } - - for key in datasets.keys(): - if key == "train" or key.startswith("validation"): - assert "label" in datasets[key].features, f"Missing labels from {key} data!" 
- if key == "train": - shuffle = True - batch_size = training_args.per_device_train_batch_size * (len(worker_list) if worker_list is not None else 1) - drop_remainder = True # Saves us worrying about scaling gradients for the last batch - else: - shuffle = False - batch_size = training_args.per_device_eval_batch_size * (len(worker_list) if worker_list is not None else 1) - drop_remainder = False - samples_limit = max_samples[key] - dataset = datasets[key] - if samples_limit is not None: - dataset = dataset.select(range(samples_limit)) - data = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], - shuffle=shuffle, - batch_size=batch_size, - collate_fn=data_collator, - drop_remainder=drop_remainder, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] if "label" in dataset.column_names else None, - ) - tf_data[key] = data - # endregion - - # region Pruning - if optim_args.prune: - from intel_extension_for_transformers.transformers import metrics, PrunerConfig, PruningConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_data["train"], - eval_dataset=tf_data["validation"], - compute_metrics=compute_metrics, - criterion=loss_fn, - optimizer=optimizer, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=0.01, - ) - prune_type = 'BasicMagnitude' \ - if optim_args.pruning_approach else optim_args.pruning_approach - target_sparsity_ratio = None \ - if optim_args.target_sparsity_ratio is None else optim_args.target_sparsity_ratio - pruner_config = 
PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig( - epochs=int(training_args.num_train_epochs), pruner_config=pruner_config, metrics=tune_metric, - framework="tensorflow" - ) - p_model = optimization.prune(pruning_config=pruning_conf) - return - # endregion - - # region Training and validation - if training_args.do_train: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] - if training_args.do_eval and not data_args.task_name == "mnli": - # Do both evaluation and training in the Keras fit loop, unless the task is MNLI - # because MNLI has two validation sets - validation_data = tf_data["validation"] - else: - validation_data = None - model.fit( - tf_data["train"], - validation_data=validation_data, - epochs=int(training_args.num_train_epochs), - callbacks=callbacks, - ) - # endregion - - # region Evaluation - if training_args.do_eval: - # We normally do validation as part of the Keras fit loop, but we run it independently - # if there was no fit() step (because we didn't train the model) or if the task is MNLI, - # because MNLI has a separate validation-mismatched validation set - logger.info("*** Evaluate ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - if data_args.task_name == "mnli": - tasks = ["mnli", "mnli-mm"] - tf_datasets = [tf_data["validation_matched"], tf_data["validation_mismatched"]] - raw_datasets = [datasets["validation_matched"], datasets["validation_mismatched"]] - else: - tasks = [data_args.task_name] - tf_datasets = [tf_data["validation"]] - raw_datasets = [datasets["validation"]] - - total_time = 0 - num_examples = 0 - if optim_args.use_pruned_model: - model = tf.saved_model.load(training_args.output_dir) - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, - tasks): - num_examples += sum( - 1 for _ in (tf_dataset.unbatch() - if hasattr(tf_dataset, "unbatch") else tf_dataset - ) - ) - if optim_args.use_pruned_model: 
- preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - start = time.time() - results = infer(**inputs) - total_time += time.time() - start - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - if label_ids is None: - label_ids = labels.numpy() - else: - label_ids = np.append(label_ids, labels.numpy(), axis=0) - eval_metrics = compute_metrics({"logits": preds}, label_ids) - else: - start = time.time() - eval_predictions = model.predict(tf_dataset) - total_time += time.time() - start - eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"]) - print(f"Evaluation metrics ({task}):") - print(eval_metrics) - - logger.info("metric ({}) Accuracy: {}".format(task, eval_metrics["accuracy"])) - logger.info( - "Throughput: {} samples/sec".format( - num_examples / total_time) - ) - # endregion - - # region Prediction - if training_args.do_predict or data_args.predict_file: - logger.info("*** Predict ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - tasks = [] - tf_datasets = [] - raw_datasets = [] - if training_args.do_predict: - if data_args.task_name == "mnli": - tasks.extend(["mnli", "mnli-mm"]) - tf_datasets.extend([tf_data["test_matched"], tf_data["test_mismatched"]]) - raw_datasets.extend([datasets["test_matched"], datasets["test_mismatched"]]) - else: - tasks.append(data_args.task_name) - tf_datasets.append(tf_data["test"]) - raw_datasets.append(datasets["test"]) - if data_args.predict_file: - tasks.append("user_data") - tf_datasets.append(tf_data["user_data"]) - raw_datasets.append(datasets["user_data"]) - - if optim_args.use_pruned_model: - model = tf.saved_model.load(training_args.output_dir) - - for raw_dataset, tf_dataset, 
task in zip(raw_datasets, tf_datasets, tasks): - if optim_args.use_pruned_model: - preds: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - results = infer(**inputs) - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - test_predictions = {"logits": preds} - else: - test_predictions = model.predict(tf_dataset) - if "label" in raw_dataset: - test_metrics = compute_metrics(test_predictions, raw_dataset["label"]) - print(f"Test metrics ({task}):") - print(test_metrics) - - if is_regression: - predictions_to_write = np.squeeze(test_predictions["logits"]) - else: - predictions_to_write = np.argmax(test_predictions["logits"], axis=1) - - output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") - with open(output_predict_file, "w") as writer: - logger.info(f"***** Writing prediction results for {task} *****") - writer.write("index\tprediction\n") - for index, item in enumerate(predictions_to_write): - if is_regression: - writer.write(f"{index}\t{item:3.3f}\n") - else: - item = config.id2label[item] - writer.write(f"{index}\t{item}\n") - # endregion - - -if __name__ == "__main__": - main() diff --git a/examples/huggingface/tensorflow/text-classification/pruning/run_tuning.sh b/examples/huggingface/tensorflow/text-classification/pruning/run_tuning.sh deleted file mode 100644 index 3fca9c69a4f..00000000000 --- a/examples/huggingface/tensorflow/text-classification/pruning/run_tuning.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - tuned_checkpoint=saved_results - topology="distilbert_base_sst2" - # topology="bert_base_mrpc_static" - for var in "$@" - do - case $var in - 
--topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - extra_cmd='' - batch_size=64 - if [ "${topology}" = "distilbert_base_sst2" ]; then - TASK_NAME='sst2' - model_name_or_path=distilbert-base-uncased-finetuned-sst-2-english - fi - - if [ "${worker}" = "" ] - then - python -u ./run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --target_sparsity_ratio 0.1 \ - --prune \ - --do_eval \ - --do_train \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --overwrite_cache - else - python -u ./run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --target_sparsity_ratio 0.1 \ - --prune \ - --do_eval \ - --do_train \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --overwrite_cache \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/quantization/README.md b/examples/huggingface/tensorflow/text-classification/quantization/README.md deleted file mode 100644 index ab459bf84d4..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/README.md +++ /dev/null @@ -1,132 +0,0 @@ -Step-by-Step -========= - -This document describes the step-by-step instructions for reproducing the quantization on models for the text 
classification (GLUE) tasks. - -GLUE is made up of a total of 9 different tasks. Here is how to run the script on one of them: - -# Prerequisite -## 1. Installation - -Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example: - -```shell -pip install intel-extension-for-transformers -cd ptq -pip install -r requirements.txt -``` - -# Run - -Here are two options: running with the shell script or running with the python script. Basically, they are equivalent and the shell script just wraps the invocation of the python script and is more concise and easy for users to get started. - -## 1. Run Command (Shell) - -- Topology: - - bert_base_mrpc_static - - xlnet_mrpc - - albert_large_mrpc - - legalbert_mrpc - -- To get the int8 model - - ``` - cd ptq - bash run_tuning.sh --topology=[topology] --output_model=./saved_int8 - ``` - -- To benchmark the int8 model - - ``` - cd ptq - bash run_benchmark.sh --topology=[topology] --config=./saved_int8 --mode=benchmark --int8=true - ``` - -## 2. Run Command (Python) - -- model_name_or_path: - - bert-base-cased-finetuned-mrpc - - xlnet-base-cased - - albert-large-v2 - - nlpaueb/legal-bert-small-uncased - -- To get int8 model - -``` -python run_glue.py - --model_name_or_path [model_name_or_path] \ - --task_name mrpc \ - --tune \ - --quantization_approach PostTrainingStatic \ - --do_train \ - --do_eval \ - --output_dir ./saved_result \ - --overwrite_output_dir -``` - - To reload int8 model - -``` -python run_glue.py - --model_name_or_path [model_name_or_path] \ - --task_name mrpc \ - --benchmark \ - --int8 \ - --do_eval \ - --output_dir ./saved_result \ - --overwrite_output_dir -``` - -> **Notes**: - - quantization_approach in Tensorflow consist of `PostTrainingStatic`, `QuantizationAwareTraining`. - - task_name consist of cola, sst2, mrpc, stsb, qqp, mnli, qnli, rte, wnli. 
- - -# Multi-node Usage - -We also supported Distributed Data Parallel training on multi nodes settings for quantization. - -> **Note**: multi node settings boost performance in the training process and may not show good performance with PostTrainingStatic quantization strategy - -The default strategy we used is `MultiWorkerMirroredStrategy` in Tensorflow, and with `task_type` set as "worker", we are expected to pass following extra parameters to the script: - -* `worker`: a string of your worker ip addresses which is separated by comma and there should not be space between each two of them - -* `task_index`: 0 should be set on the chief node (leader) and 1, 2, 3... should be set as the rank of other follower nodes - -## Multi-node Example - -### 1. Get Int8 Model - -* On leader node - -``` -bash run_tuning.sh --topology=bert_base_mrpc_static --output_model=./saved_int8 --worker="localhost:12345,localhost:23456" --task_index=0 -``` - -* On follower node - -``` -bash run_tuning.sh --topology=bert_base_mrpc_static --output_model=./saved_int8 --worker="localhost:12345,localhost:23456" --task_index=1 -``` - -Please replace the worker ip address list with your own. - -### 2. Reload Int8 Model - -* On leader node - -``` -bash run_benchmark.sh --topology=bert_base_mrpc_static --config=./saved_int8 --mode=benchmark --int8=true --worker="localhost:12345,localhost:23456" --task_index=0 -``` - -* On follower node - -``` -bash run_benchmark.sh --topology=bert_base_mrpc_static --config=./saved_int8 --mode=benchmark --int8=true --worker="localhost:12345,localhost:23456" --task_index=1 -``` - -Please replace the worker ip address list with your own. 
- - - - diff --git a/examples/huggingface/tensorflow/text-classification/quantization/ptq/requirements.txt b/examples/huggingface/tensorflow/text-classification/quantization/ptq/requirements.txt deleted file mode 100644 index 8067cf9633a..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/ptq/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -datasets >= 1.17 -sentencepiece != 0.1.92 -protobuf -intel-tensorflow -transformers -evaluate -accelerate \ No newline at end of file diff --git a/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_benchmark.sh b/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_benchmark.sh deleted file mode 100644 index 403a0e41b52..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_benchmark.sh +++ /dev/null @@ -1,131 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - topology="bert_base_mrpc_static" - iters=100 - batch_size=1 - tuned_checkpoint=saved_results - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - - if [[ ${mode} == "accuracy" 
]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "bert_base_mrpc_static" ]; then - TASK_NAME="mrpc" - model_name_or_path="bert-base-cased-finetuned-mrpc" - elif [ "${topology}" = "legalbert_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="nlpaueb/legal-bert-small-uncased" - elif [ "${topology}" = "xlnet_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="xlnet-base-cased" - elif [ "${topology}" = "albert_large_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="albert-large-v2" - # add following parameters for quicker debugging - extra_cmd=$extra_cmd" --max_eval_samples 48" - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - if [ "${worker}" = "" ] - then - python -u ../run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --max_seq_length ${MAX_SEQ_LENGTH} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --no_cuda \ - ${mode_cmd} \ - ${extra_cmd} - else - python -u ../run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --max_seq_length ${MAX_SEQ_LENGTH} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --no_cuda \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${mode_cmd} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_tuning.sh b/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_tuning.sh deleted file mode 100644 index c84c8654f62..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/ptq/run_tuning.sh +++ /dev/null @@ -1,115 +0,0 @@ 
-#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - topology="bert_base_mrpc_static" - tuned_checkpoint="saved_results" - extra_cmd="" - batch_size=8 - MAX_SEQ_LENGTH=128 - model_type="bert" - approach="PostTrainingStatic" - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - batch_size=64 - if [ "${topology}" = "bert_base_mrpc_static" ]; then - TASK_NAME="mrpc" - model_name_or_path="bert-base-cased-finetuned-mrpc" - approach="PostTrainingStatic" - elif [ "${topology}" = "legalbert_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="nlpaueb/legal-bert-small-uncased" - approach="PostTrainingStatic" - extra_cmd=$extra_cmd" --perf_tol 0.1" - elif [ "${topology}" = "xlnet_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="xlnet-base-cased" - approach="PostTrainingStatic" - elif [ "${topology}" = "albert_large_mrpc" ]; then - TASK_NAME="mrpc" - model_name_or_path="albert-large-v2" - approach="PostTrainingStatic" - extra_cmd=$extra_cmd" --perf_tol 0.05" - fi - - if [ "${worker}" = "" ] - then - python -u ../run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --max_seq_length ${MAX_SEQ_LENGTH} \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --no_cuda \ - --overwrite_output_dir \ - --cache_dir 
${cache_dir} \ - --quantization_approach ${approach} \ - --do_train \ - --tune \ - ${extra_cmd} - else - python -u ../run_glue.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --do_eval \ - --max_seq_length ${MAX_SEQ_LENGTH} \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --no_cuda \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --quantization_approach ${approach} \ - --do_train \ - --tune \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/text-classification/quantization/run_glue.py b/examples/huggingface/tensorflow/text-classification/quantization/run_glue.py deleted file mode 100644 index c2ca0c45603..00000000000 --- a/examples/huggingface/tensorflow/text-classification/quantization/run_glue.py +++ /dev/null @@ -1,731 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" -# You can also adapt this script on your own text classification task. Pointers for this are left as comments. 
- -import logging -import os -import sys -import time -from dataclasses import dataclass, field -from typing import Optional - -import numpy as np -import tensorflow as tf -from datasets import load_dataset - -import transformers -from transformers import ( - AutoConfig, - AutoTokenizer, - DataCollatorWithPadding, - DefaultDataCollator, - HfArgumentParser, - PretrainedConfig, - TFAutoModelForSequenceClassification, - TFTrainingArguments, - set_seed, -) -from transformers.trainer_utils import get_last_checkpoint, is_main_process -from transformers.utils import check_min_version - - -# region Helper functions - - -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# endregion - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.17.0") - -task_to_keys = { - "cola": ("sentence", None), - "mnli": ("premise", "hypothesis"), - "mrpc": ("sentence1", "sentence2"), - "qnli": ("question", "sentence"), - "qqp": ("question1", "question2"), - "rte": ("sentence1", "sentence2"), - "sst2": ("sentence", None), - "stsb": ("sentence1", "sentence2"), - "wnli": ("sentence1", "sentence2"), -} - -logger = logging.getLogger(__name__) - - -# region Command-line arguments -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - - Using `HfArgumentParser` we can turn this class - into argparse arguments to be able to specify them on - the command line. 
- """ - - task_name: str = field( - metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, - ) - predict_file: str = field( - metadata={"help": "A file containing user-supplied examples to make predictions for"}, - default=None, - ) - max_seq_length: int = field( - default=128, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} - ) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - }, - ) - - def __post_init__(self): - self.task_name = self.task_name.lower() - if self.task_name not in task_to_keys.keys(): - raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." - }, - ) - - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. - """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. 
Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - is_relative: Optional[bool] = field( - default=True, - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "run benchmark."}) - int8: bool = field( - default=False, - metadata={"help":"Whether to use the quantized int8 model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) -# endregion - - -def main(): - # region Argument parsing - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args, optim_args, distributed_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - - if not (training_args.do_train or training_args.do_eval or training_args.do_predict): - exit("Must specify at least one of --do_train, --do_eval or --do_predict!") - # endregion - - # region Logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Checkpoints - checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not 
training_args.overwrite_output_dir: - checkpoint = get_last_checkpoint(training_args.output_dir) - if checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - # endregion - - # region Dataset and labels - # Set seed before initializing model. - set_seed(training_args.seed) - - # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee - # that only one local process can concurrently download the dataset. - datasets = load_dataset( - "glue", - data_args.task_name, - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - ) - # See more about loading any type of standard or custom dataset at - # https://huggingface.co/docs/datasets/loading_datasets.html. 
- - is_regression = data_args.task_name == "stsb" - if not is_regression: - label_list = datasets["train"].features["label"].names - num_labels = len(label_list) - else: - num_labels = 1 - - if data_args.predict_file is not None: - logger.info("Preparing user-supplied file for predictions...") - - data_files = {"data": data_args.predict_file} - - for key in data_files.keys(): - logger.info(f"Loading a local file for {key}: {data_files[key]}") - - if data_args.predict_file.endswith(".csv"): - # Loading a dataset from local csv files - user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) - else: - # Loading a dataset from local json files - user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) - needed_keys = task_to_keys[data_args.task_name] - for key in needed_keys: - assert key in user_dataset["data"].features, f"Your supplied predict_file is missing the {key} key!" - datasets["user_data"] = user_dataset["data"] - # endregion - - # region Load model config and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - num_labels=num_labels, - finetuning_task=data_args.task_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - # endregion - - # region Dataset preprocessing - sentence1_key, sentence2_key = task_to_keys[data_args.task_name] - non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] - - # Padding strategy - if data_args.pad_to_max_length: - padding = "max_length" - else: - # We will pad later, dynamically at batch creation, to the max sequence length in each batch - padding = False - - # Some models have set the order of the labels to use, so let's make sure we do use it. - label_to_id = None - if config.label2id != PretrainedConfig(num_labels=num_labels).label2id and not is_regression: - # Some have all caps in their config, some don't. - label_name_to_id = {k.lower(): v for k, v in config.label2id.items()} - if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): - label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} - else: - logger.warning( - "Your model seems to have been trained with labels, but they don't match the dataset: ", - f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
- "\nIgnoring the model labels as a result.", - ) - label_to_id = {label: i for i, label in enumerate(label_list)} - if label_to_id is not None: - config.label2id = label_to_id - config.id2label = {id: label for label, id in config.label2id.items()} - elif data_args.task_name is not None and not is_regression: - config.label2id = {l: i for i, l in enumerate(label_list)} - config.id2label = {id: label for label, id in config.label2id.items()} - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - def preprocess_function(examples): - # Tokenize the texts - args = ( - (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) - ) - result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) - - return result - - datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) - - if data_args.pad_to_max_length: - data_collator = DefaultDataCollator(return_tensors="tf") - else: - data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") - # endregion - - # region Metric function - from evaluate import load - metric = load("glue", data_args.task_name, cache_dir=model_args.cache_dir) - - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - - # endregion - - def eval_func_mrpc(model): - label_ids: np.ndarray = None - tf_eval_dataset = tf_data["validation"] - - num_examples = sum(1 
for _ in ( - tf_eval_dataset.unbatch() if hasattr(tf_eval_dataset, "unbatch") else tf_eval_dataset)) - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = {training_args.per_device_eval_batch_size}") - - preds: np.ndarray = None - infer = model.signatures["serving_default"] - - for idx, (inputs, labels) in enumerate(tf_eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - results = infer(**inputs) - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - test_predictions = {"logits": preds} - metrics = compute_metrics(test_predictions, label_ids) - - return metrics["accuracy"] - - with strategy.scope(): - # region Load pretrained model - if checkpoint is None: - model_path = model_args.model_name_or_path - else: - model_path = checkpoint - model = TFAutoModelForSequenceClassification.from_pretrained( - model_path, - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - # endregion - - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - if is_regression: - loss_fn = tf.keras.losses.MeanSquaredError() - metrics = [] - else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.SUM - ) - metrics = ["accuracy"] - 
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - # endregion - - # region Convert data to a tf.data.Dataset - tf_data = dict() - max_samples = { - "train": data_args.max_train_samples, - "validation": data_args.max_eval_samples, - "validation_matched": data_args.max_eval_samples, - "validation_mismatched": data_args.max_eval_samples, - "test": data_args.max_predict_samples, - "test_matched": data_args.max_predict_samples, - "test_mismatched": data_args.max_predict_samples, - "user_data": None, - } - for key in datasets.keys(): - if key == "train" or key.startswith("validation"): - assert "label" in datasets[key].features, f"Missing labels from {key} data!" - if key == "train": - shuffle = True - batch_size = training_args.per_device_train_batch_size * (len(worker_list) if worker_list is not None else 1) - drop_remainder = True # Saves us worrying about scaling gradients for the last batch - else: - shuffle = False - batch_size = training_args.per_device_eval_batch_size * (len(worker_list) if worker_list is not None else 1) - drop_remainder = False - samples_limit = max_samples[key] - dataset = datasets[key] - if samples_limit is not None: - dataset = dataset.select(range(samples_limit)) - data = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], - shuffle=shuffle, - batch_size=batch_size, - collate_fn=data_collator, - drop_remainder=drop_remainder, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] if "label" in dataset.column_names else None, - ) - tf_data[key] = data - # endregion - - if optim_args.tune: - from intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_data["train"], - eval_dataset=tf_data["validation"], - 
compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - - # use customized eval function - optimization.eval_func = eval_func_mrpc - - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - - # region Training and validation - if training_args.do_train: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] - if training_args.do_eval and not data_args.task_name == "mnli": - # Do both evaluation and training in the Keras fit loop, unless the task is MNLI - # because MNLI has two validation sets - validation_data = tf_data["validation"] - else: - validation_data = None - model.fit( - tf_data["train"], - validation_data=validation_data, - epochs=2, - callbacks=callbacks, - ) - # endregion - - # region Evaluation - if training_args.do_eval: - # We normally do validation as part of the Keras fit loop, but we run it independently - # if there was no fit() step (because we didn't train the model) or if the task is MNLI, - # because MNLI has a separate validation-mismatched validation set - logger.info("*** Evaluate ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - if data_args.task_name == "mnli": - tasks = ["mnli", "mnli-mm"] - tf_datasets = [tf_data["validation_matched"], tf_data["validation_mismatched"]] - raw_datasets = [datasets["validation_matched"], datasets["validation_mismatched"]] - else: - tasks = [data_args.task_name] - tf_datasets = [tf_data["validation"]] - 
raw_datasets = [datasets["validation"]] - - num_examples = 0 - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): - num_examples += sum( - 1 for _ in (tf_dataset.unbatch() - if hasattr(tf_dataset, "unbatch") else tf_dataset - ) - ) - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - if preds is None: - preds = results["Identity"].numpy() - else: - preds = np.append(preds, results["Identity"].numpy(), axis=0) - if label_ids is None: - label_ids = labels.numpy() - else: - label_ids = np.append(label_ids, labels.numpy(), axis=0) - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} sec".format(idx, iteration_time)) - eval_metrics = compute_metrics({"logits": preds}, label_ids) - logger.info("\nEvaluation result: ") - logger.info("metric ({}) Accuracy: {}".format(task, eval_metrics["accuracy"])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - - # endregion - - # region Prediction - if training_args.do_predict or data_args.predict_file: - logger.info("*** Predict ***") - - # Loop to handle MNLI double evaluation (matched, mis-matched) - tasks = [] - 
tf_datasets = [] - raw_datasets = [] - if training_args.do_predict: - if data_args.task_name == "mnli": - tasks.extend(["mnli", "mnli-mm"]) - tf_datasets.extend([tf_data["test_matched"], tf_data["test_mismatched"]]) - raw_datasets.extend([datasets["test_matched"], datasets["test_mismatched"]]) - else: - tasks.append(data_args.task_name) - tf_datasets.append(tf_data["test"]) - raw_datasets.append(datasets["test"]) - if data_args.predict_file: - tasks.append("user_data") - tf_datasets.append(tf_data["user_data"]) - raw_datasets.append(datasets["user_data"]) - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): - if optim_args.int8: - preds: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - results = infer(**inputs) - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - test_predictions = {"logits": preds} - else: - test_predictions = model.predict(tf_dataset) - if "label" in raw_dataset: - test_metrics = compute_metrics(test_predictions, - raw_dataset["label"]) - print(f"Test metrics ({task}):") - print(test_metrics) - - if is_regression: - predictions_to_write = np.squeeze(test_predictions["logits"]) - else: - predictions_to_write = np.argmax(test_predictions["logits"], axis=1) - - output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") - with open(output_predict_file, "w") as writer: - logger.info(f"***** Writing prediction results for {task} *****") - writer.write("index\tprediction\n") - for index, item in enumerate(predictions_to_write): - if is_regression: - writer.write(f"{index}\t{item:3.3f}\n") - else: - item = config.id2label[item] - 
writer.write(f"{index}\t{item}\n") - # endregion - - -if __name__ == "__main__": - main() diff --git a/examples/huggingface/tensorflow/token-classification/quantization/README.md b/examples/huggingface/tensorflow/token-classification/quantization/README.md deleted file mode 100644 index 8b05a9c1974..00000000000 --- a/examples/huggingface/tensorflow/token-classification/quantization/README.md +++ /dev/null @@ -1,35 +0,0 @@ -Step-by-Step -========= - -This document describes the step-by-step instructions for reproducing the quantization on models for the token classification (NER) tasks. - -# Prerequisite -## 1. Installation - -Make sure you have installed Intel® Extension for Transformers and all the dependencies in the current example: - -```shell -pip install intel-extension-for-transformers -pip install -r requirements.txt -``` - -# Run - -## 1. Run Command (Shell) - -- Topology: - - bert_base_ner - -- To get the int8 model - - ``` - cd ptq - bash run_tuning.sh --topology=[topology] --output_model=./saved_int8 - ``` - -- To benchmark the int8 model - - ``` - cd ptq - bash run_benchmark.sh --topology=[topology] --config=./saved_int8 --mode=benchmark --int8=true - ``` \ No newline at end of file diff --git a/examples/huggingface/tensorflow/token-classification/quantization/requirements.txt b/examples/huggingface/tensorflow/token-classification/quantization/requirements.txt deleted file mode 100644 index 6e419404871..00000000000 --- a/examples/huggingface/tensorflow/token-classification/quantization/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -datasets >= 1.17 -sentencepiece != 0.1.92 -seqeval -protobuf -intel-tensorflow -transformers -accelerate \ No newline at end of file diff --git a/examples/huggingface/tensorflow/token-classification/quantization/run_benchmark.sh b/examples/huggingface/tensorflow/token-classification/quantization/run_benchmark.sh deleted file mode 100644 index ddf9d917410..00000000000 --- 
a/examples/huggingface/tensorflow/token-classification/quantization/run_benchmark.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - topology="bert_base_ner" - iters=100 - batch_size=16 - tuned_checkpoint=saved_results - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - MAX_SEQ_LENGTH=128 - batch_size=1 - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy_only" - elif [[ ${mode} == "benchmark" ]]; then - mode_cmd=" --benchmark " - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [ "${topology}" = "bert_base_ner" ]; then - TASK_NAME="ner" - model_name_or_path="dslim/bert-base-NER" - approach="PostTrainingStatic" - dataset_name=conll2003 - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - if [ "${worker}" = "" ] - then - python -u run_ner.py \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --task_name ${TASK_NAME} \ - --pad_to_max_length \ - --do_eval \ - --max_length ${MAX_SEQ_LENGTH} \ - --per_device_eval_batch_size ${batch_size} \ - --max_eval_samples 408 \ 
- --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --no_cuda \ - ${mode_cmd} \ - ${extra_cmd} - else - python -u ../run_ner.py \ - --model_name_or_path ${model_name_or_path} \ - --task_name ${TASK_NAME} \ - --dataset_name ${dataset_name} \ - --pad_to_max_length \ - --do_eval \ - --max_length ${MAX_SEQ_LENGTH} \ - --per_device_eval_batch_size ${batch_size} \ - --max_eval_samples 408 \ - --output_dir ${tuned_checkpoint} \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --no_cuda \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${mode_cmd} \ - ${extra_cmd} - fi -} - -main "$@" diff --git a/examples/huggingface/tensorflow/token-classification/quantization/run_ner.py b/examples/huggingface/tensorflow/token-classification/quantization/run_ner.py deleted file mode 100644 index 30b9855c97f..00000000000 --- a/examples/huggingface/tensorflow/token-classification/quantization/run_ner.py +++ /dev/null @@ -1,696 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, CHUNKS) relying on the accelerate library -without using a Trainer. 
-""" - -import logging -import sys -import random -import time -from dataclasses import dataclass, field -from typing import Optional - -from datasets import ClassLabel, load_dataset, load_metric -import numpy as np -import tensorflow as tf - -import transformers -from transformers import ( - AutoConfig, - AutoTokenizer, - DataCollatorForTokenClassification, - HfArgumentParser, - TFAutoModelForTokenClassification, - TFTrainingArguments, - set_seed, -) -from transformers.utils.versions import require_version -from transformers.trainer_utils import get_last_checkpoint, is_main_process - -logger = logging.getLogger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt") - -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# region Command-line arguments -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." - ) - }, - ) - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. 
- """ - - task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."}) - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field( - default=None, metadata={"help": "The input training data file (a csv or JSON file)."} - ) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."}, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."}, - ) - text_column_name: Optional[str] = field( - default=None, metadata={"help": "The column name of text to input in the file (a csv or JSON file)."} - ) - label_column_name: Optional[str] = field( - default=None, metadata={"help": "The column name of label to input in the file (a csv or JSON file)."} - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_length: Optional[int] = field(default=256, metadata={"help": "Max length (in tokens) for truncation/padding"}) - pad_to_max_length: bool = field( - default=False, - metadata={ - "help": ( - "Whether to pad all samples to model maximum sentence length. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " - "efficient on GPU but very bad for TPU." 
- ) - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - ) - }, - ) - label_all_tokens: bool = field( - default=False, - metadata={ - "help": ( - "Whether to put the label for one word on all tokens of generated by that word or just on the " - "one (in which case the other tokens will have a padding index)." - ) - }, - ) - return_entity_level_metrics: bool = field( - default=False, - metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."}, - ) - - def __post_init__(self): - if self.dataset_name is None and self.train_file is None and self.validation_file is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - self.task_name = self.task_name.lower() - -@dataclass -class OptimizationArguments: - """ - Arguments pertaining to what type of optimization we are going to apply on the model. 
- """ - - tune: bool = field( - default=False, - metadata={"help": "Whether or not to apply quantization."}, - ) - quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, - ) - metric_name: Optional[str] = field( - default=None, - metadata={"help": "Metric used for the tuning strategy."}, - ) - is_relative: Optional[bool] = field( - default=True, - metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, - ) - perf_tol: Optional[float] = field( - default=0.01, - metadata={"help": "Performance tolerance when optimizing the model."}, - ) - benchmark: bool = field( - default=False, - metadata={"help": "run benchmark."}) - int8: bool = field( - default=False, - metadata={"help":"Whether to use the quantized int8 model."}) - accuracy_only: bool = field( - default=False, - metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) - -@dataclass -class DistributedArguments: - """ - Arguments setting the distributed multinode environment - """ - - worker: str = field( - default=None, - metadata={"help": "List of node ip addresses in a string, and there should not be space between addresses."}, - ) - task_index: int = field( - default=0, - metadata={"help": "Worker index, and 0 represents the chief worker while other workers are set as 1,2,3..."}, - ) - -# endregion - - -def main(): - # region Argument Parsing - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments, OptimizationArguments, DistributedArguments)) - model_args, data_args, training_args, optim_args, distributed_args = parser.parse_args_into_dataclasses() - # endregion - - # region Setup logging - # we only want one process per machine to log things on the screen. - # accelerator.is_local_main_process is only True for one process per machine. 
- # region Logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - # endregion - - # If passed along, set the training seed now. - if training_args.seed is not None: - set_seed(training_args.seed) - # endregion - - # region Set the multinode environment, the strategy and paths - strategy = None - worker_list = None - if distributed_args.worker is not None: - logger.info("distributed environment initialization...") - - worker_list = distributed_args.worker.split(",") - - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(worker_list, "worker", distributed_args.task_index) - - strategy = tf.distribute.MultiWorkerMirroredStrategy() - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - training_args.output_dir = get_filepath(training_args.output_dir, strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) - else: - strategy = training_args.strategy - #endregion - - # region Loading datasets - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). 
- # - # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called - # 'tokens' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - data_args.dataset_name, - data_args.dataset_config_name, - use_auth_token=True if model_args.use_auth_token else None, - cache_dir=model_args.cache_dir, - ) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.train_file.split(".")[-1] - raw_datasets = load_dataset( - extension, - data_files=data_files, - use_auth_token=True if model_args.use_auth_token else None, - cache_dir=model_args.cache_dir, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. 
- - if raw_datasets["train"] is not None: - column_names = raw_datasets["train"].column_names - features = raw_datasets["train"].features - else: - column_names = raw_datasets["validation"].column_names - features = raw_datasets["validation"].features - - if data_args.text_column_name is not None: - text_column_name = data_args.text_column_name - elif "tokens" in column_names: - text_column_name = "tokens" - else: - text_column_name = column_names[0] - - if data_args.label_column_name is not None: - label_column_name = data_args.label_column_name - elif f"{data_args.task_name}_tags" in column_names: - label_column_name = f"{data_args.task_name}_tags" - else: - label_column_name = column_names[1] - - # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the - # unique labels. - def get_label_list(labels): - unique_labels = set() - for label in labels: - unique_labels = unique_labels | set(label) - label_list = list(unique_labels) - label_list.sort() - return label_list - - if isinstance(features[label_column_name].feature, ClassLabel): - label_list = features[label_column_name].feature.names - # No need to convert the labels since they are already ints. - label_to_id = {i: i for i in range(len(label_list))} - else: - label_list = get_label_list(raw_datasets["train"][label_column_name]) - label_to_id = {l: i for i, l in enumerate(label_list)} - num_labels = len(label_list) - # endregion - - # region Load config and tokenizer - # - # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - num_labels=num_labels, - finetuning_task=data_args.task_name, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - _commit_hash="main", - ) - - tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path - if not tokenizer_name_or_path: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - - if config.model_type in {"gpt2", "roberta"}: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, - add_prefix_space=True, _commit_hash="main",) - else: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, - _commit_hash="main",) - # endregion - - # region Preprocessing the raw datasets - # First we tokenize all the texts. - # should always use padding because the current ptq does not use tf > 2.8 - # so no RaggedTensor is supported - padding = "max_length" if data_args.pad_to_max_length else False - - # Tokenize all texts and align the labels with them. - - def tokenize_and_align_labels(examples): - tokenized_inputs = tokenizer( - examples[text_column_name], - max_length=data_args.max_length, - padding=padding, - truncation=True, - # We use this argument because the texts in our dataset are lists of words (with a label for each word). - is_split_into_words=True, - ) - - labels = [] - for i, label in enumerate(examples[label_column_name]): - word_ids = tokenized_inputs.word_ids(batch_index=i) - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - # Special tokens have a word id that is None. 
We set the label to -100 so they are automatically - # ignored in the loss function. - if word_idx is None: - label_ids.append(-100) - # We set the label for the first token of each word. - elif word_idx != previous_word_idx: - label_ids.append(label_to_id[label[word_idx]]) - # For the other tokens in a word, we set the label to either the current label or -100, depending on - # the label_all_tokens flag. - else: - label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100) - previous_word_idx = word_idx - - labels.append(label_ids) - tokenized_inputs["labels"] = labels - return tokenized_inputs - - processed_raw_datasets = raw_datasets.map( - tokenize_and_align_labels, - batched=True, - remove_columns=raw_datasets["train"].column_names, - desc="Running tokenizer on dataset", - ) - - train_dataset = processed_raw_datasets["train"] - eval_dataset = processed_raw_datasets["validation"] - - if data_args.max_train_samples is not None: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - - # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): - logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") - # endregion - - # Metrics - metric = load_metric("seqeval") - - def get_labels(y_pred, y_true): - # Transform predictions and references tensos to numpy arrays - - # Remove ignored index (special tokens) - true_predictions = [ - [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100] - for pred, gold_label in zip(y_pred, y_true) - ] - true_labels = [ - [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100] - for pred, gold_label in zip(y_pred, y_true) - ] - return true_predictions, 
true_labels - - def compute_metrics(predictions, labels): - predictions = predictions["logits"] - predictions = np.argmax(predictions, axis=-1) - - attention_mask = eval_dataset.with_format("tf")["attention_mask"] - labels[attention_mask == 0] = -100 - - # Remove ignored index (special tokens) - preds, refs = get_labels(predictions, labels) - - metric.add_batch( - predictions=preds, - references=refs, - ) - results = metric.compute() - - if data_args.return_entity_level_metrics: - # Unpack nested dictionaries - final_results = {} - for key, value in results.items(): - if isinstance(value, dict): - for n, v in value.items(): - final_results[f"{key}_{n}"] = v - else: - final_results[key] = value - return final_results - else: - return { - "precision": results["overall_precision"], - "recall": results["overall_recall"], - "f1": results["overall_f1"], - "accuracy": results["overall_accuracy"], - } - - # endregion - - with strategy.scope(): - # region Initialize model - if model_args.model_name_or_path: - model = TFAutoModelForTokenClassification.from_pretrained( - model_args.model_name_or_path, - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: - logger.info("Training new model from scratch") - model = TFAutoModelForTokenClassification.from_config(config) - - model.resize_token_embeddings(len(tokenizer)) - # endregion - - # region Create TF datasets - - # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as - # well as inputs. 
- collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") - total_train_batch_size = training_args.per_device_train_batch_size * (len(worker_list) if worker_list is not None else 1) - - dataset_options = tf.data.Options() - dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - - # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in - # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also - # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names - # yourself if you use this method, whereas they are automatically inferred from the model input names when - # using model.prepare_tf_dataset() - # For more info see the docs: - # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset - # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset - - tf_train_dataset = model.prepare_tf_dataset( - train_dataset, - collate_fn=collate_fn, - batch_size=total_train_batch_size, - shuffle=True, - ).with_options(dataset_options) - total_eval_batch_size = training_args.per_device_eval_batch_size * (len(worker_list) if worker_list is not None else 1) - tf_eval_dataset = model.prepare_tf_dataset( - eval_dataset, - collate_fn=collate_fn, - batch_size=total_eval_batch_size, - shuffle=False, - ).with_options(dataset_options) - - # endregion - - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - - model.compile(optimizer=optimizer, jit_compile=training_args.xla) - # endregion - - if optim_args.tune: - from 
intel_extension_for_transformers.transformers import metrics, objectives, QuantizationConfig, TFOptimization - optimization = TFOptimization( - model=model, - args=training_args, - train_dataset=tf_train_dataset, - eval_dataset=tf_eval_dataset, - compute_metrics=compute_metrics, - task_type=strategy.cluster_resolver.task_type if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - task_id=strategy.cluster_resolver.task_id if isinstance(strategy, tf.distribute.MultiWorkerMirroredStrategy) else None, - ) - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=True, criterion=optim_args.perf_tol, - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = optimization.quantize(quant_config=quantization_config) - exit(0) - - # region Training - if training_args.do_train: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_dataset)}") - logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") - logger.info(f" Total train batch size = {total_train_batch_size}") - # Only show the progress bar once on each machine. 
- - model.fit( - tf_train_dataset, - validation_data=tf_eval_dataset, - epochs=int(training_args.num_train_epochs), - callbacks=callbacks, - ) - # endregion - - # region Evaluation - if training_args.do_eval: - # We normally do validation as part of the Keras fit loop, but we run it independently - # if there was no fit() step (because we didn't train the model) or if the task is MNLI, - # because MNLI has a separate validation-mismatched validation set - logger.info("*** Evaluate ***") - - tasks = [data_args.task_name] - tf_datasets = [tf_eval_dataset] - raw_datasets = [processed_raw_datasets["validation"]] - - num_examples = 0 - - if optim_args.int8: - model = tf.saved_model.load(training_args.output_dir) - else: - from intel_extension_for_transformers.transformers.utils.utility_tf import keras2SavedModel - model = keras2SavedModel(model) - - for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): - num_examples += sum( - 1 for _ in (tf_dataset.unbatch() - if hasattr(tf_dataset, "unbatch") else tf_dataset - ) - ) - - preds: np.ndarray = None - label_ids: np.ndarray = None - infer = model.signatures[list(model.signatures.keys())[0]] - - if optim_args.accuracy_only: - iterations = 1 - warmup = 0 - else: - iterations = 10 - warmup = 5 - latency_list = [] - - for idx in range(iterations): - iteration_time = 0 - for i, (inputs, labels) in enumerate(tf_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - start = time.time() - results = infer(**inputs) - iteration_time += time.time() - start - if idx == 0: # only accumulate once all the preds and labels - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - if label_ids is None: - label_ids = labels.numpy() - else: - label_ids = np.append(label_ids, labels.numpy(), axis=0) - - latency_list.append(iteration_time) - logger.info("Iteration {} time: {} 
sec".format(idx, iteration_time)) - eval_metrics = compute_metrics({"logits": preds}, label_ids) - logger.info("\nEvaluation result: ") - logger.info("metric ({}) Accuracy: {}".format(task, eval_metrics["accuracy"])) - - average_iteration_time = np.array(latency_list[warmup:]).mean() - logger.info( - "Throughput: {} samples/sec".format( - num_examples / average_iteration_time) - ) - # endregion - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/huggingface/tensorflow/token-classification/quantization/run_tuning.sh b/examples/huggingface/tensorflow/token-classification/quantization/run_tuning.sh deleted file mode 100644 index 415cf26ddd1..00000000000 --- a/examples/huggingface/tensorflow/token-classification/quantization/run_tuning.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - topology="bert_base_ner" - tuned_checkpoint="saved_results" - extra_cmd="" - batch_size=8 - MAX_SEQ_LENGTH=128 - model_type="bert" - approach="PostTrainingStatic" - cache_dir="cache" - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --worker=*) - worker=$(echo $var |cut -f2 -d=) - ;; - --task_index=*) - task_index=$(echo $var |cut -f2 -d=) - ;; - --cache_dir=*) - cache_dir=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - batch_size=64 - if [ "${topology}" = "bert_base_ner" ]; then - TASK_NAME="ner" - model_name_or_path="dslim/bert-base-NER" - approach="PostTrainingStatic" - dataset_name=conll2003 - fi - - if [ "${worker}" = "" ] - then - python -u run_ner.py \ - --model_name_or_path 
${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --task_name ${TASK_NAME} \ - --pad_to_max_length \ - --do_eval \ - --max_length ${MAX_SEQ_LENGTH} \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --no_cuda \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --quantization_approach ${approach} \ - --tune \ - ${extra_cmd} - else - python -u run_ner.py \ - --model_name_or_path ${model_name_or_path} \ - --dataset_name ${dataset_name} \ - --task_name ${TASK_NAME} \ - --pad_to_max_length \ - --do_eval \ - --max_length ${MAX_SEQ_LENGTH} \ - --per_device_train_batch_size ${batch_size} \ - --per_device_eval_batch_size ${batch_size} \ - --output_dir ${tuned_checkpoint} \ - --no_cuda \ - --overwrite_output_dir \ - --cache_dir ${cache_dir} \ - --quantization_approach ${approach} \ - --tune \ - --worker "${worker}" \ - --task_index ${task_index} \ - ${extra_cmd} - fi -} - -main "$@" diff --git "a/intel_extension_for_transformers/neural_chat/assets/docs/4th Generation Intel\302\256 Xeon\302\256 Scalable Processors Product Specifications.html" "b/intel_extension_for_transformers/neural_chat/assets/docs/4th Generation Intel\302\256 Xeon\302\256 Scalable Processors Product Specifications.html" deleted file mode 100644 index ed26bdf1b00..00000000000 --- "a/intel_extension_for_transformers/neural_chat/assets/docs/4th Generation Intel\302\256 Xeon\302\256 Scalable Processors Product Specifications.html" +++ /dev/null @@ -1,17164 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - -4th Generation Intel® Xeon® Scalable Processors Product Specifications - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - -
- -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- -
- - - - - - - - - - - - - - - - - - -
- - - - - -
-
-
-
-
- -
-
-
-
-
- - - - - -
- - -
- - - - - - - -
-
-
-
-
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
- -
-
-
-
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- -
-

4th Generation Intel® Xeon® Scalable Processors


- Filter: - - View All - - | Embedded - - | Retail Box - | Server - -
- - -
-
-
-
-
- -
- 54 - Products - - - - COMPARE ALL - - - COMPARE NONE - - -
-
-
-

-
-
- Product Name -
-
-
-
-
- Launch Date -
-
-
-
-
- Total Cores -
-
-
-
-
- Max Turbo Frequency -
-
-
-
-
- Processor Base Frequency -
-
-
-
-
- Cache -
-
-
-
-
- TDP -
-
-
- - - - - - - - Q3'23 - - - - - - - - - - 24 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 1.90 GHz - - - - - - - - - - 45 MB - - - - - - - - - - 185 W - - - -
- - - - - - - - Q3'23 - - - - - - - - - - 28 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 52.5 MB - - - - - - - - - - 195 W - - - -
- - - - - - - - Q3'23 - - - - - - - - - - 32 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 205 W - - - -
- - - - - - - - Q3'23 - - - - - - - - - - 32 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 205 W - - - -
- - - - - - - - Q3'23 - - - - - - - - - - 32 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 195 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 16 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 2.90 GHz - - - - - - - - - - 45 MB - - - - - - - - - - 270 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 28 - - - - - - - - - - 3.50 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 75 MB undefined - - - - - - - - - - 250 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 36 - - - - - - - - - - 3.20 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 67.5 MB - - - - - - - - - - 300 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 3.40 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 82.5 MB - - - - - - - - - - 270 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 44 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.70 GHz - - - - - - - - - - 82.5 MB - - - - - - - - - - 350 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 40 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.20 GHz - - - - - - - - - - 105 MB - - - - - - - - - - 330 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 40 - - - - - - - - - - 3.70 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 105 MB - - - - - - - - - - 300 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 48 - - - - - - - - - - 3.70 GHz - - - - - - - - - - 2.20 GHz - - - - - - - - - - 97.5 MB - - - - - - - - - - 300 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 2.80 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 300 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 48 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 105 MB - - - - - - - - - - 350 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 48 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 105 MB - - - - - - - - - - 330 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 48 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.40 GHz - - - - - - - - - - 97.5 MB - - - - - - - - - - 330 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 52 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 105 MB - - - - - - - - - - 350 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 52 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 1.70 GHz - - - - - - - - - - 97.5 MB - - - - - - - - - - 300 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 52 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 105 MB - - - - - - - - - - 350 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 52 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 1.80 GHz - - - - - - - - - - 97.5 MB - - - - - - - - - - 300 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 56 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 105 MB - - - - - - - - - - 350 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 60 - - - - - - - - - - 3.50 GHz - - - - - - - - - - 1.90 GHz - - - - - - - - - - 112.5 MB - - - - - - - - - - 350 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 24 - - - - - - - - - - 3.90 GHz - - - - - - - - - - 1.90 GHz - - - - - - - - - - 45 MB - - - - - - - - - - 165 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 24 - - - - - - - - - - 3.90 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 45 MB - - - - - - - - - - 185 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 8 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 2.90 GHz - - - - - - - - - - 22.5 MB - - - - - - - - - - 150 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 16 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 30 MB - - - - - - - - - - 150 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 24 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 1.80 GHz - - - - - - - - - - 45 MB - - - - - - - - - - 165 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 24 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 45 MB - - - - - - - - - - 185 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 28 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 52.5 MB - - - - - - - - - - 205 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 20 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 37.5 MB - - - - - - - - - - 145 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 20 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 2.30 GHz - - - - - - - - - - 37.5 MB - - - - - - - - - - 160 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 3.40 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 250 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 18 - - - - - - - - - - 4.20 GHz - - - - - - - - - - 2.20 GHz - - - - - - - - - - 45 MB - - - - - - - - - - 165 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 24 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 185 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 1.80 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 185 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 16 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 2.50 GHz - - - - - - - - - - 37.5 MB - - - - - - - - - - 185 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 3.80 GHz - - - - - - - - - - 1.80 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 185 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 3.40 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 270 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 8 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 3.70 GHz - - - - - - - - - - 22.5 MB - - - - - - - - - - 195 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 8 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 3.70 GHz - - - - - - - - - - 22.5 MB - - - - - - - - - - 195 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 3.90 GHz - - - - - - - - - - 2.20 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 205 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 3.60 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 205 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 205 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 24 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 2.60 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 225 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 16 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 3.60 GHz - - - - - - - - - - 45 MB - - - - - - - - - - 270 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 2.40 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 250 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 4.10 GHz - - - - - - - - - - 2.10 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 225 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 3.40 GHz - - - - - - - - - - 2.20 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 270 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 32 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 3.10 GHz - - - - - - - - - - 60 MB - - - - - - - - - - 350 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 10 - - - - - - - - - - 4.00 GHz - - - - - - - - - - 2.70 GHz - - - - - - - - - - 26.25 MB - - - - - - - - - - 150 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 12 - - - - - - - - - - 3.90 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 30 MB - - - - - - - - - - 150 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 20 - - - - - - - - - - 3.90 GHz - - - - - - - - - - 2.00 GHz - - - - - - - - - - 37.5 MB - - - - - - - - - - 165 W - - - -
- - - - - - - - Q1'23 - - - - - - - - - - 8 - - - - - - - - - - 1.90 GHz - - - - - - - - - - 1.80 GHz - - - - - - - - - - 22.5 MB - - - - - - - - - - 125 W - - - -
- -
-
-
-
-
-
- - -
-
-
-
-
-
-

Advanced Search

-

Use this tool to filter Intel® processors by socket, number of cores, cache size, maximum memory, and more

-
- -
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
-
-
- - - - - -
- - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - -
- -
- - - - -
- - - - - - - - - - - - - - - - - - - - - - - -
- -
- - - - - -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
\ No newline at end of file diff --git a/intel_extension_for_transformers/neural_chat/models/base_model.py b/intel_extension_for_transformers/neural_chat/models/base_model.py index 1aa31ef1782..5f3616f89ff 100644 --- a/intel_extension_for_transformers/neural_chat/models/base_model.py +++ b/intel_extension_for_transformers/neural_chat/models/base_model.py @@ -58,6 +58,16 @@ def construct_parameters(query, model_name, device, assistant_model, config): params["device"] = device return params +def safe_path(*paths): + # Prevent path traversal by ensuring the final path is within the base path or assets_path + current_working_directory = os.getcwd() + path_parts = current_working_directory.split('/') + base_path = '/' + path_parts[1] + assets_path = '/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets/' + final_path = os.path.abspath(*paths) + if final_path.startswith(base_path) or final_path.startswith(assets_path): + return final_path + class BaseModel(ABC): """A base class for LLM.""" @@ -158,7 +168,7 @@ def predict_stream(self, query, origin_query="", config=None): my_origin_query = origin_query if is_audio_file(query): - if not os.path.exists(query): + if not os.path.exists(safe_path(query)): raise ValueError(f"The audio file path {query} is invalid.") query_include_prompt = False @@ -181,7 +191,7 @@ def predict_stream(self, query, origin_query="", config=None): if response: logging.info("Get response: %s from cache", response) return response['choices'][0]['text'], link - if plugin_name == "asr" and not os.path.exists(query): + if plugin_name == "asr" and not os.path.exists(safe_path(query)): continue if plugin_name == "retrieval": try: @@ -281,7 +291,7 @@ def predict(self, query, origin_query="", config=None): config.ipex_int8 = self.ipex_int8 if is_audio_file(query): - if not os.path.exists(query): + if not os.path.exists(safe_path(query)): raise ValueError(f"The audio file path {query} is invalid.") query_include_prompt = False 
@@ -302,7 +312,7 @@ def predict(self, query, origin_query="", config=None): if response: logging.info("Get response: %s from cache", response) return response['choices'][0]['text'] - if plugin_name == "asr" and not os.path.exists(query): + if plugin_name == "asr" and not os.path.exists(safe_path(query)): continue if plugin_name == "retrieval": try: diff --git a/intel_extension_for_transformers/neural_chat/models/model_utils.py b/intel_extension_for_transformers/neural_chat/models/model_utils.py index 6bc2d71d7c7..d4db8a2ee5a 100644 --- a/intel_extension_for_transformers/neural_chat/models/model_utils.py +++ b/intel_extension_for_transformers/neural_chat/models/model_utils.py @@ -1010,17 +1010,19 @@ def is_llm_runtime_model(model, device): def remove_prompt_history(model_name, prompt): result = prompt if re.search("llama", model_name, re.IGNORECASE): - matches = re.findall(r'\[INST\](.*?)\[/INST\]', prompt) + matches = re.findall(r'\[INST\]([^\[]*?)\[/INST\]', prompt) if matches: result = "[INST]" + matches[-1] + "[/INST]" elif re.search("chatglm", model_name, re.IGNORECASE): - pattern = re.compile(r'问:.*?\n答:', re.DOTALL) - matches = pattern.findall(prompt) - if matches: - result = matches[-1].replace("问:", "").replace("\n答:", "").strip() + last_q_index = prompt.rfind("问:") + last_a_index = prompt.rfind("\n答:") + if last_q_index != -1 and last_a_index != -1 and last_q_index < last_a_index: + result = prompt[last_q_index + len("问:"):last_a_index].strip() elif re.search("neuralchat", model_name, re.IGNORECASE): - matches = re.findall(r'### User:.*?### Assistant:', prompt, re.DOTALL) - if matches: + start = prompt.rfind('### User:') + end = prompt.rfind('### Assistant:') + if start != -1 and end != -1: + match = prompt[start:end+len('### Assistant:')] result = ''' ### System: - You are a helpful assistant chatbot trained by Intel. 
@@ -1029,7 +1031,7 @@ def remove_prompt_history(model_name, prompt): but will refuse to do anything that could be considered harmful to the user. - You are more than just an information source, you are also able to write poetry,\ short stories, and make jokes.
-''' + matches[-1] +''' + match return result diff --git a/intel_extension_for_transformers/neural_chat/server/restful/retrieval_api.py b/intel_extension_for_transformers/neural_chat/server/restful/retrieval_api.py index 40c2aa8a3e6..d338482f220 100644 --- a/intel_extension_for_transformers/neural_chat/server/restful/retrieval_api.py +++ b/intel_extension_for_transformers/neural_chat/server/restful/retrieval_api.py @@ -234,6 +234,13 @@ def handle_retrieval_request(self, request: RetrievalRequest) -> RetrievalRespon RETRIEVAL_FILE_PATH = os.getenv("RETRIEVAL_FILE_PATH", default="./retrieval_docs")+'/' EXCEPT_PATTERNS = ["/xuhui_doc", "default/persist_dir"] +def safe_join(base_path, *paths): + # Prevent path traversal by ensuring the final path is within the base path + base_path = os.path.abspath(base_path) + final_path = os.path.abspath(os.path.join(base_path, *paths)) + if not final_path.startswith(base_path): + raise ValueError("Attempted Path Traversal Detected") + return final_path @router.post("/v1/askdoc/upload_link") async def retrieval_upload_link(request: Request): @@ -316,7 +323,7 @@ async def retrieval_add_files(request: Request, path_prefix = get_path_prefix(kb_id, user_id) upload_path = path_prefix + '/upload_dir' persist_path = path_prefix + '/persist_dir' - save_path = Path(upload_path) / file_path + save_path = safe_join(Path(upload_path), file_path) save_path.parent.mkdir(parents=True, exist_ok=True) # save file content to local disk @@ -618,7 +625,7 @@ async def delete_single_file(request: Request): logger.info(f"[askdoc - delete_file] successfully delete kb {knowledge_base_id}") return {"status": True} - delete_path = Path(path_prefix) / "upload_dir" / del_path + delete_path = safe_join(Path(path_prefix) / "upload_dir", del_path) logger.info(f'[askdoc - delete_file] delete_path: {delete_path}') # partially delete files/folders from the kb diff --git a/intel_extension_for_transformers/qbits/__init__.py 
b/intel_extension_for_transformers/qbits/__init__.py index 5e48eeb8fc6..c23599090dc 100644 --- a/intel_extension_for_transformers/qbits/__init__.py +++ b/intel_extension_for_transformers/qbits/__init__.py @@ -16,4 +16,5 @@ # limitations under the License. import torch -from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611 +if not torch.xpu._is_compiled(): + from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611 diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index a2d9dce9fb3..82d4475ba2d 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -19,22 +19,12 @@ from .config import ( WEIGHTS_NAME, BenchmarkConfig, - DistillationConfig, DynamicLengthConfig, Provider, PrunerV2, - PruningConfig, - QuantizationConfig, - TFDistillationConfig, -) -from .distillation import ( - SUPPORTED_DISTILLATION_CRITERION_MODE, - DistillationCriterionMode, + ) -from .optimizer import NoTrainerOptimizer, Orchestrate_optimizer -from .optimizer_tf import TFOptimization -from .pruning import SUPPORTED_PRUNING_MODE, PrunerConfig, PruningMode -from .quantization import SUPPORTED_QUANT_MODE, QuantizationMode + from .utils import ( MixedPrecisionConfig, BitsAndBytesConfig, diff --git a/intel_extension_for_transformers/transformers/config.py b/intel_extension_for_transformers/transformers/config.py index 640efdb8cf5..a0009e7d3ed 100644 --- a/intel_extension_for_transformers/transformers/config.py +++ b/intel_extension_for_transformers/transformers/config.py @@ -18,16 +18,11 @@ import yaml from enum import Enum -from neural_compressor.conf.config import ( - Distillation_Conf, Pruner, Pruning_Conf, Quantization_Conf -) -from neural_compressor.conf.dotdict import DotDict, deep_set + +from neural_compressor.conf.dotdict import DotDict from .utils.metrics import Metric from 
.utils.objectives import Objective, performance -from .quantization import QuantizationMode, SUPPORTED_QUANT_MODE -from .distillation import ( - Criterion, DistillationCriterionMode, SUPPORTED_DISTILLATION_CRITERION_MODE -) + from typing import List, Union from xmlrpc.client import boolean @@ -150,651 +145,6 @@ def __init__( self.latency_constraint = latency_constraint self.evo_eval_metric = evo_eval_metric - -class QuantizationConfig(object): - """Configure the quantization process. - - Args: - framework: Which framework you used - approach: Which quantization approach to use - strategy: Which quantization tuning strategy to use - timeout: Tuning timeout(seconds), 0 means early stop. Combined with max_trials field to decide when to exit - max_trials: Max tune times - metrics: Used to evaluate accuracy of tuning model, no need for NoTrainerOptimize - objectives: Objective with accuracy constraint guaranteed - config_file: Path to the config file - sampling_size: How many samples to use - use_bf16: Whether to use bf16 - recipes: apply recipes for quantization, neural_compressor support below recipes: - 'smooth_quant': whether do smooth quant - 'smooth_quant_args': parameters for smooth_quant - 'fast_bias_correction': whether do fast bias correction - 'weight_correction': whether do weight correction - 'gemm_to_matmul': whether convert gemm to matmul and add, only valid for onnx models - 'graph_optimization_level': support 'DISABLE_ALL', 'ENABLE_BASIC', 'ENABLE_EXTENDED', 'ENABLE_ALL' - only valid for onnx models - 'first_conv_or_matmul_quantization': whether quantize the first conv or matmul - 'last_conv_or_matmul_quantization': whether quantize the last conv or matmul - 'pre_post_process_quantization': whether quantize the ops in preprocess and postprocess - 'add_qdq_pair_to_weight': whether add QDQ pair for weights, only valid for onnxrt_trt_ep - 'optypes_to_exclude_output_quant': don't quantize output of specified optypes - 'dedicated_qdq_pair': whether dedicate 
QDQ pair, only valid for onnxrt_trt_ep. - """ - def __init__( - self, - framework: str = "pytorch", - approach: str = "PostTrainingStatic", - strategy: str = "basic", - timeout: int = 0, - max_trials: int = 100, - metrics: Union[Metric, List] = None, - objectives: Union[Objective, List] = performance, - config_file: str = None, - sampling_size: int = 100, - use_bf16: bool = False, - recipes: dict = None, - ): - """Init a QuantizationConfig object.""" - super().__init__() - if config_file is None: - self.inc_config = Quantization_Conf() - else: - self.inc_config = Quantization_Conf(config_file) - self.framework = framework - if approach is not None: - self.approach = approach - if strategy is not None: - self.strategy = strategy - if timeout is not None: - self.timeout = timeout - if max_trials is not None: - self.max_trials = max_trials - if metrics is not None: - self.metrics = metrics - else: - self._metrics = None - if objectives is not None: - self.objectives = objectives - else: - self._objectives = None - if sampling_size is not None: - self.sampling_size = sampling_size - self.inc_config.usr_cfg.use_bf16 = use_bf16 - if recipes is not None: - self.recipes = recipes - - @property - def approach(self): - """Get the quantization approach.""" - return self.inc_config.usr_cfg.quantization.approach - - @approach.setter - def approach(self, approach): - """Set the quantization approach.""" - approach = approach.upper() - assert approach in SUPPORTED_QUANT_MODE, \ - f"quantization approach: {approach} is not support!" + \ - "PostTrainingStatic, PostTrainingDynamic and QuantizationAwareTraining are supported!" 
- self.inc_config.usr_cfg.quantization.approach = QuantizationMode[approach].value - - @property - def input_names(self): - """Get the input names.""" - return self.inc_config.usr_cfg.model.inputs - - @input_names.setter - def input_names(self, input_names): - """Set the input names.""" - assert isinstance(input_names, list), "input_names must be a list" - self.inc_config.usr_cfg.model.inputs = input_names - - @property - def output_names(self): - """Get the output names.""" - return self.inc_config.usr_cfg.model.outputs - - @output_names.setter - def output_names(self, output_names): - """Set the output names.""" - assert isinstance(output_names, list), "output_names must be a list" - self.inc_config.usr_cfg.model.outputs = output_names - - @property - def metrics(self): - """Get the metrics.""" - return self._metrics - - @metrics.setter - def metrics(self, metrics: Union[Metric, List]): - """Set the metrics.""" - self._metrics = metrics - rel_or_abs = {True: "relative", False: "absolute"} - assert isinstance(metrics[0] if isinstance(metrics, list) else metrics, Metric), \ - "metric should be a Metric class!" - if isinstance(metrics, Metric) or len(metrics) == 1: - self.inc_config.usr_cfg.tuning.accuracy_criterion = { - rel_or_abs[metrics[0].is_relative] - if isinstance(metrics, list) else rel_or_abs[metrics.is_relative]: - metrics[0].criterion if isinstance(metrics, list) else metrics.criterion, - "higher_is_better": metrics[0].greater_is_better if isinstance(metrics, list) else - metrics.greater_is_better - } - else: - weights = [metric.weight_ratio for metric in metrics] - if not any(weights): - weight = 1 / len(metrics) - for metric in metrics: - metric.weight_ratio = weight - else: # pragma: no cover - assert all(weights), "Please set the weight ratio for all metrics!" - - assert all(metric.is_relative == metrics[0].is_relative for metric in metrics), \ - "Unsupported different is_relative for different metric now, will support soon!" 
- assert all(metric.criterion == metrics[0].criterion for metric in metrics), \ - "Unsupported different criterion for different metric now, will support soon!" - - self.inc_config.usr_cfg.tuning.accuracy_criterion = { - rel_or_abs[metrics[0].is_relative]: metrics[0].criterion, - "higher_is_better": metrics[0].greater_is_better - } - - @property - def framework(self): - """Get the framework.""" - return self.inc_config.usr_cfg.model.framework - - @framework.setter - def framework(self, framework): - """Set the framework.""" - assert framework in ["pytorch", "pytorch_fx", "pytorch_ipex", "tensorflow"], \ - "framework: {} is not support!".format(framework) - self.inc_config.usr_cfg.model.framework = framework - - @property - def objectives(self): - """Get the objectives.""" - return self._objectives - - @objectives.setter - def objectives(self, objectives: Union[List, Objective]): - """Set the objectives.""" - self._objectives = objectives - if isinstance(objectives, Objective) or len(objectives) == 1: - self.inc_config.usr_cfg.tuning.objective = objectives.name \ - if isinstance(objectives, Objective) else objectives[0].name - else: - weights = [objective.weight_ratio for objective in objectives] - if not any(weights): - weight = 1 / len(objectives) - for objective in objectives: - objective.weight_ratio = weight - else: - assert all(weights), "Please set the weight ratio for all metrics!" 
- - self.inc_config.usr_cfg.tuning.multi_objective = { - "objective": [objective.name for objective in objectives], - "higher_is_better": [objective.greater_is_better for objective in objectives], - "weight": [objective.weight_ratio for objective in objectives], - } - - @property - def strategy(self): - """Get the strategy.""" - return self.inc_config.usr_cfg.tuning.strategy.name - - @strategy.setter - def strategy(self, strategy): - """Set the strategy.""" - assert strategy in ["basic", "bayesian", "mse", "mse_v2"], \ - "strategy: {} is not support!".format(strategy) - self.inc_config.usr_cfg.tuning.strategy.name = strategy - if strategy == "mse_v2": - self.inc_config.usr_cfg.tuning.strategy_kwargs = {"confidence_batches": 1} - - @property - def timeout(self): - """Get the timeout.""" - return self.inc_config.usr_cfg.tuning.exit_policy.timeout - - @timeout.setter - def timeout(self, timeout): - """Set the timeout.""" - assert isinstance(timeout, int), "timeout should be integer!" - self.inc_config.usr_cfg.tuning.exit_policy.timeout = timeout - - @property - def op_wise(self): - """Get the op_wise dict.""" - return self.inc_config.usr_cfg.quantization.op_wise - - @op_wise.setter - def op_wise(self, op_wise): - """Set the op_wise dict.""" - self.inc_config.usr_cfg.quantization.op_wise = op_wise - - @property - def optype_wise(self): - """Get the optype_wise dict.""" - return self.inc_config.usr_cfg.quantization.optype_wise - - @optype_wise.setter - def optype_wise(self, optype_wise): - """Set the optype_wise dict.""" - self.inc_config.usr_cfg.quantization.optype_wise = optype_wise - - @property - def max_trials(self): - """Get the number of maximum trials.""" - return self.inc_config.usr_cfg.tuning.exit_policy.max_trials - - @max_trials.setter - def max_trials(self, max_trials): - """Set the number of maximum trials.""" - assert isinstance(max_trials, int), "max_trials should be integer!" 
- self.inc_config.usr_cfg.tuning.exit_policy.max_trials = max_trials - - @property - def performance_only(self): - """Get the boolean whether to use performance only.""" - return self.inc_config.usr_cfg.tuning.exit_policy.performance_only - - @performance_only.setter - def performance_only(self, performance_only): - """Set the boolean whether to use performance only.""" - assert isinstance(performance_only, boolean), "performance_only should be boolean!" - self.inc_config.usr_cfg.tuning.exit_policy.performance_only = performance_only - - @property - def random_seed(self): - """Get the random seed.""" - return self.inc_config.usr_cfg.tuning.random_seed - - @random_seed.setter - def random_seed(self, random_seed): - """Set the random seed.""" - assert isinstance(random_seed, int), "random_seed should be integer!" - self.inc_config.usr_cfg.tuning.random_seed = random_seed - - @property - def tensorboard(self): - """Get the boolean whether to use tensorboard.""" - return self.inc_config.usr_cfg.tuning.tensorboard - - @tensorboard.setter - def tensorboard(self, tensorboard): - """Set the boolean whether to use tensorboard.""" - assert isinstance(tensorboard, boolean), "tensorboard should be boolean!" - self.inc_config.usr_cfg.tuning.tensorboard = tensorboard - - @property - def output_dir(self): - """Get the output directory.""" - return self.inc_config.usr_cfg.tuning.workspace.path - - @output_dir.setter - def output_dir(self, path): - """Set the output directory.""" - assert isinstance(path, str), "save_path should be a string of directory!" - self.inc_config.usr_cfg.tuning.workspace.path = path - - @property - def resume_path(self): - """Get the resume path.""" - return self.inc_config.usr_cfg.tuning.workspace.resume - - @resume_path.setter - def resume_path(self, path): - """Set the resume path.""" - assert isinstance(path, str), "resume_path should be a string of directory!" 
- self.inc_config.usr_cfg.tuning.workspace.resume = path - - @property - def sampling_size(self): - """Get the sampling size.""" - return self.inc_config.usr_cfg.quantization.calibration.sampling_size - - @sampling_size.setter - def sampling_size(self, sampling_size): - """Set the sampling size.""" - if isinstance(sampling_size, int): - self.inc_config.usr_cfg.quantization.calibration.sampling_size = [sampling_size] - elif isinstance(sampling_size, list): - self.inc_config.usr_cfg.quantization.calibration.sampling_size = sampling_size - else: - assert False, "The sampling_size must be a list of int numbers" - - @property - def recipes(self): - """Get the sampling size.""" - return self.inc_config.usr_cfg.quantization.recipes - - @recipes.setter - def recipes(self, recipes): - """Set recipes.""" - if recipes is not None and not isinstance(recipes, dict): - raise ValueError("recipes should be a dict.") - - # Support PyTorch only - def smooth_quant(val=None): - if val is not None: - return check_value("smooth_quant", val, bool) - else: - return False - - # Support PyTorch only - def smooth_quant_args(val=None): - if val is not None: - check_value("smooth_quant_args", val, dict) - for k, v in val.items(): - if k == "alpha": - assert isinstance(v, str) or isinstance(v, float),\ - "Smooth_quant_args.alpha should be a float or 'auto'." 
- return True - else: - return {} - - # Support tensorflow, but not enabled now - def fast_bias_correction(val=None): # pragma: no cover - if val is not None: - return check_value("fast_bias_correction", val, bool) - else: - return False - - # Support tensorflow, but not enabled now - def weight_correction(val=None): # pragma: no cover - if val is not None: - return check_value("weight_correction", val, bool) - else: - return False - - # Support Tensorflow only - def first_conv_or_matmul_quantization(val=None): - if val is not None: - return check_value("first_conv_or_matmul_quantization", val, bool) - else: - return True - - # Support Tensorflow only - def last_conv_or_matmul_quantization(val=None): - if val is not None: - return check_value("last_conv_or_matmul_quantization", val, bool) - else: - return True - - RECIPES = {"smooth_quant": smooth_quant, # Only for PyTorch - "smooth_quant_args": smooth_quant_args, # Only for PyTorch - "fast_bias_correction": fast_bias_correction, # Support PyTorch and Tensorflow, not used now. - "weight_correction": weight_correction, # Support PyTorch and Tensorflow, not used now. - "first_conv_or_matmul_quantization": first_conv_or_matmul_quantization, # Only for Tensorflow - "last_conv_or_matmul_quantization": last_conv_or_matmul_quantization, # Only for Tensorflow - } - _recipes = {} - for k in RECIPES.keys(): - if k in recipes and RECIPES[k](recipes[k]): - _recipes.update({k: recipes[k]}) - else: - _recipes.update({k: RECIPES[k]()}) - deep_set(self.inc_config.usr_cfg, 'quantization.recipes', _recipes) - - -class PruningConfig(object): - """Configure the pruning process. 
- - Args: - framework: Which framework you used - epochs: How many epochs to prune - epoch_range: Epoch range list - initial_sparsity_ratio: Initial sparsity goal, and not needed if pruner_config argument is defined - target_sparsity_ratio: Target sparsity goal, and not needed if pruner_config argument is defined - metrics: Used to evaluate accuracy of tuning model, not needed for NoTrainerOptimizer - pruner_config: Defined pruning behavior, if it is None, then NLP will create a default pruner with - 'BasicMagnitude' pruning typel - config_file: Path to the config file - """ - def __init__( - self, - framework: str = "pytorch", - epochs: int = 1, - epoch_range: List = [0, 4], - initial_sparsity_ratio: float=0.0, - target_sparsity_ratio: float = 0.97, - metrics: Metric = None, - pruner_config: Union[List, Pruner] = None, - config_file: str = None - ): - """Init a PruningConfig object.""" - super().__init__() - self.inc_config = Pruning_Conf(config_file) - self.framework = framework - - if initial_sparsity_ratio is not None: - self.initial_sparsity_ratio = initial_sparsity_ratio - if target_sparsity_ratio is not None: - self.target_sparsity_ratio = target_sparsity_ratio - if epoch_range is not None: - self.epoch_range = epoch_range - if metrics is not None: - self.metrics = metrics - else: - self._metrics = None - if pruner_config is not None: - self.pruner_config = pruner_config - else: - self.init_prune_config() - self.epochs = epochs - - - def init_prune_config(self): - """Init the pruning config.""" - pruner_config = Pruner() - self.inc_config.usr_cfg.pruning.approach.weight_compression['pruners'] = [pruner_config] - - @property - def pruner_config(self): - """Get the pruner config.""" - return self.inc_config.usr_cfg.pruning.approach.weight_compression.pruners - - @pruner_config.setter - def pruner_config(self, pruner_config): - """Set the pruner config.""" - if isinstance(pruner_config, list): - 
self.inc_config.usr_cfg.pruning.approach.weight_compression.pruners = pruner_config - else: - self.inc_config.usr_cfg.pruning.approach.weight_compression.pruners = [pruner_config] - - @property - def target_sparsity_ratio(self): - """Get the target sparsity ratio.""" - return self.inc_config.usr_cfg.pruning.approach.weight_compression.target_sparsity - - @target_sparsity_ratio.setter - def target_sparsity_ratio(self, target_sparsity_ratio): - """Set the target sparsity ratio.""" - self.inc_config.usr_cfg.pruning.approach.weight_compression.target_sparsity = \ - target_sparsity_ratio - - @property - def initial_sparsity_ratio(self): - """Get the initial sparsity ratio.""" - return self.inc_config.usr_cfg.pruning.approach.weight_compression.initial_sparsity - - @initial_sparsity_ratio.setter - def initial_sparsity_ratio(self, initial_sparsity_ratio): - """Set the initial sparsity ratio.""" - self.inc_config.usr_cfg.pruning.approach.weight_compression.initial_sparsity = \ - initial_sparsity_ratio - - @property - def epoch_range(self): - """Get the epoch range.""" - return [self.inc_config.usr_cfg.pruning.approach.weight_compression.start_epoch, - self.inc_config.usr_cfg.pruning.approach.weight_compression.end_epoch] - - @epoch_range.setter - def epoch_range(self, epoch_range): - """Set the epoch range.""" - assert isinstance(epoch_range, list) and len(epoch_range) == 2, \ - "You should set epoch_range like [a,b] format to match the pruning start and end epoch." 
- self.inc_config.usr_cfg.pruning.approach.weight_compression.start_epoch = epoch_range[0] - self.inc_config.usr_cfg.pruning.approach.weight_compression.end_epoch = epoch_range[1] - - @property - def epochs(self): - """Get the epochs.""" - eps = self.inc_config.usr_cfg.pruning.train.epoch \ - if hasattr(self.inc_config.usr_cfg.pruning, "train") else 1 - return eps - - @epochs.setter - def epochs(self, epochs): - """Set the epochs.""" - assert isinstance(epochs, int) and epochs > 0, \ - "You should set epochs > 0 and int, not {}.".format(epochs) - self.inc_config.usr_cfg.pruning["train"] = {"epoch": epochs} - - @property - def framework(self): - """Get the framework.""" - return self.inc_config.usr_cfg.model.framework - - @framework.setter - def framework(self, framework): - """Set the framework.""" - assert framework.lower() in ["pytorch", "pytorch_fx", "tensorflow"], \ - "framework: {} is not support!".format(framework) - self.inc_config.usr_cfg.model.framework = framework.lower() - - @property - def metrics(self): - """Get the metrics.""" - return self._metrics - - @metrics.setter - def metrics(self, metrics: Metric): - """Set the metrics.""" - self._metrics = metrics - - -class DistillationConfig(object): - """Configure the distillation process. 
- - Args: - framework: Which framework you used - criterion: Criterion of training, example: "KnowledgeLoss" - metrics: Metrics for distillation - inc_config: Distillation config - """ - def __init__( - self, - framework: str = "pytorch", - criterion: Criterion = None, - metrics: Metric = None, - inc_config = None - ): - """Init a DistillationConfig object.""" - super().__init__() - self.inc_config = Distillation_Conf(inc_config) - self.framework = framework - if criterion is not None: - self.criterion = criterion - if metrics is not None: - self.metrics = metrics - else: - self._metrics = None - - @property - def framework(self): - """Get the framework.""" - return self.inc_config.usr_cfg.model.framework - - @framework.setter - def framework(self, framework): - """Set the framework.""" - assert framework in ["pytorch", "pytorch_fx", "tensorflow"], \ - "framework: {} is not support!".format(framework) - self.inc_config.usr_cfg.model.framework = framework - - @property - def criterion(self): - """Get the criterion.""" - return self.inc_config.usr_cfg.distillation.train.criterion - - @criterion.setter - def criterion(self, criterion: Criterion): - """Set the criterion.""" - assert criterion.name.upper() in SUPPORTED_DISTILLATION_CRITERION_MODE, \ - "The criterion name must be in ['KnowledgeLoss', 'IntermediateLayersLoss']" - if criterion.name.upper() == DistillationCriterionMode.KNOWLEDGELOSS.name: - assert criterion.temperature is not None, \ - "Please pass the temperature to Criterion.temperature!" - assert criterion.loss_types is not None, \ - "Please pass the loss_types to Criterion.loss_types!" - assert criterion.loss_weight_ratio is not None, \ - "Please pass the loss_weight_ratio to Criterion.loss_weight_ratio!" 
- self.inc_config.usr_cfg.distillation.train.criterion = { - DistillationCriterionMode.KNOWLEDGELOSS.value: { - "temperature": criterion.temperature, - "loss_types": criterion.loss_types, - "loss_weights": criterion.loss_weight_ratio - } - } - - if criterion.name.upper() == DistillationCriterionMode.INTERMEDIATELAYERSLOSS.name: - assert criterion.layer_mappings is not None, \ - "Please pass the layer_mappings to Criterion.layer_mappings!" - assert criterion.loss_types is not None, \ - "Please pass the loss_types to Criterion.loss_types!" - assert criterion.loss_weight_ratio is not None, \ - "Please pass the loss_weight_ratio to Criterion.loss_weight_ratio!" - assert criterion.add_origin_loss is not None, \ - "Please pass the add_origin_loss to Criterion.add_origin_loss!" - self.inc_config.usr_cfg.distillation.train.criterion = { - DistillationCriterionMode.INTERMEDIATELAYERSLOSS.value: { - "layer_mappings": criterion.layer_mappings, - "loss_types": criterion.loss_types, - "loss_weights": criterion.loss_weight_ratio, - "add_origin_loss": criterion.add_origin_loss - } - } - - @property - def metrics(self): - """Get the metrics.""" - return self._metrics - - @metrics.setter - def metrics(self, metrics): - """Set the metrics.""" - assert isinstance(metrics, Metric), \ - "metric should be a Metric class!" - self._metrics = metrics - - -class TFDistillationConfig(object): - """Configure the distillation process for Tensorflow. 
- - Args: - loss_types: Type of loss - loss_weights: Weight ratio of loss - train_steps: Steps of training - temperature: Parameter for KnowledgeDistillationLoss - """ - def __init__( - self, - loss_types: list = [], - loss_weights: list = [], - train_steps: list = [], - temperature: float = 1.0 - ): - """Init a TFDistillationConfig object.""" - super().__init__() - self.loss_types = loss_types - self.loss_weights = loss_weights - self.train_steps = train_steps - self.temperature = temperature - - - class BenchmarkConfig: """Config Class for Benchmark. diff --git a/intel_extension_for_transformers/transformers/distillation.py b/intel_extension_for_transformers/transformers/distillation.py deleted file mode 100644 index 9f801b9c112..00000000000 --- a/intel_extension_for_transformers/transformers/distillation.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Distillation: set criterion mode to distillation.""" -from enum import Enum -from typing import List - - -class Criterion(object): - """Criterion class for distillation.""" - def __init__( - self, - name: str = "KNOWLEDGELOSS", - temperature: float = 1.0, - loss_types: List = ['CE', 'CE'], - loss_weight_ratio: List = [0.5, 0.5], - layer_mappings: List = None, - add_origin_loss: bool = False - ): - """Init a Criterion object.""" - self.name = name - self.temperature = temperature - self.loss_types = loss_types - self.loss_weight_ratio = loss_weight_ratio - self.layer_mappings = layer_mappings - self.add_origin_loss = add_origin_loss - - -class DistillationCriterionMode(Enum): - """Criterion mode class for distillation.""" - KNOWLEDGELOSS = "KnowledgeDistillationLoss" - INTERMEDIATELAYERSLOSS = "IntermediateLayersKnowledgeDistillationLoss" - - - -SUPPORTED_DISTILLATION_CRITERION_MODE = \ - set([approach.name for approach in DistillationCriterionMode]) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 0254c66f029..7092d85452f 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -183,7 +183,7 @@ def build_woq_model(model, quantization_config): def convert_model_to_public(model): # reorder weight and scales if they have been transposed - if model.device == "xpu": + if model.device == "xpu" or (isinstance(model.device, torch.device) and model.device.type == "xpu"): for name, module in model.named_modules(): if isinstance(module, WeightOnlyQuantizedLinear): if module.weight_transposed: diff --git a/intel_extension_for_transformers/transformers/optimizer.py b/intel_extension_for_transformers/transformers/optimizer.py deleted file mode 100644 index db1eeb6a973..00000000000 --- a/intel_extension_for_transformers/transformers/optimizer.py +++ 
/dev/null @@ -1,466 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Optimization: provides the orchestrate optimizer for Pytorch.""" -import logging -import os -import shlex - -from neural_compressor.experimental import( - common, - Component, - Distillation, - Quantization, - Pruning, -) -from neural_compressor.experimental.scheduler import Scheduler -from intel_extension_for_transformers.transformers import( - DistillationConfig, - Provider, - QuantizationConfig, - PruningConfig -) -from intel_extension_for_transformers.transformers.utils.utility import LazyImport -from intel_extension_for_transformers.transformers.quantization import QuantizationMode -from transformers import PreTrainedModel, PretrainedConfig -from transformers.file_utils import WEIGHTS_NAME -from typing import Callable, Optional, Union, List - -torch = LazyImport("torch") - -logger = logging.getLogger(__name__) - - -class Orchestrate_optimizer: - """Orchestrate_optimizer aggregates and orchestrates components such as Quantization, Pruning and Distillation.""" - def __init__( - self, - model, - components: Optional[List[Component]] = [], - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - output_dir: Optional[str] = "saved_results", - ): - """Init an orchestrate optimizer. - - Args: - model: Model to quantize and/or prune. 
- components: List of Component objects which contains Quantization, Pruning, Distillation objects. - eval_func: Evaluation function to evaluate the tuning objective. - train_func: Training function which will be combined with pruning. - """ - if len(components) == 0: - raise RuntimeError("`NLPOptimizer` requires at least one `Quantization`, " - "`Pruning` or `Distillation` object") - self.output_dir = output_dir - if hasattr(model, 'config') and isinstance(model.config, PretrainedConfig): - self.model_config = model.config - self.enable_inc_quant = False - self.enable_inc_pruning = False - self.scheduler = Scheduler() - self.scheduler.model = common.Model(model) - - if len(components) > 1: - agent = self.scheduler.combine(*components) - agent.train_func = train_func - agent.eval_func = eval_func - for component in components: - if isinstance(component, Distillation) and hasattr(component, 'criterion'): - agent.criterion = component.criterion - if isinstance(component, Quantization): - self.enable_inc_quant = True - if isinstance(component, Pruning): - self.enable_inc_pruning = True - self.scheduler.append(agent) - else: - self.scheduler.append(*components) - - def fit(self): - """Run the scheduler.""" - self.opt_model = self.scheduler() - self.save_model(self.output_dir) - if self.enable_inc_pruning == True: - stats, sparsity = self.opt_model.report_sparsity() - logger.info(stats) - logger.info(sparsity) - return self.opt_model.model - - def save_model(self, output_dir, tokenizer=None): - """Save the model and tokenizer in the output directory. - - Args: - output_dir: the path to save config.json and pytorch_model.bin. - tokenizer (object, optional): the tokenizer object, use it if you want to - save tokenizer.json in output_dir. Defaults to None. 
- """ - os.makedirs(shlex.quote(output_dir), exist_ok=True) - torch.save(self.opt_model.quantized_state_dict(), os.path.join(shlex.quote(output_dir), WEIGHTS_NAME)) - if hasattr(self, 'model_config') and isinstance(self.model_config, PretrainedConfig): - if self.enable_inc_quant == True: - self.model_config.torch_dtype = "int8" - self.model_config.save_pretrained(output_dir) - if tokenizer: # pragma: no cover - tokenizer.save_pretrained(output_dir) - logger.info("orchestrate_optimizations model and configure file have saved to {}".format( - output_dir)) - - -class NoTrainerOptimizer: # pragma: no cover - """Optimizer without using Trainer.""" - def __init__( - self, - model, - output_dir: Optional[str] = "saved_results", - ): - """Init a NoTrainerOptimizer object. - - Args: - model: FP32 model specified for low precision tuning. - output_dir: The folder for saving the results. - """ - self.model = model - self.teacher_model = None - self._eval_func = None - self._train_func = None - self._calib_func = None - self._calib_dataloader = None - self.output_dir = output_dir - self.quant_config = None - self.pruning_config = None - self.distillation_config = None - self._provider = Provider.INC.value - self.pruner = None - self.quantizer = None - self.distiller = None - self.in_training = False - self.enable_inc_quant = False - - @property - def eval_func(self): - """Get the evaluation function.""" - return self._eval_func - - @property - def train_func(self): - """Get the train function.""" - return self._train_func - - @property - def calib_func(self): - """Get the calib function.""" - return self._calib_func - - @property - def provider(self): - """Get the provider.""" - return self._provider - - @property - def calib_dataloader(self): - """Get the calibration dataloader.""" - return self._calib_dataloader - - @eval_func.setter - def eval_func(self, func: Callable): - """Set the evaluation function. - - Args: - func: evaluation function. 
- """ - self._eval_func = func - - @train_func.setter - def train_func(self, func: Callable): - """Set the train function. - - Args: - func: train function. - """ - self._train_func = func - - @provider.setter - def provider(self, provider): - """Set the provider. - - Args: - provider: optimization provider. - """ - self._provider = provider - - @calib_dataloader.setter - def calib_dataloader(self, dataloader): - """Set the calibration dataloader. - - Args: - dataloader: calibration dataloader. - """ - # transformer issue #1 - if dataloader.batch_size is None: - from .utils.utility import _build_inc_dataloader - self._calib_dataloader = _build_inc_dataloader(dataloader) - else: - self._calib_dataloader = dataloader - - def init_quantizer( - self, - quant_config, - provider: str = Provider.INC.value, - ): - """Init a Quantization object with config. - - Args: - quant_config: quantization config. - provider: define the quantization provider. - """ - from neural_compressor.experimental import Quantization - - assert isinstance(quant_config, QuantizationConfig), \ - "Please pass QuantizationConfig instance to trainer.quantize!" 
- self.quant_config = quant_config - self.metrics = self.quant_config.metrics - self._provider = Provider[provider.upper()].value - - if self.quant_config.framework == "pytorch": - if self.quant_config.approach == \ - QuantizationMode.POSTTRAININGDYNAMIC.value: - self.quant_config.framework = "pytorch" - else: - self.quant_config.framework = "pytorch_fx" - - quantizer = Quantization(self.quant_config.inc_config) - quantizer.model = common.Model(self.model) - - self.quantizer = quantizer - return quantizer - - def _inc_quantize( - self, - quant_config, - provider: str = Provider.INC.value, - ): - """Do the quantization.""" - if self.quantizer is None: - self.init_quantizer(quant_config=quant_config, provider=provider) - if self._eval_func is not None: - self.quantizer.eval_func = self._eval_func - if self._calib_func is not None: - self.quantizer.calib_func = self._calib_func - if self.quant_config.approach == QuantizationMode.POSTTRAININGSTATIC.value: - assert self._calib_dataloader is not None, \ - "Please pass calib_dataloader to NoTrainerOptimizer.calib_dataloader" - self.quantizer.calib_dataloader = self._calib_dataloader - elif self.quant_config.approach == QuantizationMode.QUANTIZATIONAWARETRAINING.value: - assert self._train_func is not None, \ - "Please pass train_func to NoTrainerOptimizer.train_func" - self.quantizer.q_func = self._train_func - self.opt_model = self.quantizer.fit() - self.enable_inc_quant = True - self.save_model(self.output_dir) - return self.opt_model.model - - def quantize( - self, - quant_config: QuantizationConfig = None, - provider: str = Provider.INC.value, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - calib_func: Optional[Callable] = None, - calib_dataloader=None, - ): - """Prepare for invoking the _inc_quantize function. - - Args: - quant_config: quantization config. - provider: define the quantization provider. - eval_func: evaluation function. - train_func: train function. 
- calib_func: calibration function. - calib_dataloader: calibration dataloader. - """ - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - if calib_func is not None: - self._calib_func = calib_func - if calib_dataloader is not None: - self._calib_dataloader = calib_dataloader - - if self.quantizer is None: - self._provider = Provider[provider.upper()].value - - if self._provider == Provider.INC.value: - return self._inc_quantize(quant_config=quant_config, provider=provider) - else: - assert False, "Unsupported provider:{}".format(self._provider) - - def init_pruner( - self, - pruning_config = None, - provider: str = Provider.INC.value, - ): - """Init a Pruning object with config. - - Args: - pruning_config: pruning config. - provider: define the pruning provider. - """ - from neural_compressor.experimental import Pruning - self.pruning_config = pruning_config - self.metrics = self.pruning_config.metrics - self._provider = Provider[provider.upper()].value - - assert isinstance(self.pruning_config, PruningConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" - - pruner = Pruning(self.pruning_config.inc_config) - pruner.model = common.Model(self.model) - - self.pruner = pruner - return pruner - - def prune( - self, - pruning_config = None, - provider: str = Provider.INC.value, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - ): - """Do the pruning. - - Args: - pruning_config: pruning config. - provider: define the pruning provider. - eval_func: evaluation function. - train_func: train function. 
- """ - if self.pruner is None: - self.init_pruner(pruning_config=pruning_config, provider=provider) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - - self.pruner.eval_func = self._eval_func - - self.pruner.pruning_func = self._train_func - - self.opt_model = self.pruner.fit() - self.save_model(self.output_dir) - stats, sparsity = self.opt_model.report_sparsity() - logger.info(stats) - logger.info(sparsity) - - return self.opt_model.model - - def init_distiller( - self, - distillation_config, - teacher_model, - provider: str = Provider.INC.value, - ): - """Init a Distillation object with config and the teacher model. - - Args: - distillation_config: distillation config. - teacher_model: set the teacher model. - provider: define the distillation provider. - """ - from neural_compressor.experimental import Distillation, common - assert isinstance(distillation_config, DistillationConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" - self.distillation_config = distillation_config - self._provider = Provider[provider.upper()].value - self.metrics = self.distillation_config.metrics - self.teacher_model = teacher_model - - distiller = Distillation(self.distillation_config.inc_config) - distiller.model = common.Model(self.model) - distiller.teacher_model = common.Model(self.teacher_model) - - self.distiller = distiller - return distiller - - def distill( - self, - distillation_config, - teacher_model, - provider: str = Provider.INC.value, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - ): - """Do the distillation. - - Args: - distillation_config: distillation config. - teacher_model: set the teacher model. - provider: define the distillation provider. - eval_func: evaluation function. - train_func: train function. 
- """ - if self.distiller is None: - self.init_distiller( - distillation_config=distillation_config, - teacher_model=teacher_model, - provider=provider - ) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - - self.distiller.eval_func = self._eval_func - self.distiller.train_func = self._train_func - self.distiller.create_criterion() - - self.opt_model = self.distiller.fit() - self.save_model(self.output_dir) - return self.opt_model.model - - def _save_inc_int8(self, opt_model, output_dir): - """Save the optimized model in the output directory. - - Args: - opt_model: optimized model. - output_dir: output path. - """ - self.model.config.architectures = [self.model.__class__.__name__] - self.model.config.torch_dtype = "int8" - if isinstance(self.model.config, PretrainedConfig): - self.model.config.save_pretrained(output_dir) - weights_file = os.path.join(os.path.abspath( - os.path.expanduser(output_dir)), WEIGHTS_NAME) - torch.save(opt_model.quantized_state_dict(), weights_file) - - def save_model(self, output_dir, tokenizer=None): - """Save the model and tokenizer in the output directory. - - Args: - output_dir: the path to save config.json and pytorch_model.bin. - tokenizer (object, optional): the tokenizer object, use it if you want to - save tokenizer.json in output_dir. Defaults to None. 
- """ - os.makedirs(shlex.quote(output_dir), exist_ok=True) - torch.save(self.opt_model.quantized_state_dict(), os.path.join(shlex.quote(output_dir), WEIGHTS_NAME)) - if self.enable_inc_quant and self.opt_model: - self._save_inc_int8(self.opt_model, output_dir) - else: - self.model.save_pretrained(output_dir) - self.model.config.save_pretrained(output_dir) - if tokenizer: # pragma: no cover - tokenizer.save_pretrained(output_dir) - logger.info("Optimized model and configure file have saved to {}".format( - output_dir)) diff --git a/intel_extension_for_transformers/transformers/optimizer_tf.py b/intel_extension_for_transformers/transformers/optimizer_tf.py deleted file mode 100644 index e1ee4f7b416..00000000000 --- a/intel_extension_for_transformers/transformers/optimizer_tf.py +++ /dev/null @@ -1,733 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""TFOptimization: provides the optimization class for Tensorflow.""" -import logging -import pstats -import numpy as np -import os -import time -from neural_compressor import __version__ -from neural_compressor.experimental import common -try: - from neural_compressor.model.model import saved_model_session, get_model_type -except ImportError: - from neural_compressor.model.tensorflow_model import saved_model_session, get_model_type -from intel_extension_for_transformers.transformers import (DistillationConfig, - QuantizationConfig, - PruningConfig) -from intel_extension_for_transformers.transformers.quantization import QuantizationMode -from intel_extension_for_transformers.transformers.utils.metrics import Metric -from intel_extension_for_transformers.transformers.utils.utility import LazyImport -from packaging import version -from transformers import PreTrainedModel -from typing import Callable, Optional, List -from .utils.utility_tf import TFDataloader, TMPPATH, TEACHERPATH, get_filepath - -tf = LazyImport("tensorflow") -logger = logging.getLogger(__name__) -logger.setLevel('INFO') - - -class TFOptimization: - """TFOptimization is the entry class for Tensorflow to use the optimization techniques in neural compressor.""" - def __init__(self, - model: PreTrainedModel, - args, - train_dataset=None, - eval_dataset=None, - compute_metrics: Optional[Callable] = None, - criterion=None, - optimizer=None, - task_type=None, - task_id=None, - strategy=None): - """Init a TFOptimziation object. - - Args: - model: FP32 model specified for low precision tuning - args: Training arguments for TF - train_dataset: Training datas - eval_dataset: Evaluation data - compute_metrics: Metrics computing function during the evaluation process - criterion: Tuning criterion - optimizer: The optimizer you used - task_type: Used for distributed multi-node settings. Default set as "worker" - task_id: Used for distributed multi-node settings. - Set as 0 on the leader node and 1, 2, 3... 
on the followers - strategy: Equals to MultiWorkerMirroredStrategy if use distributed distributed multi-node settings, - otherwise None - """ - self.model = model - self.teacher_model = None - self.component = None - self.eval_dataset = eval_dataset - self.train_dataset = train_dataset - self._eval_func = None - self._train_func = None - self.quant_config = None - self.pruning_config = None - self.distillation_config = None - self.pruner = None - self.quantizer = None - self.distiller = None - self.in_training = False - self._input_names = None - self._output_names = None - self._inputs = None - self.compute_metrics = compute_metrics - self.args = args - self.optimizer = optimizer - self.task_type = task_type - self.task_id = task_id - self.criterion = criterion if criterion is not None else \ - self.model.loss if hasattr(self.model, "loss") else None - self.model.save_pretrained(get_filepath(TMPPATH, self.task_type, self.task_id), saved_model=True) - _, self.input_names, self.output_names = saved_model_session( - os.path.join(get_filepath(TMPPATH, self.task_type, self.task_id), "saved_model/1"), input_tensor_names=[], - output_tensor_names=[]) - self.eval_distributed = False - self.strategy = strategy - - @property - def inputs(self): - """Get the inputs.""" - return self._inputs - - @inputs.setter - def inputs(self, inputs: dict): - """Set the inputs.""" - self._inputs = inputs - - @property - def input_names(self): - """Get the input names.""" - return self._input_names - - @input_names.setter - def input_names(self, input_names: List): - """Set the input names. - - Args: - input_names: the names of inputs. - """ - self._input_names = input_names - - @property - def output_names(self): - """Get the output names.""" - return self._output_names - - @output_names.setter - def output_names(self, output_names: List): - """Set the output names. - - Args: - output_names: the names of outputs. 
- """ - self._output_names = output_names - - @property - def eval_func(self): - """Get the evaluation function.""" - return self._eval_func - - @eval_func.setter - def eval_func(self, func: Callable): - """Set the evaluation function. - - Args: - func: evaluation function. - """ - self._eval_func = func - - @property - def train_func(self): - """Get the training function.""" - return self._train_func - - @train_func.setter - def train_func(self, func: Callable): - """Set the training function. - - Args: - func: train function. - """ - self._train_func = func - - @property - def train_dataset(self): - """Get the training dataset.""" - return self._train_dataset - - @train_dataset.setter - def train_dataset(self, train_dataset): - """Set the training dataset. - - Args: - train_dataset: train dataset. - """ - assert isinstance(train_dataset, tf.data.Dataset) or train_dataset is None, \ - "train_dataset should be obj of tf.data.Dataset" - self._train_dataset = train_dataset - - @property - def eval_dataset(self): - """Get the evaluation dataset.""" - return self._eval_dataset - - @eval_dataset.setter - def eval_dataset(self, eval_dataset): - """Set the evaluation dataset. - - Args: - eval_dataset: evaluation dataset. - """ - assert isinstance(eval_dataset, tf.data.Dataset) or eval_dataset is None, \ - "eval_dataset should be obj of tf.data.Dataset" - self._eval_dataset = eval_dataset - - def builtin_eval_func(self, model): - """Customize Evaluate function to inference the model for specified metric on the validation dataset. - - Args: - model ([tf.saved_model.load]): The model will be the class of tf.saved_model.load(quantized_model_path). - - Returns: - [float]: evaluation result, the larger is better. 
- """ - model_type = None - label_ids: np.ndarray = None - try: - model_type = get_model_type(model) - except ValueError: - logger.info("use keras savedModel") - - num_examples = sum(1 for _ in ( - self._eval_dataset.unbatch() if hasattr(self._eval_dataset, "unbatch") else self._eval_dataset)) - logger.info(f"***** Running Evaluation *****") - logger.info(f" Num examples in dataset = {num_examples}") - logger.info(f" Batch size = {self.args.per_device_eval_batch_size}") - - if model_type is None: - preds: np.ndarray = None - infer = model.signatures["serving_default"] - - for idx, (inputs, labels) in enumerate(self._eval_dataset): - for name in inputs: - inputs[name] = tf.constant(inputs[name].numpy(), dtype=infer.inputs[0].dtype) - - results = infer(**inputs) - for val in results: - if preds is None: - preds = results[val].numpy() - else: - preds = np.append(preds, results[val].numpy(), axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - test_predictions = {"logits": preds} - eval_metrics = self.compute_metrics(test_predictions, label_ids) - acc = eval_metrics["accuracy"] - return acc - else: # pragma: no cover - from neural_compressor.adaptor.tf_utils.util import get_tensor_by_name - input_tensor = [get_tensor_by_name(\ - model, x) for x in self.input_names] - output_tensor = [get_tensor_by_name(\ - model, x) for x in self.output_names] - - logger.info("Start to evaluate the TensorFlow model.") - - total_time = 0 - config = tf.compat.v1.ConfigProto() - config.use_per_session_threads = 1 - config.inter_op_parallelism_threads = 1 - sess = tf.compat.v1.Session(graph=model, config=config) - feed_dict = {} - label_ids: np.ndarray = None - preds: np.ndarray = None - for idx, (inputs, labels) in enumerate(self._eval_dataset): - assert len(input_tensor) == len(inputs), \ - 'inputs 
len must equal with input_tensor' - feed_dict = {} - for name in inputs: - for tensor in input_tensor: - pos = tensor.name.rfind(":") - t_name = tensor.name if pos < 0 else tensor.name[:pos] - if name == t_name: - feed_dict[tensor] = inputs[name].numpy() - break - - start = time.time() - logits = sess.run(output_tensor, feed_dict) - total_time += time.time() - start - if not self.args.prediction_loss_only: - if isinstance(logits, tuple): - logits = logits[0] - - if isinstance(labels, tuple): - labels = labels[0].numpy() - - if isinstance(logits, - list) and len(logits) > 1: # pragma: no cover - for val in logits: - if preds is None: - preds = val - else: - preds = np.append(preds, val, axis=0) - - for val in labels: - if label_ids is None: - label_ids = val.numpy() - else: - label_ids = np.append(label_ids, - val.numpy(), - axis=0) - else: - if preds is None: - preds = logits[0] if isinstance(logits, - list) else logits - else: - preds = np.append( - preds, - logits[0] if isinstance(logits, list) else logits, - axis=0) - - if label_ids is None: - label_ids = labels[0].numpy() if isinstance( - labels, list) else labels.numpy() - else: - label_ids = np.append( - label_ids, - labels[0].numpy() - if isinstance(labels, list) else labels.numpy(), - axis=0) - - if self.compute_metrics is not None and preds is not None and label_ids is not None: - try: - loss = self.criterion( - label_ids, preds) if self.criterion is not None else None - except Exception as e: # pragma: no cover - logger.info(e) - logger.info("There is no loss function or loss compute error, \ - Please compute loss in compute_metrics function" - ) - loss = None - results = self.compute_metrics({"logits": preds}, label_ids) - if loss is not None: - results["loss"] = loss.numpy() - - if isinstance(self.metrics, list): - nums = len(self.metrics) - for metric in self.metrics: - assert metric.name in results.keys(), \ - "Please set metric from {}".format(results.keys()) - if nums == 1: - result = 
results.get(self.metrics[0].name) - else: # pragma: no cover - result = 0 - for metric in self.metrics: - assert metric.weight_ratio is not None, \ - "Please set weights for metric if you want to use more than one metric" - result += results[metric.name] * metric.weighted - logger.info("metric Accuracy: {}".format(result)) - elif isinstance(self.metrics, Metric): - assert self.metrics.name in results.keys(), \ - "Please set metric from {}".format(results.keys()) - result = results.get(self.metrics.name) - logger.info("metric Accuracy: {}".format(result)) - else: # pragma: no cover - assert False, "Please set the correct metrics format from the README" - else: - result = 0 - logger.info("Throughput: {} samples/sec".format(num_examples / total_time)) - return result - - def init_quantizer( - self, - quant_config, - ): - """Init a Quantization object with config. - - Args: - quant_config: quantization config. - """ - from neural_compressor.experimental import Quantization - - self.quant_config = QuantizationConfig() if quant_config is None else quant_config - self.quant_config.framework = "tensorflow" - self.metrics = self.quant_config.metrics - - quantizer = Quantization(self.quant_config.inc_config) - quantizer.model = common.Model( - os.path.join(get_filepath(TMPPATH, self.task_type, self.task_id),"saved_model/1"), modelType="saved_model") - - self.quantizer = quantizer - return quantizer - - def _inc_quantize( - self, - quant_config, - ): - """Do the quantization. - - Args: - quant_config: quantization config. - """ - if self.quantizer is None: - self.init_quantizer(quant_config=quant_config) - if self._eval_func is not None: - self.quantizer.eval_func = self._eval_func - else: - assert self.metrics is not None, \ - "Please pass the metrics to QuantizationConfig.metrics!" 
- self.quantizer.eval_func = self.builtin_eval_func - - if self.quant_config.approach == QuantizationMode.POSTTRAININGSTATIC.value: - if self._train_dataset is not None: - self.quantizer.calib_dataloader = TFDataloader( - self._train_dataset, - batch_size=self.args.per_device_train_batch_size) - elif self._eval_dataset is not None: - self.quantizer.calib_dataloader = TFDataloader( - self._eval_dataset, - batch_size=self.args.per_device_eval_batch_size) - else: # pragma: no cover - assert False, "Please pass calibration dataset to TFNoTrainerOptimizer.calib_dataloader" - elif self.quant_config.approach == QuantizationMode.QUANTIZATIONAWARETRAINING.value: # pragma: no cover - assert False, \ - "Unsupported quantization aware training for tensorflow framework" - - opt_model = self.quantizer.fit() - opt_model.save(self.args.output_dir) - logger.info( - "quantized model have saved to {}".format(self.args.output_dir) - ) - return opt_model.model - - def quantize( - self, - quant_config: QuantizationConfig = None, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - train_dataset=None, - eval_dataset=None, - ): - """Prepare for invoking INC quantize function. - - Args: - quant_config: quantization config. - eval_func: evaluation function. - train_func: train function. - train_dataset: train dataset. - eval_dataset: evaluation dataset. - """ - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - if train_dataset is not None: - self.train_dataset = train_dataset - - if eval_dataset is not None: - self.eval_dataset = eval_dataset - - return self._inc_quantize(quant_config=quant_config) - - def init_pruner( - self, - pruning_config=None, - ): - """Init a Pruning object with config. - - Args: - pruning_config: pruning config. 
- """ - from neural_compressor.experimental import Pruning - if pruning_config.framework != 'tensorflow': - logger.warning('pruning_config.framework is {}, should be tensorflow'.format(pruning_config.framework)) - pruning_config.framework = 'tensorflow' - self.pruning_config = pruning_config - self.metrics = self.pruning_config.metrics - - assert isinstance(self.pruning_config, PruningConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" - - pruner = Pruning(self.pruning_config.inc_config) - pruner.model = os.path.join(get_filepath(TMPPATH, self.task_type, self.task_id), "saved_model/1") - pruner.model.model_type = "saved_model" - - self.pruner = pruner - self.component = pruner - return pruner - - def prune( - self, - pruning_config=None, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - train_dataset=None, - eval_dataset=None, - ): - """Do the pruning. - - Args: - pruning_config: pruning config. - eval_func: evaluation function. - train_func: train function. - train_dataset: train dataset. - eval_dataset: evaluation dataset. - """ - if self.pruner is None: - self.init_pruner(pruning_config=pruning_config) - if eval_func is not None: - self.eval_func = eval_func - if train_func is not None: - self.train_func = train_func - - if train_dataset is not None: - self.train_dataset = train_dataset - - if eval_dataset is not None: - self.eval_dataset = eval_dataset - - if self._eval_func is not None: - self.pruner.eval_func = self._eval_func - else: - assert self.metrics is not None, \ - "Please pass the metrics to PruningConfig.metrics!" 
- self.pruner.eval_func = self.builtin_eval_func - - if self.train_func is not None: - if version.parse(__version__) <= version.parse("1.12"): - self.pruner.pruning_func = self._train_func - else: - self.pruner.train_func = self._train_func - else: - if version.parse(__version__) <= version.parse("1.12"): - self.pruner.pruning_func = self.build_train_func - else: - self.pruner.train_func = self.build_train_func - - opt_model = self.pruner.fit() - stats, sparsity = opt_model.report_sparsity() - logger.info(stats) - logger.info(sparsity) - - opt_model.save(self.args.output_dir) - logger.info( - "pruned model have saved to {}".format(self.args.output_dir) - ) - return opt_model.model - - def init_distiller( - self, - distillation_config, - teacher_model: PreTrainedModel, - ): - """Init a Distillation object with config and the teacher model. - - Args: - distillation_config: distillation config. - teacher_model: set the teacher model. - """ - from neural_compressor.experimental import Distillation - assert isinstance(distillation_config, DistillationConfig), \ - "please pass a instance of DistillationConfig to trainer.distill!" - - def train_step(data): - if len(data) == 3: - x, y, sample_weight = data # pragma: no cover - else: - sample_weight = None - x, y = data - with tf.GradientTape() as tape: - y_pred = self.model(x) - teacher_outputs = self.distiller.criterion.teacher_model_forward( - input=x, teacher_model=teacher_model) - - loss = self.model.compute_loss(x, y, y_pred, sample_weight) - # _on_after_compute_loss(self, input, student_output, student_loss, teacher_output=None) - # TODO: check, combile - loss = self.distiller.on_after_compute_loss( - x, y_pred.logits, loss, teacher_outputs.logits) - self.model._validate_target_and_loss(y, loss) - # Run backwards pass. 
- self.model.optimizer.minimize(loss, - self.model.trainable_variables, - tape=tape) - return self.model.compute_metrics(x, y, y_pred, sample_weight) - - self.model.train_step = train_step - # re-compile - self.model.compile( - optimizer=self.model.optimizer, - loss=self.model.loss, - metrics=self.model.compiled_metrics._user_metrics - ) - - if distillation_config.framework != 'tensorflow': - logger.warning( - 'distillation_config.framework is {}, should be tensorflow'. - format(distillation_config.framework)) - distillation_config.framework = 'tensorflow' - self.distillation_config = distillation_config - self.metrics = self.distillation_config.metrics - self.teacher_model = teacher_model - - distiller = Distillation(self.distillation_config.inc_config) - distiller.model = os.path.join(TMPPATH, "saved_model/1") - distiller.model.model_type = "saved_model" - self.teacher_model.save_pretrained(TEACHERPATH, saved_model=True) - distiller.teacher_model = os.path.join(TEACHERPATH, "saved_model/1") - distiller.teacher_model.model_type = "saved_model" - - self.distiller = distiller - self.component = distiller - return distiller - - def distill( - self, - distillation_config, - teacher_model: PreTrainedModel, - eval_func: Optional[Callable] = None, - train_func: Optional[Callable] = None, - ): - """Do the distillation. - - Args: - distillation_config: distillation config. - teacher_model: set the teacher model. - eval_func: evaluation function. - train_func: train function. 
- """ - if self.distiller is None: - self.init_distiller( - distillation_config=distillation_config, - teacher_model=teacher_model, - ) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - else: - self._train_func = self.build_train_func - - self.distiller.eval_func = self._eval_func - self.distiller.train_func = self._train_func - self.distiller.create_criterion() - - opt_model = self.distiller.fit() - opt_model.save(self.args.output_dir) - logger.info( - "distilled model have saved to {}".format(self.args.output_dir) - ) - - return opt_model.model - - def model_builder_builtin(self, arch_paras=None, model_cls=None): - """Specify model_cls to use the built-in model builder. - - Args: - arch_paras: architecture parameters. - model_cls: model information. - """ - config = self.model.config - if arch_paras is not None: - assert isinstance(arch_paras, dict), "Expect arch_paras to be a dict." - for k in arch_paras: - if hasattr(config, k): - config.__setattr__(k, arch_paras[k]) - # for MobileBERT, 'intra_bottleneck_size' is associated with - # 'true_hidden_size', and must have the same values. - if k == 'intra_bottleneck_size': - config.__setattr__('true_hidden_size', arch_paras[k]) - return model_cls.from_config(config) - - def build_train_func(self, model): - """Build the training function for pruning or distillation. 
- - Args: - model (object): the input model - """ - tf.random.set_seed(1) - epochs = 1 - - component = self.component - prune_model = self.model - model_path = get_filepath(TMPPATH, self.task_type, self.task_id) - - if 'distillation' in self.component.cfg: - epochs = max(epochs, self.component.cfg.distillation.train.get("epoch", 1)) - hooks = self.component.hooks - if 'pruning' in self.component.cfg: - epochs = max(epochs, self.component.cfg.pruning.train.get("epoch", 1)) - callbacks = self.pruner.callbacks - hooks = callbacks['tf_pruning'](self.pruner.model, self.model, - self.pruner.hooks) - - class callback(tf.keras.callbacks.Callback): - def on_train_begin(self, logs=None): - if version.parse(__version__) <= version.parse("1.12"): - hooks['pre_epoch_begin']() # pragma: no cover - else: - hooks['on_train_begin']() - - def on_train_end(self, logs=None): - if version.parse(__version__) <= version.parse("1.12"): - hooks['post_epoch_end']() # pragma: no cover - else: - hooks['on_train_end']() - - def on_epoch_begin(self, epoch, logs=None): - # pylint: disable=E1121 - hooks['on_epoch_begin'](epoch) - - def on_epoch_end(self, epoch, logs=None): - component.model._session = None - prune_model.save_pretrained(model_path, saved_model=True) - component.model = os.path.join(model_path, "saved_model/1") - component.model.model_type = "saved_model" - component.model.sess - hooks['on_epoch_end']() - - # pylint: disable=E1121 - def on_train_batch_begin(self, batch, logs=None): - if version.parse(__version__) <= version.parse("1.12"): - hooks['on_batch_begin'](batch) # pragma: no cover - else: - hooks['on_step_begin'](batch) - - def on_train_batch_end(self, batch, logs=None): - if version.parse(__version__) <= version.parse("1.12"): - hooks['on_batch_end']() # pragma: no cover - else: - hooks['on_step_end']() - - self.model.fit(self.train_dataset, - validation_data=self.eval_dataset, - epochs=epochs, - callbacks=[callback()]) - self.component.model._session = None - 
self.model.save_pretrained(get_filepath(TMPPATH, self.task_type, self.task_id), saved_model=True) diff --git a/intel_extension_for_transformers/transformers/pruning.py b/intel_extension_for_transformers/transformers/pruning.py deleted file mode 100644 index 9ed8688ebe4..00000000000 --- a/intel_extension_for_transformers/transformers/pruning.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Pruning: specify the supported pruning mode.""" - -from packaging import version -from enum import Enum -from neural_compressor.conf.config import Pruner as INCPruner -from typing import Dict, List -from neural_compressor import __version__ as nc_version - - -class PruningMode(Enum): - """Currently support three pruning modes.""" - BASICMAGNITUDE = "basic_magnitude" - PATTERNLOCK = "pattern_lock" - GROUPLASSO = "group_lasso" - - -SUPPORTED_PRUNING_MODE = set([approach.name for approach in PruningMode]) - - -class PrunerConfig(INCPruner): - """Pruner configuration.""" - def __init__(self, epoch_range: List=[0, 4], initial_sparsity_ratio: float=0.0, - target_sparsity_ratio: float=0.97, update_frequency: int=1, - prune_type: str='BasicMagnitude', method: str='per_tensor', - names: List=[], parameters: Dict=None): - """Init the pruner config. - - Args: - epoch_range: A list with length of 2. 
The first element is the start epoch and the second element - is the end epoch. Pruning will be done from the start epoch to the end epoch. - initial_sparsity_ratio: Initial sparsity goal - target_sparsity_ratio: Target sparsity goal - update_frequency: How many epochs to update once - prune_type: "BasicMagnitude", "PatternLock", or "GroupLasso" - method: TODO (Remove this parameter) - names: A list of layer names that need to be pruned - parameters: A dictionary of extra parameters - """ - if epoch_range is not None: - assert len(epoch_range) == 2, "Please set the epoch_range as [start_epoch, end_epoch]" - self.start_epoch = epoch_range[0] - self.end_epoch = epoch_range[1] - else: # pragma: no cover - self.start_epoch = None - self.end_epoch = None - self.update_frequency = update_frequency - self.target_sparsity = target_sparsity_ratio - self.initial_sparsity = initial_sparsity_ratio - self.update_frequency = update_frequency - assert prune_type.upper() in SUPPORTED_PRUNING_MODE, \ - "prune_type only support {}!".format( - [mode.lower() for mode in SUPPORTED_PRUNING_MODE] - ) - self.prune_type = PruningMode[prune_type.upper()].value - self.method = method - self.names = names - self.parameters = parameters diff --git a/intel_extension_for_transformers/transformers/quantization.py b/intel_extension_for_transformers/transformers/quantization.py index 529abb931f5..d8cc47eaab2 100644 --- a/intel_extension_for_transformers/transformers/quantization.py +++ b/intel_extension_for_transformers/transformers/quantization.py @@ -23,13 +23,3 @@ require_version("neural_compressor>=1.9.0") except: require_version("neural_compressor_full>=1.9.0", "To fix: pip install neural_compressor") - - -class QuantizationMode(Enum): - """Currently support three quantization modes.""" - POSTTRAININGSTATIC = "post_training_static_quant" - POSTTRAININGDYNAMIC = "post_training_dynamic_quant" - QUANTIZATIONAWARETRAINING = "quant_aware_training" - - -SUPPORTED_QUANT_MODE = set([approach.name for 
approach in QuantizationMode]) diff --git a/intel_extension_for_transformers/transformers/runtime/test/kernels/benchmark/ci/benchmark b/intel_extension_for_transformers/transformers/runtime/test/kernels/benchmark/ci/benchmark deleted file mode 100755 index afc7f483231..00000000000 Binary files a/intel_extension_for_transformers/transformers/runtime/test/kernels/benchmark/ci/benchmark and /dev/null differ diff --git a/intel_extension_for_transformers/transformers/trainer.py b/intel_extension_for_transformers/transformers/trainer.py index e3720b5da59..4d4b378353e 100644 --- a/intel_extension_for_transformers/transformers/trainer.py +++ b/intel_extension_for_transformers/transformers/trainer.py @@ -27,18 +27,20 @@ import warnings from functools import partial from neural_compressor import __version__ as nc_version -from neural_compressor.experimental import Component from neural_compressor.utils import logger from intel_extension_for_transformers.transformers import ( - DistillationConfig, Provider, - PruningMode, - QuantizationConfig, - QuantizationMode, - PruningConfig, DynamicLengthConfig, BenchmarkConfig, ) +from neural_compressor.training import prepare_compression +from neural_compressor.quantization import fit +from neural_compressor.config import ( + DistillationConfig, + WeightPruningConfig, + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, +) from intel_extension_for_transformers.transformers.benchmark import benchmark from intel_extension_for_transformers.transformers.utils.metrics import Metric from intel_extension_for_transformers.transformers.utils.utility import LazyImport @@ -128,11 +130,7 @@ def __init__(self, *args, **kwargs): self._calib_dataloader = None self._resuming_checkpoint = None self.compression_ctrl = None - self.component = None self.enable_inc_quant = False - self.pruner = None - self.quantizer = None - self.distiller = None self.fp32_model = None self.opt_model = None # This flag is set for the engine in the 
export_to_int8_onnx API. @@ -141,7 +139,8 @@ def __init__(self, *args, **kwargs): self.orchestrate_opt = False self.orchestrate_opt_pruning = False self.dynamic_config = None - + self.model_config = None + self.compression_manager = None @property def resuming_checkpoint(self): @@ -239,7 +238,7 @@ def builtin_train_func(self, model): """ self.model_wrapped = model self.model = model - train_result = self.train(component=self.component, + train_result = self.train(compression_manager=self.compression_manager, resume_from_checkpoint=self._resuming_checkpoint) metrics = train_result.metrics if not self.orchestrate_opt: @@ -249,42 +248,6 @@ def builtin_train_func(self, model): self.save_state() return self.model - def init_quantizer( - self, - quant_config, - provider: str = Provider.INC.value, - ): - """Initialize the quantizer. - - Args: - quant_config: The path to the YAML configuration file or QuantizationConfig class containing - accuracy goal, quantization objective and related dataloaders etc. - provider: The provider used to quantize. - - Returns: - An objective of neural_compressor Quantization class, which can automativally searches for - optimal quantization recipes for low precision model inference and achieving best tuning - objectives. - """ - from neural_compressor.experimental import Quantization - - assert isinstance(quant_config, QuantizationConfig), \ - "Please pass QuantizationConfig instance to trainer.quantize!" 
- self.quant_config = quant_config - self.metrics = self.quant_config.metrics - self._provider = Provider[provider.upper()].value - - if self.quant_config.framework == "pytorch": - if self.quant_config.approach != QuantizationMode.POSTTRAININGDYNAMIC.value \ - or self.quant_config.strategy == 'mse_v2': - self.quant_config.framework = "pytorch_fx" - - quantizer = Quantization(self.quant_config.inc_config) - quantizer.model = self.model - - self.quantizer = quantizer - return quantizer - def _inc_quantize( self, quant_config, @@ -295,34 +258,29 @@ def _inc_quantize( self.fp32_model = copy.deepcopy(self.model) except Exception as e: # pragma: no cover logger.warning("Model deepcopy failed: {}!".format(repr(e))) - if self.quantizer is None: - self.init_quantizer(quant_config=quant_config, provider=provider) - if self._eval_func is not None: - self.quantizer.eval_func = self._eval_func - else: # pragma: no cover - assert self.metrics is not None, \ - "Please pass the metrics to QuantizationConfig.metrics!" 
- self.quantizer.eval_func = self.builtin_eval_func - - if self.quant_config.framework == "pytorch_ipex": - self.model_config = self.model.config # jit model will loss config - if self.quant_config.approach != QuantizationMode.POSTTRAININGDYNAMIC.value \ - or self.quant_config.strategy == 'mse_v2': - # pylint: disable=E1101 - self.quantizer.calib_dataloader = self.get_train_dataloader() \ - if self._calib_dataloader is None else self._calib_dataloader - if self.quant_config.approach == QuantizationMode.QUANTIZATIONAWARETRAINING.value: - self.quantizer.q_func = \ - self.builtin_train_func if self._train_func is None else self._train_func - self.component = self.quantizer - self.opt_model = self.quantizer.fit() + if isinstance(quant_config, PostTrainingQuantConfig): + if quant_config.backend == "ipex": + self.model_config = self.model.config # jit model will loss config + if self._calib_dataloader is None: + self._calib_dataloader = self.get_train_dataloader() + self.opt_model = fit(self.model, + conf=quant_config, + calib_dataloader=self._calib_dataloader, + eval_func=self._eval_func) + else: + compression_manager = prepare_compression(self.model, quant_config) + self.compression_manager = compression_manager + self.compression_manager.callbacks.on_train_begin() + self._train_func(compression_manager.model._model) + self.compression_manager.callbacks.on_train_end() + self.opt_model = self.compression_manager.model self.enable_inc_quant = True self.save_model(self.args.output_dir) return self.opt_model.model def quantize( self, - quant_config: QuantizationConfig = None, + quant_config: Union[PostTrainingQuantConfig, QuantizationAwareTrainingConfig] = None, provider: str = Provider.INC.value, eval_func: Optional[Callable] = None, train_func: Optional[Callable] = None, @@ -331,7 +289,7 @@ def quantize( """The main entry point of automatic quantization tuning. 
Args: - quant_config: The path to the YAML configuration file or QuantizationConfig class containing + quant_config: QuantizationConfig class containing accuracy goal, quantization objective and related dataloaders etc. provider: The provider used to quantize. eval_func (:obj:`Callable`, optional): The function used to evaluate the model. @@ -348,9 +306,6 @@ def quantize( if calib_dataloader is not None: self._calib_dataloader = calib_dataloader - if self.quantizer is None: - self._provider = Provider[provider.upper()].value - if self._provider == Provider.INC.value: return self._inc_quantize(quant_config=quant_config, provider=provider) else: @@ -375,54 +330,9 @@ def _save_inc_int8(self, opt_model, output_dir): torch.save(opt_model.quantized_state_dict(), weights_file) logger.info("quantized model and configure file have saved to {}".format(output_dir)) - def init_pruner( - self, - pruning_config=None, - provider: str = Provider.INC.value, - ): - """Initialize the pruner. - - Args: - pruning_config: The path to the YAML configuration file or PruningConf class containing - accuracy goal, pruning objective and related dataloaders etc. - provider: The provider used to quantize. - - Returns: - An objective of neural_compressor Pruning class. - """ - - from neural_compressor.experimental import Pruning - self.pruning_config = pruning_config - self.metrics = self.pruning_config.metrics - self._provider = Provider[provider.upper()].value - - assert isinstance(self.pruning_config, PruningConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" - - pruning_start_epoch, pruning_end_epoch = self.pruning_config.epoch_range - - # pylint: disable=E1101 - if pruning_start_epoch > self.args.num_train_epochs - 1: - logger.warning(f"Pruning end epoch {pruning_start_epoch} is higher than " - f"the total number of training epoch " - f"{self.args.num_train_epochs}. 
No pruning will be applied.") - - # pylint: disable=E1101 - if pruning_end_epoch > self.args.num_train_epochs - 1: - logger.warning( - f"Pruning end epoch {pruning_end_epoch} is higher than " - f"the total number of training epoch " - f"{self.args.num_train_epochs}. The target sparsity will not be reached.") - - pruner = Pruning(self.pruning_config.inc_config) - pruner.model = self.model - - self.pruner = pruner - return pruner - def prune( self, - pruning_config=None, + pruning_config: Union[WeightPruningConfig] = None, provider: str = Provider.INC.value, eval_func: Optional[Callable] = None, train_func: Optional[Callable] = None, @@ -439,72 +349,19 @@ def prune( Returns: An objective of neural_compressor Pruning class. """ - if self.pruner is None: - self.init_pruner(pruning_config=pruning_config, provider=provider) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - - if self._eval_func is not None: - self.pruner.eval_func = self._eval_func - else: - assert self.metrics is not None, "Please pass metrics to trainer.pruning.metrics!" 
- assert self.pruning_config.pruner_config[0].prune_type == PruningMode.BASICMAGNITUDE.value, \ - "Please pass eval_func to trainer.eval_func" - self.pruner.eval_func = self.builtin_eval_func - - if self._train_func is not None: - self.pruner.pruning_func = self._train_func - else: - assert self.pruning_config.pruner_config[0].prune_type == PruningMode.BASICMAGNITUDE.value, \ - "Please pass train_func to trainer.train_func" - self.pruner.pruning_func = self.builtin_train_func - - self.component = self.pruner - self.opt_model = self.pruner.fit() - stats, sparsity = self.opt_model.report_sparsity() - logger.info(stats) - logger.info(sparsity) - + self._eval_func = self.builtin_eval_func if eval_func is None else eval_func + self._train_func = self.builtin_train_func if train_func is None else train_func + compression_manager = prepare_compression(model=self.model, confs=pruning_config) + self.compression_manager = compression_manager + self.compression_manager.callbacks.on_train_begin() + self._train_func(compression_manager.model._model) + self.compression_manager.callbacks.on_train_end() + self.opt_model = self.compression_manager.model return self.opt_model.model - def init_distiller( - self, - distillation_config, - teacher_model: Union[PreTrainedModel, torch.nn.Module], - provider: str = Provider.INC.value, - ): - """The main entry point of automatic distillation tuning. - - Args: - quant_config: The path to the YAML configuration file or DistillationConfig class containing. - accuracy goal, distillation objective and related dataloaders etc. - teacher_model: The model(torch.nn.Module) transfers knowledge to a smaller model. - provider (str): The provider used to quantize. - - Returns: - An objective of neural_compressor Distillation class. - """ - from neural_compressor.experimental import Distillation - assert isinstance(distillation_config, DistillationConfig), \ - "please pass a instance of PruningConfig to trainer.prune!" 
- self.distillation_config = distillation_config - self._provider = Provider[provider.upper()].value - self.metrics = self.distillation_config.metrics - self.teacher_model = teacher_model - - distiller = Distillation(self.distillation_config.inc_config) - distiller.model = self.model - distiller.teacher_model = self.teacher_model - - self.distiller = distiller - return distiller - def distill( self, - distillation_config, - teacher_model: Union[PreTrainedModel, torch.nn.Module], + distillation_config: Union[DistillationConfig] = None, provider: str = Provider.INC.value, eval_func: Optional[Callable] = None, train_func: Optional[Callable] = None, @@ -514,7 +371,6 @@ def distill( Args: quant_config: The path to the YAML configuration file or DistillationConfig class containing accuracy goal, distillation objective and related dataloaders etc. - teacher_model: The model(torch.nn.Module) transfers knowledge to a smaller model. provider (str): The provider used to quantize. eval_func (:obj:`Callable`, optional: The function to evaluate the model. train_func (:obj:`Callable`, optional: The function to train the model. @@ -522,34 +378,25 @@ def distill( Returns: An objective of neural_compressor Distillation class. """ - if self.distiller is None: - self.init_distiller(distillation_config=distillation_config, - teacher_model=teacher_model, - provider=provider) - if eval_func is not None: - self._eval_func = eval_func - if train_func is not None: - self._train_func = train_func - - if self._eval_func is not None: - self.distiller.eval_func = self._eval_func + if distillation_config.teacher_model is not None: + self.teacher_model = distillation_config.teacher_model else: - assert self.metrics is not None, \ - "Please pass metrics to trainer.distillation.metrics!" - self.distiller.eval_func = self.builtin_eval_func + assert False, "Please provide teacher model for DistillationConfig." 
+ self._eval_func = self.builtin_eval_func if eval_func is None else eval_func + self._train_func = self.builtin_train_func if train_func is None else train_func - self.distiller.train_func = \ - self.builtin_train_func if self._train_func is None else self._train_func - self.distiller.create_criterion() - self.component = self.distiller - self.opt_model = self.distiller.fit() + compression_manager = prepare_compression(self.model, distillation_config) + self.compression_manager = compression_manager + self.compression_manager.callbacks.on_train_begin() + self._train_func(compression_manager.model._model) + self.compression_manager.callbacks.on_epoch_end() + self.opt_model = self.compression_manager.model return self.opt_model.model def orchestrate_optimizations( self, config_list, - teacher_model: Optional[Callable] = None, eval_func: Optional[Callable] = None, train_func: Optional[Callable] = None, ): @@ -562,54 +409,25 @@ def orchestrate_optimizations( eval_func (:obj:`Callable`, optional): Evaluation function to evaluate the tuning objective. train_func (:obj:`Callable`, optional): Training function which will be combined with pruning. """ - from intel_extension_for_transformers.transformers.optimizer import Orchestrate_optimizer + # from intel_extension_for_transformers.transformers.optimizer import Orchestrate_optimizer self.orchestrate_opt = True + for config in config_list: + if isinstance(config, DistillationConfig): + self.teacher_model = config.teacher_model + assert self.teacher_model is not None, "Distillation need teacher model, please provide." 
self._eval_func = self.builtin_eval_func if eval_func is None else eval_func self._train_func = self.builtin_train_func if train_func is None else train_func - components = self.create_optimizer_builtin(config_list, teacher_model) - self.orchestrate_optimizer = Orchestrate_optimizer(self.model, components, \ - eval_func=self.eval_func, train_func=self.train_func, \ - output_dir=self.args.output_dir) - self.component = self.orchestrate_optimizer.scheduler.components[0] - torch_model = self.orchestrate_optimizer.fit() - return torch_model - - def create_optimizer_builtin(self, config_list, teacher_model=None): - """The function to create optimizer. - - Args: - config_list: The list of configs. - teacher_model (:obj:`Callable`, optional): The model(torch.nn.Module) transfers knowledge - to a smaller model. - """ - components = [] - for config in config_list: - if isinstance(config, QuantizationConfig): - component = self.init_quantizer(config) - component.eval_func = self._eval_func - component.q_func = self._train_func - self.enable_inc_quant = True - elif isinstance(config, PruningConfig): - self.orchestrate_opt_pruning = True - component = self.init_pruner(config) - component.eval_func = self._eval_func - component.pruning_func = self._train_func - elif isinstance(config, DistillationConfig): - assert isinstance(teacher_model, torch.nn.Module), \ - "The teacher_model is needed for distiller" - component = self.init_distiller(config, teacher_model) - component.eval_func = self._eval_func - component.train_func = self._train_func - component.create_criterion() - else: # pragma: no cover - assert False, "Orchestrate_optimizations config_list requires at least one" \ - " `QuantizationConfig`, `PruningConfig` or `DistillationConfig` object" - components.append(component) - return components + compression_manager = prepare_compression(model=self.model, confs=config_list) + self.compression_manager = compression_manager + 
self.compression_manager.callbacks.on_train_begin() + self._train_func(compression_manager.model._model) + self.compression_manager.callbacks.on_train_end() + self.opt_model = self.compression_manager.model + return self.opt_model.model def train( self, - component: Optional[Component] = None, + compression_manager = None, resume_from_checkpoint: Optional[Union[str, bool]] = None, trial: Union["optuna.Trial", Dict[str, Any]] = None, ignore_keys_for_eval: Optional[List[str]] = None, @@ -618,7 +436,7 @@ def train( """The main entry point tor train model. Args: - component (:obj:`Component`, `optional`): Component object handling the training process. + compression_manager (:obj:`CompressionManager`, `optional`): handling the training process. resume_from_checkpoint (:obj:`str` or :obj:`bool`, `optional`): If a :obj:`str`, local path to a saved checkpoint as saved by a previous instance of :class:`~transformers.Trainer`. If a :obj:`bool` and equals `True`, load the last checkpoint in `args.output_dir` as saved @@ -642,7 +460,7 @@ def train( self.is_in_train = True - self.component = component + self.compression_manager = compression_manager # do_train is not a reliable argument, as it might not be set and .train() still called, so # the following is a workaround: @@ -866,17 +684,12 @@ def train( # We just need to begin an iteration to create the randomization of the sampler. 
for _ in train_dataloader: break - if isinstance(component, Component): - if hasattr(self.component, "teacher_model"): - self.component.teacher_model._model = self._wrap_model( - self.component.teacher_model.model) - component.pre_epoch_begin(self.calib_dataloader if self.calib_dataloader else None) - if component.combination is not None and "Quantization" in component.combination: - model = component.model.model + if self.compression_manager is not None: + if self.teacher_model is not None: + self.teacher_model = self._wrap_model( + self.teacher_model) + # compression_manager.pre_epoch_begin(self.calib_dataloader if self.calib_dataloader else None) for epoch in range(epochs_trained, num_train_epochs): - if self.compression_ctrl is not None: - self.compression_ctrl.scheduler.epoch_step() - print(self.compression_ctrl.statistics().to_str()) if isinstance(train_dataloader, torch.utils.data.dataloader.DataLoader) and \ isinstance(train_dataloader.sampler, torch.utils.data.distributed.DistributedSampler): train_dataloader.sampler.set_epoch(epoch) @@ -892,8 +705,8 @@ def train( steps_in_epoch = (len(epoch_iterator) if train_dataset_is_sized else args.max_steps * args.gradient_accumulation_steps) self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) - if isinstance(component, Component): - component.on_epoch_begin(epoch) + if self.compression_manager is not None: + self.compression_manager.callbacks.on_epoch_begin(epoch) self.in_training = True for step, inputs in enumerate(epoch_iterator): @@ -913,8 +726,8 @@ def train( if step % args.gradient_accumulation_steps == 0: self.control = self.callback_handler.on_step_begin( args, self.state, self.control) - if isinstance(component, Component): - component.on_batch_begin(step) + if compression_manager is not None: + self.compression_manager.callbacks.on_step_begin(step) training_step = self.training_step_length_adaptive if self.dynamic_config is not None and \ self.dynamic_config.dynamic_training 
else self.training_step @@ -943,8 +756,8 @@ def train( # last step in epoch but step is always smaller than gradient_accumulation_steps steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch): - if isinstance(component, Component): - component.on_post_grad() + # if isinstance(component, Component): + # component.on_post_grad() # Gradient clipping if args.max_grad_norm is not None and args.max_grad_norm > 0: @@ -962,11 +775,15 @@ def train( args.max_grad_norm, ) - # Optimizer step - if self.compression_ctrl is not None: - self.compression_ctrl.scheduler.step() + # # Optimizer step + # if self.compression_ctrl is not None: + # self.compression_ctrl.scheduler.step() + if self.compression_manager is not None: + self.compression_manager.callbacks.on_before_optimizer_step() optimizer_was_run = True self.optimizer.step() + if self.compression_manager is not None: + self.compression_manager.callbacks.on_after_optimizer_step() if optimizer_was_run: self.lr_scheduler.step() @@ -977,8 +794,9 @@ def train( self.state.curr_loss = tr_loss_step.cpu().detach().item() self.control = self.callback_handler.on_step_end(args, self.state, self.control) - if isinstance(component, Component): - component.on_batch_end() + + if self.compression_manager is not None: + compression_manager.callbacks.on_step_end() self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) else: @@ -990,16 +808,8 @@ def train( self.in_training = False self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) - if isinstance(component, Component): - # When Distillation is involved, model will be evaluated in "on_epoch_end" hook, while in SQuAD - # evaluation, "start_positions" and "end_positions" will be removed from inputs of the fx model, - # this will damage the training afterward, so use the copied model for evaluation, - # and then restore the model. 
- component.model.model = copy.deepcopy(model) - component.on_epoch_end() - component.model.model = model - if 'Distillation' in component.__repr__(): - model.train() + if self.compression_manager is not None: + self.compression_manager.callbacks.on_epoch_end() self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) # pylint: disable=E1101 @@ -1011,11 +821,6 @@ def train( if self.control.should_training_stop: break - if isinstance(component, Component): - component.post_epoch_end() - if component.combination is not None and "Quantization" in component.combination: - self.model = component.model.model - if args.past_index and hasattr(self, "_past"): # Clean the state at the end of training delattr(self, "_past") @@ -1188,7 +993,6 @@ def training_step( return loss.detach() - def training_step_length_adaptive( self, model: torch.nn.Module, @@ -1385,11 +1189,10 @@ def compute_loss(self, model, inputs, return_outputs=False): # pragma: no cover if self.label_smoother is not None and "labels" in inputs else None teacher_logits = inputs.pop("teacher_logits") if "teacher_logits" in inputs else None - outputs = model(**inputs) - if self.in_training and hasattr(self, "component") and \ - hasattr(self.component, "criterion"): + if self.in_training and hasattr(self, "compression_manager") and \ + hasattr(self.compression_manager, "criterion"): qa_output_merger = lambda outputs: torch.vstack([ torch.vstack([sl, el]) for sl, el in zip(outputs["start_logits"], outputs["end_logits"]) @@ -1416,8 +1219,8 @@ def get_logits(outputs): if "start_positions" in inputs and "end_positions" in inputs: # for SQuAD teacher_logits = torch.vstack(list(teacher_logits)) else: - teacher_outputs = self.component.criterion.teacher_model_forward(inputs) - teacher_logits = get_logits(self.component.criterion.teacher_outputs + teacher_outputs = self.compression_manager.criterion.teacher_model_forward(inputs) + teacher_logits = 
get_logits(self.compression_manager.criterion.teacher_outputs if teacher_outputs is None else teacher_outputs) logits = get_logits(outputs) @@ -1431,14 +1234,14 @@ def get_logits(outputs): else: raise AssertionError( "Labels of input data not provided, can't compute loss") - if hasattr(self.component, "on_post_forward"): - self.component.on_post_forward(inputs, teacher_output=teacher_logits) - if hasattr(self.component.criterion, "teacher_outputs"): - self.component.criterion.teacher_outputs = \ - get_logits(self.component.criterion.teacher_outputs) - loss = self.component.criterion(logits, labels) - if hasattr(self.component.criterion, 'add_origin_loss') and \ - self.component.criterion.add_origin_loss: + if hasattr(self.compression_manager, "on_post_forward"): + self.compression_manager.on_post_forward(inputs, teacher_output=teacher_logits) + if hasattr(self.compression_manager.criterion, "teacher_outputs"): + self.compression_manager.criterion.teacher_outputs = \ + get_logits(self.compression_manager.criterion.teacher_outputs) + loss = self.compression_manager.criterion(logits, labels) + if hasattr(self.compression_manager.criterion, 'add_origin_loss') and \ + self.compression_manager.criterion.add_origin_loss: loss = loss + outputs['loss'] else: if self.args.past_index >= 0: @@ -1449,7 +1252,8 @@ def get_logits(outputs): else: # We don't use .loss here since the model may return tuples instead of ModelOutput. 
loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - loss = self.component.on_after_compute_loss(inputs, logits, loss, teacher_logits) + if self.compression_manager is not None: + loss = self.compression_manager.on_after_compute_loss(inputs, logits, loss, teacher_logits) if "start_positions" in inputs and "end_positions" in inputs: start_logits, end_logits = qa_output_spliter(logits) outputs = {"start_logits": start_logits, "end_logits": end_logits, "loss": loss} diff --git a/requirements.txt b/requirements.txt index 0b45ec85d3f..f73ff94bd6b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ py-cpuinfo -setuptools>=65 +setuptools==69.5.1 setuptools_scm[toml]>=6.2 diff --git a/tests/CI/test_config.py b/tests/CI/test_config.py deleted file mode 100644 index b3d32a35d4b..00000000000 --- a/tests/CI/test_config.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy -import shutil -import torch -import unittest - -from intel_extension_for_transformers.transformers import ( - DistillationConfig, - metrics, - objectives, - PrunerConfig, - PruningConfig, - QuantizationConfig, - TFOptimization, -) -from intel_extension_for_transformers.transformers.distillation import Criterion as DistillationCriterion -from intel_extension_for_transformers.transformers.distillation import DistillationCriterionMode -from intel_extension_for_transformers.transformers.trainer import NLPTrainer -from intel_extension_for_transformers.transformers.utils.objectives import Objective -from intel_extension_for_transformers.transformers.utils.utility_tf import TFDataloader - -from transformers import ( - AutoModelForPreTraining, - HfArgumentParser, - TFTrainingArguments, - TFAutoModelForSequenceClassification, -) - - -class CustomPruner(): - def __init__(self, start_epoch=None, end_epoch=None, initial_sparsity=None, - target_sparsity_ratio=None, update_frequency=1, prune_type='BasicMagnitude', - method='per_tensor', names=[], parameters=None): - self.start_epoch = start_epoch - self.end_epoch = end_epoch - self.update_frequency = update_frequency - self.target_sparsity_ratio = target_sparsity_ratio - self.initial_sparsity = initial_sparsity - self.update_frequency = update_frequency - - -class TestConfig(unittest.TestCase): - @classmethod - def tearDownClass(self): - shutil.rmtree('./tmp_trainer', ignore_errors=True) - - def test_quantization_config_with_init(self): - metric1 = metrics.Metric( - name="F1", greater_is_better=False, is_relative=False, criterion=0.02, weight_ratio=0.5 - ) - metric2 = metrics.Metric( - name="accuracy", greater_is_better=False, is_relative=False, - criterion=0.02, weight_ratio=0.5 - ) - objective1 = objectives.performance - objective2 = objectives.modelsize - quantization_config = QuantizationConfig( - framework="pytorch", - approach="PostTrainingDynamic", - timeout=600, - max_trials=300, - metrics=[metric1, 
metric2], - objectives=[objective1, objective2], - ) - self.assertEqual(quantization_config.approach, "post_training_dynamic_quant") - self.assertEqual(quantization_config.metrics[0].criterion, 0.02) - self.assertEqual(quantization_config.objectives[1].name, "modelsize") - self.assertEqual(quantization_config.timeout, 600) - self.assertEqual(quantization_config.max_trials, 300) - - from neural_compressor.utils import constant - quantization_config.op_wise = { - 'bert.encoder.layer.0.output.dense': constant.FP32, - } - quantization_config.resume_path = './saved_results' - quantization_config.random_seed = 1 - quantization_config.strategy = 'basic' - quantization_config.performance_only = True - quantization_config.tensorboard = True - quantization_config.sampling_size = [300] - quantization_config.input_names = ['input_ids', 'tokentype_ids'] - quantization_config.output_names = ['seq1, seq2'] - self.assertTrue(isinstance(quantization_config.op_wise, dict)) - self.assertTrue(isinstance(quantization_config.strategy, str)) - self.assertEqual(quantization_config.random_seed, 1) - self.assertEqual(quantization_config.strategy, 'basic') - self.assertTrue(quantization_config.performance_only) - self.assertTrue(quantization_config.tensorboard) - self.assertTrue(quantization_config.resume_path, './saved_results') - self.assertTrue(quantization_config.sampling_size, [300]) - self.assertTrue(quantization_config.input_names, ['input_ids', 'tokentype_ids']) - self.assertTrue(quantization_config.output_names, ['seq1, seq2']) - - def test_quantization_config(self): - quantization_config = QuantizationConfig() - quantization_config.approach = "PostTrainingStatic" - quantization_config.framework = "pytorch" - metric = metrics.Metric(name="F1", greater_is_better=False, criterion=0.02, is_relative=True) - quantization_config.metrics = metric - objective1 = objectives.Objective(name="performance", greater_is_better=True) - objective2 = objectives.Objective(name="modelsize", 
greater_is_better=False) - quantization_config.objectives = [objective1, objective2] - - quantization_config.timeout = 600 - quantization_config.max_trials = 300 - quantization_config.output_dir = "./savedresult" - - self.assertEqual(quantization_config.approach, "post_training_static_quant") - self.assertEqual(quantization_config.metrics.criterion, 0.02) - self.assertEqual(quantization_config.objectives[1].name, "modelsize") - self.assertEqual(quantization_config.timeout, 600) - self.assertEqual(quantization_config.max_trials, 300) - self.assertEqual(quantization_config.output_dir, "./savedresult") - - def test_pruning_config(self): - pruning_config = PruningConfig() - pruner_config = PrunerConfig() - metric = metrics.Metric(name="F1") - pruning_config.pruner_config = pruner_config - pruning_config.framework = "pytorch" - pruning_config.target_sparsity_ratio = 0.1 - pruning_config.epoch_range = [0, 4] - pruning_config.metrics = metric - - self.assertEqual(pruning_config.pruner_config, [pruner_config]) - self.assertEqual(pruning_config.framework, "pytorch") - self.assertEqual(pruning_config.initial_sparsity_ratio, 0) - self.assertEqual(pruning_config.target_sparsity_ratio, 0.1) - self.assertEqual(pruning_config.epoch_range, [0, 4]) - self.assertEqual(pruning_config.metrics, metric) - self.assertEqual(pruning_config.epochs, 1) - - pruning_config.pruner_config = [pruner_config] - self.assertEqual(pruning_config.pruner_config, [pruner_config]) - - def test_distillation_config(self): - metric = metrics.Metric(name="eval_F1") - criterion = DistillationCriterion( - name="KnowledgeLoss", - temperature=1.0, - loss_types=["CE", "KL"], - loss_weight_ratio=[0, 1] - ) - distillation_config = DistillationConfig( - framework="pytorch", - criterion=criterion, - metrics=metric - ) - - self.assertEqual(distillation_config.framework, "pytorch") - self.assertEqual(list(distillation_config.criterion.keys())[0], - DistillationCriterionMode[criterion.name.upper()].value) - 
self.assertEqual(distillation_config.metrics, metric) - - criterion = DistillationCriterion( - name="InterMediateLayersloss", - layer_mappings=[['classifier', 'classifier']], - loss_types=['MSE'], - loss_weight_ratio=[1.0], - add_origin_loss=False - ) - distillation_config = DistillationConfig( - framework="pytorch", - criterion=criterion, - metrics=metric - ) - - - def test_trainer_config(self): - model = AutoModelForPreTraining.from_pretrained( - 'google/bert_uncased_L-2_H-128_A-2' - ) - trainer = NLPTrainer(model) - trainer.resuming_checkpoint = 'saved_results' - trainer.eval_func = None - trainer.train_func = None - trainer.calib_dataloader = None - trainer.provider = 'inc' - self.assertEqual(trainer.resuming_checkpoint, 'saved_results') - self.assertEqual(trainer.eval_func, None) - self.assertEqual(trainer.train_func, None) - self.assertEqual(trainer.calib_dataloader, None) - self.assertEqual(trainer.provider, 'inc') - - def test_TFOptimization_config(self): - parser = HfArgumentParser(TFTrainingArguments) - args = parser.parse_args_into_dataclasses( - args=["--output_dir", "./quantized_model", - "--per_device_eval_batch_size", "2"] - ) - model = TFAutoModelForSequenceClassification.from_pretrained( - 'bhadresh-savani/distilbert-base-uncased-sentiment-sst2' - ) - tf_optimizer = TFOptimization(model, args=args[0]) - tf_optimizer.input = 1 - tf_optimizer.eval_func = None - tf_optimizer.train_func = None - self.assertEqual(tf_optimizer.input, 1) - self.assertEqual(tf_optimizer.eval_func, None) - self.assertEqual(tf_optimizer.train_func, None) - - def test_tf_dataloader(self): - def dummy_dataset(type='list'): - if type == 'list': - yield [torch.tensor(1),torch.tensor(2)], \ - [torch.tensor(1),torch.tensor(2)] - else: - yield torch.tensor(1), torch.tensor(1) - - dataloader = TFDataloader(dummy_dataset()) - for input, label in dataloader: - self.assertTrue(type(input) == list) - self.assertTrue(type(label) == list) - dataloader = 
TFDataloader(dummy_dataset(type='int')) - for input, label in dataloader: - self.assertTrue(type(input) == numpy.ndarray) - self.assertTrue(type(label) == numpy.ndarray) - - def test_Objective_config(self): - perform= Objective.performance() - model_size = Objective.modelsize() - self.assertEqual(perform.name, 'performance') - self.assertEqual(model_size.name, 'modelsize') - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py index 52ab6c36a6d..f7d767384f6 100644 --- a/tests/CI/test_quantization.py +++ b/tests/CI/test_quantization.py @@ -23,11 +23,13 @@ import unittest from intel_extension_for_transformers.transformers import ( metrics, - objectives, OptimizedModel, - QuantizationConfig, - QuantizationMode, - NoTrainerOptimizer, +) +from neural_compressor.config import ( + PostTrainingQuantConfig, + QuantizationAwareTrainingConfig, + TuningCriterion, + AccuracyCriterion ) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from intel_extension_for_transformers.transformers.trainer import NLPSeq2SeqTrainer @@ -93,7 +95,6 @@ def setUpClass(self): train_dataset=self.dummy_dataset, eval_dataset=self.dummy_dataset, ) - self.optimizer = NoTrainerOptimizer(self.model) @classmethod def tearDownClass(self): @@ -107,116 +108,47 @@ def tearDownClass(self): def test_fx_model_quant(self): fp32_output = self.trainer.predict(self.dummy_dataset).predictions - for mode in QuantizationMode: - print("Quantization approach:", mode.value) - self.trainer = NLPTrainer( - model=self.model, - train_dataset=self.dummy_dataset, - eval_dataset=self.dummy_dataset, - ) - - # Check fp32 jit and onnx model, only once. 
- if mode == QuantizationMode.POSTTRAININGSTATIC: - jit_model = self.trainer.export_to_jit() - self.trainer.export_to_onnx('fp32-model.onnx') - self.assertTrue(check_onnx('fp32-model.onnx', self.trainer.get_eval_dataloader())) - - self.trainer.benchmark(num_of_instance=1) - tune_metric = metrics.Metric( - name="eval_loss", greater_is_better=False, is_relative=False, criterion=0.5 - ) - quantization_config = QuantizationConfig( - approach=mode.name, - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = self.trainer.quantize(quant_config=quantization_config, provider="inc") - self.trainer.benchmark(self.trainer.args.output_dir, num_of_instance=1) - # By default, model will be saved into tmp_trainer dir. - self.trainer.save_model('./quantized_model') - - # Check int8 onnx model - if mode == QuantizationMode.POSTTRAININGSTATIC: - # test different configure to improve UT coverage - self.trainer.export_to_onnx( - save_path=None, - quant_format='Qlinear', - dtype='S8S8', - opset_version=13, - ) - self.assertTrue(check_onnx('./tmp_trainer/int8-model.onnx', self.trainer.get_eval_dataloader())) - else: - self.trainer.export_to_onnx('int8-model.onnx') - self.assertTrue(check_onnx('int8-model.onnx', self.trainer.get_eval_dataloader())) - - if mode == QuantizationMode.QUANTIZATIONAWARETRAINING: - model = onnx.load('int8-model.onnx') - tensor_list = {tensor.name:tensor for tensor in model.graph.initializer} - torch_data = quantized_model.classifier.state_dict()\ - ['module._packed_params._packed_params'][0].\ - dequantize().detach().cpu().numpy().T - from onnx.numpy_helper import to_array - onnx_data = to_array(tensor_list['classifier.weight_quantized']) - onnx_scale = to_array(tensor_list['classifier.weight_scale']) - self.assertTrue(np.allclose(torch_data, onnx_data * onnx_scale, atol=0.001)) - # Check quantized model - output_1 = self.trainer.predict(self.dummy_dataset).predictions - loaded_model = OptimizedModel.from_pretrained( - 
'./quantized_model', - ) - self.trainer.model = loaded_model - output_2 = self.trainer.predict(self.dummy_dataset).predictions - self.assertTrue((fp32_output != output_1).any()) - - # check loaded model - self.assertTrue((output_1 == output_2).all()) - - def test_fx_model_with_smooth_quant(self): - def eval_func(model): - return 1 - - def train_func(model): - return model - - trainer = NLPTrainer( + self.trainer = NLPTrainer( model=self.model, train_dataset=self.dummy_dataset, eval_dataset=self.dummy_dataset, ) + jit_model = self.trainer.export_to_jit() + self.trainer.export_to_onnx('fp32-model.onnx') + self.assertTrue(check_onnx('fp32-model.onnx', self.trainer.get_eval_dataloader())) + + self.trainer.benchmark(num_of_instance=1) tune_metric = metrics.Metric( name="eval_loss", greater_is_better=False, is_relative=False, criterion=0.5 ) - quantization_config = QuantizationConfig( - approach="PostTrainingStatic", - metrics=[tune_metric], - objectives=[objectives.performance], - recipes={"smooth_quant": True, - "smooth_quant_args": {"alpha": 0.6}, - } + self.trainer.metrics = tune_metric + quantization_config = PostTrainingQuantConfig( + approach="static", ) - recipes = quantization_config.recipes - self.assertTrue(recipes["smooth_quant"]) - quantized_model = trainer.quantize(quant_config=quantization_config) - self.assertTrue("quantize" in str(type(quantized_model.classifier.module))) - quantization_config = QuantizationConfig( - approach="PostTrainingStatic", - metrics=[tune_metric], - objectives=[objectives.performance], - recipes={} + quantized_model = self.trainer.quantize(quant_config=quantization_config, provider="inc") + self.trainer.benchmark(self.trainer.args.output_dir, num_of_instance=1) + # By default, model will be saved into tmp_trainer dir. 
+ self.trainer.save_model('./quantized_model') + # test different configure to improve UT coverage + self.trainer.export_to_onnx( + save_path=None, + quant_format='Qlinear', + dtype='S8S8', + opset_version=13, ) - quantized_model = trainer.quantize(quant_config=quantization_config, - train_func=train_func, - eval_func=eval_func) - self.assertTrue("quantize" in str(type(quantized_model.classifier.module))) - - with self.assertRaises(ValueError): - quantization_config = QuantizationConfig( - approach="PostTrainingStatic", - metrics=[tune_metric], - objectives=[objectives.performance], - recipes=[] - ) + self.assertTrue(check_onnx('./tmp_trainer/int8-model.onnx', self.trainer.get_eval_dataloader())) + # Check quantized model + output_1 = self.trainer.predict(self.dummy_dataset).predictions + loaded_model = OptimizedModel.from_pretrained( + './quantized_model', + ) + self.trainer.model = loaded_model + output_2 = self.trainer.predict(self.dummy_dataset).predictions + self.assertTrue((fp32_output != output_1).any()) + + # check loaded model + self.assertTrue((output_1 == output_2).all()) def test_functional_quant(self): def eval_func(model): @@ -226,39 +158,18 @@ def train_func(model): return model self.trainer = NLPTrainer(self.model, train_dataset=self.dummy_dataset) - quantization_config = QuantizationConfig( - approach='PostTrainingStatic', - objectives=[objectives.performance] - ) - self.trainer.quantize(quant_config=quantization_config, - provider="inc", - train_func = train_func, - eval_func = eval_func,) - - def test_no_trainer_quant(self): - def eval_func(model): - return 1 - - def train_func(model): - return model - tune_metric = metrics.Metric( name="eval_loss", greater_is_better=False, is_relative=False, criterion=0.5 ) - quantization_config = QuantizationConfig( - approach='PostTrainingStatic', - metrics=[tune_metric], - objectives=[objectives.performance] + self.trainer.metrics = tune_metric + quantization_config = PostTrainingQuantConfig( + 
approach='static', ) - self.optimizer.eval_func = eval_func - self.optimizer.train_func = train_func - self.optimizer.provider = "INC" - self.optimizer.calib_dataloader = self.trainer.get_eval_dataloader() - - opt_model = self.optimizer.quantize(quant_config=quantization_config, + self.trainer.quantize(quant_config=quantization_config, provider="inc", train_func = train_func, - eval_func = eval_func) + eval_func = eval_func,) + def test_online_models(self): model = OptimizedModel.from_pretrained( diff --git a/tests/CI/test_quantization_qa_ipex.py b/tests/CI/test_quantization_qa_ipex.py index 67e75a45bd5..0e9dd9a78ad 100644 --- a/tests/CI/test_quantization_qa_ipex.py +++ b/tests/CI/test_quantization_qa_ipex.py @@ -43,7 +43,7 @@ def test_run_qa_ipex(self): --model_name_or_path bert-large-uncased-whole-word-masking-finetuned-squad --dataset_name squad --tune - --quantization_approach PostTrainingStatic + --quantization_approach static --do_train --do_eval --max_eval_samples 100 @@ -62,7 +62,7 @@ def test_run_qa_ipex(self): run_qa.py --model_name_or_path bert-large-uncased-whole-word-masking-finetuned-squad --dataset_name squad - --quantization_approach PostTrainingStatic + --quantization_approach static --do_train --do_eval --max_eval_samples 100 diff --git a/tests/Nightly/test_distillation.py b/tests/Nightly/test_distillation.py index 118c7bb4444..8f818f38e51 100644 --- a/tests/Nightly/test_distillation.py +++ b/tests/Nightly/test_distillation.py @@ -21,14 +21,15 @@ import unittest from datasets import load_dataset, load_metric from intel_extension_for_transformers.transformers import ( - DistillationConfig, - DistillationCriterionMode, metrics, OptimizedModel, - NoTrainerOptimizer +) + +from neural_compressor.config import ( + DistillationConfig, + KnowledgeDistillationLossConfig, ) from intel_extension_for_transformers.transformers.trainer import NLPTrainer -from intel_extension_for_transformers.transformers.distillation import Criterion from transformers import ( 
AutoModelForSequenceClassification, AutoTokenizer, @@ -48,7 +49,6 @@ def setUpClass(self): self.teacher_model = AutoModelForSequenceClassification.from_pretrained( 'distilbert-base-uncased-finetuned-sst-2-english' ) - self.optimizer = NoTrainerOptimizer(self.model) raw_datasets = load_dataset("glue", "sst2")["validation"] tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") def preprocess_function(examples): @@ -76,38 +76,32 @@ def compute_metrics(p): preds = np.argmax(preds, axis=1) return metric.compute(predictions=preds, references=p.label_ids) origin_weight = copy.deepcopy(self.model.classifier.weight) - for mode in DistillationCriterionMode: - print("Distillation approach:", mode.value) - self.trainer = NLPTrainer( - model=copy.deepcopy(self.model), - train_dataset=self.dataset, - eval_dataset=self.dataset, - compute_metrics=compute_metrics, - ) - metric_ = metrics.Metric(name="eval_accuracy") - criterion = Criterion( - name='IntermediateLayersLoss', - layer_mappings=[['classifier', 'classifier']], - loss_types=['MSE'], - loss_weight_ratio=[1.0], - add_origin_loss=False - ) if mode.value == "IntermediateLayersKnowledgeDistillationLoss" else None - distillation_conf = DistillationConfig(metrics=metric_, criterion=criterion) - distilled_model = self.trainer.distill( - distillation_config=distillation_conf, teacher_model=self.teacher_model - ) - # By default, model will be saved in tmp_trainer dir. 
- self.trainer.save_model('./distilled_model') - loaded_model = OptimizedModel.from_pretrained( - './distilled_model', - ) - distilled_weight = copy.deepcopy(distilled_model.classifier.weight) - loaded_weight = copy.deepcopy(loaded_model.classifier.weight) - # check distilled model - self.assertTrue((distilled_weight != origin_weight).any()) - # check loaded model - self.assertTrue((distilled_weight == loaded_weight).all()) - mlflow.end_run() + + self.trainer = NLPTrainer( + model=copy.deepcopy(self.model), + train_dataset=self.dataset, + eval_dataset=self.dataset, + compute_metrics=compute_metrics, + ) + metric_ = metrics.Metric(name="eval_accuracy") + self.trainer.metrics = metric_ + distillation_criterion_conf = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"]) + distillation_conf = DistillationConfig(self.teacher_model, distillation_criterion_conf) + distilled_model = self.trainer.distill( + distillation_config=distillation_conf + ) + # By default, model will be saved in tmp_trainer dir. 
+ self.trainer.save_model('./distilled_model') + loaded_model = OptimizedModel.from_pretrained( + './distilled_model', + ) + distilled_weight = copy.deepcopy(distilled_model.classifier.weight) + loaded_weight = copy.deepcopy(loaded_model.classifier.weight) + # check distilled model + self.assertTrue((distilled_weight != origin_weight).any()) + # check loaded model + self.assertTrue((distilled_weight == loaded_weight).all()) + mlflow.end_run() def test_functional_distil(self): def eval_func(model): @@ -118,27 +112,12 @@ def train_func(model): self.trainer = NLPTrainer(self.model) - distillation_conf = DistillationConfig() + distillation_conf = DistillationConfig(teacher_model=self.teacher_model) self.trainer.distill(distillation_conf, - teacher_model=self.teacher_model, provider="inc", train_func = train_func, eval_func = eval_func,) - def test_no_trainer_distill(self): - def eval_func(model): - return 1 - def train_func(model): - return model - - distillation_conf = DistillationConfig() - self.optimizer.eval_func = eval_func - self.optimizer.train_func = train_func - self.optimizer.distill(distillation_conf, - teacher_model=self.teacher_model, - provider="inc", - train_func = train_func, - eval_func = eval_func,) if __name__ == "__main__": unittest.main() diff --git a/tests/Nightly/test_orchestrate_optimization.py b/tests/Nightly/test_orchestrate_optimization.py index 422b10700a9..d65ece8099c 100644 --- a/tests/Nightly/test_orchestrate_optimization.py +++ b/tests/Nightly/test_orchestrate_optimization.py @@ -20,18 +20,14 @@ import torch.utils.data as data import unittest from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import ( - PrunerConfig, - PruningConfig, +from neural_compressor.config import ( + WeightPruningConfig, DistillationConfig, - QuantizationConfig, - DistillationCriterionMode, - metrics, - objectives, - OptimizedModel, + KnowledgeDistillationLossConfig, + QuantizationAwareTrainingConfig, ) +from 
intel_extension_for_transformers.transformers import metrics from intel_extension_for_transformers.transformers.trainer import NLPTrainer -from intel_extension_for_transformers.transformers.distillation import Criterion from transformers import ( AutoModelForSequenceClassification, @@ -77,36 +73,26 @@ def compute_metrics(p): preds = p.predictions preds = np.argmax(preds, axis=1) return metric.compute(predictions=preds, references=p.label_ids) - origin_weight = copy.deepcopy(self.model.classifier.weight) - for mode in DistillationCriterionMode: - print("Distillation approach:", mode.value) - self.trainer = NLPTrainer( - model=copy.deepcopy(self.model), - train_dataset=self.dataset, - eval_dataset=self.dataset, - compute_metrics=compute_metrics, - ) - self.trainer.calib_dataloader = self.trainer.get_eval_dataloader() + + self.trainer = NLPTrainer( + model=copy.deepcopy(self.model), + train_dataset=self.dataset, + eval_dataset=self.dataset, + compute_metrics=compute_metrics, + ) + self.trainer.calib_dataloader = self.trainer.get_eval_dataloader() tune_metric = metrics.Metric( name="eval_accuracy", is_relative=True, criterion=0.5 ) - pruner_config = PrunerConfig(prune_type='PatternLock', target_sparsity_ratio=0.9) - pruning_conf = PruningConfig(framework="pytorch_fx",pruner_config=[pruner_config], metrics=tune_metric) - distillation_conf = DistillationConfig(framework="pytorch_fx", metrics=tune_metric) - - objective = objectives.performance - quantization_conf = QuantizationConfig( - approach="QuantizationAwareTraining", - max_trials=600, - metrics=[tune_metric], - objectives=[objective] - ) - - from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace - self.model = symbolic_trace(self.model, is_qat=True) - self.trainer.model = self.model + self.trainer.metrics = tune_metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=0.64, + pruning_scope="local") + distillation_criterion = 
KnowledgeDistillationLossConfig(loss_types=["CE", "KL"]) + distillation_conf = DistillationConfig(teacher_model=self.teacher_model, criterion=distillation_criterion) + quantization_conf = QuantizationAwareTrainingConfig() conf_list = [pruning_conf, distillation_conf, quantization_conf] - opt_model = self.trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=self.teacher_model) + opt_model = self.trainer.orchestrate_optimizations(config_list=conf_list) self.assertTrue("quantize" in str(type(opt_model.classifier.module))) diff --git a/tests/Nightly/test_pruning.py b/tests/Nightly/test_pruning.py index b7284ddfe6b..01c7045bf2d 100644 --- a/tests/Nightly/test_pruning.py +++ b/tests/Nightly/test_pruning.py @@ -20,11 +20,8 @@ from intel_extension_for_transformers.transformers import ( metrics, OptimizedModel, - PrunerConfig, - PruningConfig, - PruningMode, - NoTrainerOptimizer ) +from neural_compressor.config import WeightPruningConfig from intel_extension_for_transformers.transformers.trainer import NLPTrainer from transformers import ( AutoModelForSequenceClassification, @@ -63,7 +60,6 @@ def setUpClass(self): train_dataset=self.dummy_dataset, eval_dataset=self.dummy_dataset, ) - self.optimizer = NoTrainerOptimizer(self.model) @classmethod def tearDownClass(self): @@ -72,31 +68,29 @@ def tearDownClass(self): def test_fx_model_prune(self): origin_weight = copy.deepcopy(self.model.classifier.weight) - for mode in PruningMode: - # not supported yet - if mode.name != "BasicMagnitude".upper(): - continue - self.trainer = NLPTrainer( - model=self.model, - train_dataset=self.dummy_dataset, - eval_dataset=self.dummy_dataset, - ) - metric = metrics.Metric(name="eval_loss") - pruner_config = PrunerConfig(prune_type=mode.name, target_sparsity_ratio=0.9) - pruning_conf = PruningConfig(pruner_config=pruner_config, metrics=metric) - agent = self.trainer.init_pruner(pruning_config=pruning_conf) - pruned_model = self.trainer.prune() - # By default, model will be saved 
in tmp_trainer dir. - self.trainer.save_model('./pruned_model') - loaded_model = OptimizedModel.from_pretrained( - './pruned_model', - ) - pruned_weight = copy.deepcopy(pruned_model.classifier.weight) - loaded_weight = copy.deepcopy(loaded_model.classifier.weight) - # check pruned model - self.assertTrue((pruned_weight != origin_weight).any()) - # check loaded model - self.assertTrue((pruned_weight == loaded_weight).all()) + + self.trainer = NLPTrainer( + model=self.model, + train_dataset=self.dummy_dataset, + eval_dataset=self.dummy_dataset, + ) + metric = metrics.Metric(name="eval_loss") + self.trainer.metrics = metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=0.64, + pruning_scope="local") + pruned_model = self.trainer.prune(pruning_config=pruning_conf) + # By default, model will be saved in tmp_trainer dir. + self.trainer.save_model('./pruned_model') + loaded_model = OptimizedModel.from_pretrained( + './pruned_model', + ) + pruned_weight = copy.deepcopy(pruned_model.classifier.weight) + loaded_weight = copy.deepcopy(loaded_model.classifier.weight) + # check pruned model + self.assertTrue((pruned_weight != origin_weight).any()) + # check loaded model + self.assertTrue((pruned_weight == loaded_weight).all()) def test_functional_prune(self): def eval_func(model): @@ -106,27 +100,14 @@ def train_func(model): return model self.trainer = NLPTrainer(self.model) - pruner_conf = PrunerConfig(prune_type='BasicMagnitude', target_sparsity_ratio=0.9) - pruning_conf = PruningConfig(pruner_config=pruner_conf) + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=0.64, + pruning_scope="local") self.trainer.prune(pruning_conf, provider="inc", train_func = train_func, eval_func = eval_func,) - def test_no_trainer_prune(self): - def eval_func(model): - return 1 - - def train_func(model): - return model - pruner_conf = PrunerConfig(prune_type='BasicMagnitude', target_sparsity_ratio=0.9) - 
pruning_conf = PruningConfig(pruner_config=pruner_conf) - self.optimizer.eval_func = eval_func - self.optimizer.train_func = train_func - self.optimizer.prune(pruning_conf, - provider="inc", - train_func = train_func, - eval_func = eval_func,) if __name__ == "__main__": unittest.main() diff --git a/tests/Nightly/test_tf_distillation.py b/tests/Nightly/test_tf_distillation.py deleted file mode 100644 index d5521845439..00000000000 --- a/tests/Nightly/test_tf_distillation.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import shutil -import numpy as np -import unittest -import tensorflow as tf -from datasets import load_dataset, load_metric -from transformers import (TFAutoModelForSequenceClassification, AutoTokenizer, - HfArgumentParser, TFTrainingArguments, set_seed, - DefaultDataCollator) -from intel_extension_for_transformers.transformers import (DistillationConfig, metrics) -from intel_extension_for_transformers.transformers.distillation import Criterion -from intel_extension_for_transformers.transformers.optimizer_tf import TFOptimization - - -class TestDistillation(unittest.TestCase): - @classmethod - def setUpClass(self): - set_seed(42) - self.model = TFAutoModelForSequenceClassification.from_pretrained( - 'hf-internal-testing/tiny-random-distilbert') - self.teacher_model = TFAutoModelForSequenceClassification.from_pretrained( - 'hf-internal-testing/tiny-random-DistilBertForSequenceClassification') - - raw_datasets = load_dataset("glue", "sst2")["validation"] - self.tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-DistilBertForSequenceClassification") - non_label_column_names = [ - name for name in raw_datasets.column_names if name != "label" - ] - - def preprocess_function(examples): - # Tokenize the texts - args = ((examples['sentence'], )) - result = self.tokenizer(*args, - padding=True, - max_length=64, - truncation=True) - return result - - raw_datasets = raw_datasets.map(preprocess_function, - batched=True, - load_from_cache_file=False) - data_collator = DefaultDataCollator(return_tensors="tf") - dataset = raw_datasets.select(range(10)) - self.dummy_dataset = dataset.to_tf_dataset( - columns=[ - col for col in dataset.column_names - if col not in set(non_label_column_names + ["label"]) - ], - shuffle=False, - batch_size=2, - collate_fn=data_collator, - drop_remainder=False, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] - if "label" in 
dataset.column_names else None, - ) - parser = HfArgumentParser(TFTrainingArguments) - self.args = parser.parse_args_into_dataclasses(args=[ - "--output_dir", "./distilled_model", - "--per_device_eval_batch_size", "2" - ])[0] - optimizer = tf.keras.optimizers.Adam( - learning_rate=self.args.learning_rate, - beta_1=self.args.adam_beta1, - beta_2=self.args.adam_beta2, - epsilon=self.args.adam_epsilon, - clipnorm=self.args.max_grad_norm, - ) - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.SUM) - metrics = ["accuracy"] - self.model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - - @classmethod - def tearDownClass(self): - shutil.rmtree('./tmp', ignore_errors=True) - shutil.rmtree('./distilled_model', ignore_errors=True) - - def test_tf_model_distil(self): - metric = load_metric("glue", "sst2") - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - - self.optimizer = TFOptimization(model=self.model, - args=self.args, - train_dataset=self.dummy_dataset, - compute_metrics=compute_metrics) - metric_ = metrics.Metric(name="eval_accuracy") - # 'CrossEntropyLoss', 'SparseCategoricalCrossentropy', 'KnowledgeDistillationLoss' - criterion = Criterion(name='KnowledgeLoss', - layer_mappings=[['classifier', 'classifier']], - loss_types=['CE', 'CE'], - loss_weight_ratio=[0.5, 0.5], - add_origin_loss=False) - distillation_conf = DistillationConfig(metrics=metric_, - criterion=criterion) - def eval_func(model): - return 1 - distilled_model = self.optimizer.distill( - distillation_config=distillation_conf, - teacher_model=self.teacher_model, - eval_func=eval_func, - train_func=self.optimizer.build_train_func - ) - distilled_model2 = self.optimizer.distill( - 
distillation_config=distillation_conf, - teacher_model=self.teacher_model, - eval_func=None, - train_func=None - ) - self.assertEqual(distilled_model.signatures['serving_default'].output_shapes['Identity'], distilled_model2.signatures['serving_default'].output_shapes['Identity']) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/Nightly/test_tf_pruning.py b/tests/Nightly/test_tf_pruning.py deleted file mode 100644 index 5fa4806957a..00000000000 --- a/tests/Nightly/test_tf_pruning.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath -import numpy as np -import os -import shutil -import tensorflow as tf -import unittest -from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import ( - metrics, - PrunerConfig, - PruningConfig, - TFOptimization -) -from transformers import ( - AutoTokenizer, - DefaultDataCollator, - HfArgumentParser, - TFAutoModelForSequenceClassification, - TFTrainingArguments, -) - -os.environ["WANDB_DISABLED"] = "true" - - -class TestTFPruning(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model = TFAutoModelForSequenceClassification.from_pretrained( - 'hf-internal-testing/tiny-random-DistilBertForSequenceClassification' - ) - raw_datasets = load_dataset("glue", "sst2")["validation"] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-DistilBertForSequenceClassification") - non_label_column_names = [name for name in raw_datasets.column_names if name != "label"] - def preprocess_function(examples): - # Tokenize the texts - args = ( - (examples["sentence"],) - ) - result = tokenizer(*args, padding=True, max_length=64, truncation=True) - - return result - raw_datasets = raw_datasets.map(preprocess_function, batched=True, load_from_cache_file=False) - data_collator = DefaultDataCollator(return_tensors="tf") - dataset = raw_datasets.select(range(10)) - self.dummy_dataset = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in - set(non_label_column_names + ["label"])], - shuffle=False, - batch_size=2, - collate_fn=data_collator, - drop_remainder=False, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] if "label" in dataset.column_names else None, - ) - parser = HfArgumentParser(TFTrainingArguments) - self.args = parser.parse_args_into_dataclasses(args=["--output_dir", 
"./quantized_model", - "--per_device_eval_batch_size", "2"])[0] - optimizer = tf.keras.optimizers.Adam( - learning_rate=self.args.learning_rate, - beta_1=self.args.adam_beta1, - beta_2=self.args.adam_beta2, - epsilon=self.args.adam_epsilon, - clipnorm=self.args.max_grad_norm, - ) - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, reduction=tf.keras.losses.Reduction.SUM - ) - metrics = ["accuracy"] - self.model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - - @classmethod - def tearDownClass(self): - shutil.rmtree('./tmp', ignore_errors=True) - shutil.rmtree('./quantized_model', ignore_errors=True) - - def test_tf_model_quant(self): - # check whether it is possible to set distributed environment - # only for coverage currently - from intel_extension_for_transformers.transformers.utils.utility_tf import distributed_init - distributed_init(["localhost:12345","localhost:23456"], "worker", 0) - self.assertTrue(os.environ['TF_CONFIG'] != None) - del os.environ['TF_CONFIG'] - # check whether filepath can be set correctly if using distributed environment - # only for coverage currently - from intel_extension_for_transformers.transformers.utils.utility_tf import get_filepath - self.assertTrue(type(get_filepath("dummy", "worker", 0)) == str) - self.assertTrue(type(get_filepath("dummy", "worker", 1)) == str) - self.assertTrue(get_filepath("dummy", "worker", 0) != get_filepath("dummy", "worker", 1)) - - metric = load_metric("glue", "sst2") - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - self.optimizer = TFOptimization( - model=self.model, - args=self.args, - train_dataset=self.dummy_dataset, - eval_dataset=self.dummy_dataset, - compute_metrics=compute_metrics, - ) - tune_metric = metrics.Metric( - 
name="accuracy", greater_is_better=True, is_relative=True, criterion=0.01, - ) - prune_type = 'BasicMagnitude' - target_sparsity_ratio = 0.1 - pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig( - epochs=int(1), pruner_config=pruner_config, metrics=tune_metric - ) - p_model = self.optimizer.prune(pruning_config=pruning_conf) - loaded_model = tf.saved_model.load(self.args.output_dir) - p_model = self.optimizer.prune(pruning_config=pruning_conf, - train_dataset=self.dummy_dataset, - eval_dataset=self.dummy_dataset,) - - def eval_func(model): - return 1 - - def train_func(model): - return model - - self.optimizer.prune(pruning_config=pruning_conf, - train_func=train_func, - eval_func=eval_func) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/Nightly/test_tf_quantization.py b/tests/Nightly/test_tf_quantization.py deleted file mode 100644 index 3162950c68a..00000000000 --- a/tests/Nightly/test_tf_quantization.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import os -import shutil -import tensorflow as tf -import unittest -from datasets import load_dataset, load_metric -from intel_extension_for_transformers.transformers import ( - metrics, - objectives, - QuantizationConfig, - TFOptimization -) -# from intel_extension_for_transformers.transformers import metrics, objectives -from transformers import ( - AutoTokenizer, - DefaultDataCollator, - HfArgumentParser, - TFAutoModelForSequenceClassification, - TFTrainingArguments, -) - -os.environ["WANDB_DISABLED"] = "true" - - -class TestTFQuantization(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model = TFAutoModelForSequenceClassification.from_pretrained( - 'hf-internal-testing/tiny-random-DistilBertForSequenceClassification' - ) - raw_datasets = load_dataset("glue", "sst2")["validation"] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-DistilBertForSequenceClassification") - non_label_column_names = [name for name in raw_datasets.column_names if name != "label"] - def preprocess_function(examples): - # Tokenize the texts - args = ( - (examples["sentence"],) - ) - result = tokenizer(*args, padding=True, max_length=64, truncation=True) - - return result - raw_datasets = raw_datasets.map(preprocess_function, batched=True, load_from_cache_file=False) - data_collator = DefaultDataCollator(return_tensors="tf") - dataset = raw_datasets.select(range(10)) - self.dummy_dataset = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in - set(non_label_column_names + ["label"])], - shuffle=False, - batch_size=2, - collate_fn=data_collator, - drop_remainder=False, - # `label_cols` is needed for user-defined losses, such as in this example - # datasets v2.3.x need "labels", not "label" - label_cols=["labels"] if "label" in dataset.column_names else None, - ) - - - @classmethod - def tearDownClass(self): - shutil.rmtree('./tmp', ignore_errors=True) - shutil.rmtree('./quantized_model', 
ignore_errors=True) - - def test_tf_model_quant(self): - parser = HfArgumentParser(TFTrainingArguments) - args = parser.parse_args_into_dataclasses(args=["--output_dir", "./quantized_model", - "--per_device_eval_batch_size", "2"]) - metric = load_metric("glue", "sst2") - def compute_metrics(preds, label_ids): - preds = preds["logits"] - preds = np.argmax(preds, axis=1) - result = metric.compute(predictions=preds, references=label_ids) - if len(result) > 1: - result["combined_score"] = np.mean(list(result.values())).item() - return result - self.optimizer = TFOptimization( - model=self.model, - args=args[0], - compute_metrics=compute_metrics - ) - tune_metric = metrics.Metric( - name="accuracy", greater_is_better=True, is_relative=False, criterion=0.5 - ) - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance] - ) - quantized_model = self.optimizer.quantize(quant_config=quantization_config, - train_dataset=self.dummy_dataset, eval_dataset=self.dummy_dataset) - loaded_model = tf.saved_model.load(args[0].output_dir) - - def eval_func(model): - return 1 - - def train_func(model): - return model - - self.optimizer.quantize(quant_config=quantization_config, - train_func=train_func, - eval_func=eval_func) - - quantization_config = QuantizationConfig( - framework="tensorflow", - approach="POSTTRAININGSTATIC", - metrics=[tune_metric], - objectives=[objectives.performance], - recipes={"first_conv_or_matmul_quantization": True, - "last_conv_or_matmul_quantization": True, - } - ) - self.optimizer.quantize(quant_config=quantization_config, - train_func=train_func, - eval_func=eval_func) - - -if __name__ == "__main__": - unittest.main() diff --git a/workflows/chatbot/inference/README.md b/workflows/chatbot/inference/README.md index ba5da39484a..1d9598953a1 100644 --- a/workflows/chatbot/inference/README.md +++ b/workflows/chatbot/inference/README.md @@ -49,7 +49,7 @@ 
numactl -m -C python generate.py \ To enable FP32 inference, you can add the parameter `--dtype "float32"`. To check the statistical information of inference, you can add the parameter `--return_stats`. ## LLama2 INT8 Inference -[Llama2](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) int8 inference demonstrates in [int8_llama2](https://github.com/intel/intel-extension-for-transformers/tree/int8_llama2/workflows/chatbot/inference) branch and need install Intel-extension-for-pytorch [llm_feature_branch](https://github.com/intel/intel-extension-for-pytorch/tree/llm_feature_branch) branch. Please follow the [README.md](https://github.com/intel/intel-extension-for-transformers/blob/81a4484dcc93f09d7609e6896fe3fbc22756975b/workflows/chatbot/inference/README.md) to setup the environments and make quantization. +[Llama2](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) int8 inference demonstrates in [int8_llama2](https://github.com/intel/intel-extension-for-transformers/tree/int8_llama2/workflows/chatbot/inference) branch and need install Intel-extension-for-pytorch [llm_feature_branch](https://github.com/intel/intel-extension-for-pytorch/tree/llm_feature_branch) branch. Please follow the [README.md](https://github.com/intel/intel-extension-for-transformers/blob/81a4484dcc93f09d7609e6896fe3fbc22756975b/workflows/chatbot/inference/README.md) to set up the environments and make quantization. # Inference on Habana Gaudi @@ -107,7 +107,7 @@ python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \ Habana supports HPU graph mode for inference speedup, which is available for bloom, gpt2, opt, gptj, gpt_neox, mpt, llama. You can use the parameter `use_hpu_graphs` to speed up the inference. -you can use '--peft_model_path' to apply you peft finetuned output model during generation. +you can use '--peft_model_path' to apply your peft finetuned output model during generation. 
```bash python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \ @@ -122,7 +122,7 @@ python ../utils/gaudi_spawn.py --use_deepspeed --world_size 8 generate.py \ # Additional Notes -Here are the explanations of parameters in generate.py: +Here are the explanations of the parameters in generate.py: `--temperature`: Controls the diversity of generated text. Lower values result in more deterministic outputs. The default value is 0.1. `--top_p`: During text generation, only consider tokens with cumulative probability up to this value. This parameter helps to avoid extremely low probability tokens. The default value is 0.75. `--top_k`: The number of highest probability vocabulary tokens to consider for each step of text generation. The default value is 40. diff --git a/workflows/chatbot/inference/backend/fastrag/fastrag_service.py b/workflows/chatbot/inference/backend/fastrag/fastrag_service.py index 7d01c0d1659..fd3ec89e329 100644 --- a/workflows/chatbot/inference/backend/fastrag/fastrag_service.py +++ b/workflows/chatbot/inference/backend/fastrag/fastrag_service.py @@ -46,6 +46,7 @@ from database.mysqldb import MysqlDb from starlette.responses import RedirectResponse from mysqldb import MysqlDb +from werkzeug.utils import secure_filename logger = build_logger("fastrag_service", f"fastrag_service.log") parser = argparse.ArgumentParser() @@ -473,7 +474,7 @@ def query(request: QueryRequest): if request.blob: file_content = base64.b64decode(request.blob) random_suffix = str(uuid.uuid4().hex) - sanitized_filename = os.path.basename(request.filename) + sanitized_filename = secure_filename(request.filename) file_path = f"/tmp/customized_doc_{random_suffix}_{sanitized_filename}" with open(file_path, "wb") as f: f.write(file_content) diff --git a/workflows/chatbot/inference/requirements.txt b/workflows/chatbot/inference/requirements.txt index df899c47d4f..e99461277fa 100644 --- a/workflows/chatbot/inference/requirements.txt +++ 
b/workflows/chatbot/inference/requirements.txt @@ -9,3 +9,4 @@ peft rouge_score sentencepiece torch +werkzeug diff --git a/workflows/compression_aware_training/config/README.md b/workflows/compression_aware_training/config/README.md index c72b4ba0af9..86e13e21260 100644 --- a/workflows/compression_aware_training/config/README.md +++ b/workflows/compression_aware_training/config/README.md @@ -23,7 +23,7 @@ output_dir: Path to output directory. overwrite_output_dir: Whether to overwrite Output cache. perf_tol: Performance tolerance when optimizing the model. quantization: Needs to be true in this case. -quantization_approach: Quantization approach. Supported approach are PostTrainingStatic, PostTrainingDynamic and QuantizationAwareTraining. +quantization_approach: Quantization approach. Supported approach are static, dynamic and qat. is_relative: Metric tolerance model, expected to be relative or absolute. int8: Load int8 model. ``` @@ -41,7 +41,7 @@ output_dir: Path to output directory. overwrite_output_dir: Whether to overwrite Output cache. perf_tol: Performance tolerance when optimizing the model. quantization: Needs to be true in this case. -quantization_approach: Quantization approach. Supported approach are PostTrainingStatic, PostTrainingDynamic and QuantizationAwareTraining. +quantization_approach: Quantization approach. Supported approach are static, dynamic and qat. is_relative: Metric tolerance model, expected to be relative or absolute. int8: Load int8 model. 
``` diff --git a/workflows/compression_aware_training/config/config.yaml b/workflows/compression_aware_training/config/config.yaml index 48e31757b6e..0bc18386cfe 100755 --- a/workflows/compression_aware_training/config/config.yaml +++ b/workflows/compression_aware_training/config/config.yaml @@ -25,6 +25,6 @@ overwrite_output_dir: true perf_tol: 0.03 quantization: true -quantization_approach: "QuantizationAwareTraining" +quantization_approach: "qat" is_relative: true int8: false diff --git a/workflows/compression_aware_training/config/distillation_with_qat.yaml b/workflows/compression_aware_training/config/distillation_with_qat.yaml index 48e31757b6e..0bc18386cfe 100755 --- a/workflows/compression_aware_training/config/distillation_with_qat.yaml +++ b/workflows/compression_aware_training/config/distillation_with_qat.yaml @@ -25,6 +25,6 @@ overwrite_output_dir: true perf_tol: 0.03 quantization: true -quantization_approach: "QuantizationAwareTraining" +quantization_approach: "qat" is_relative: true int8: false diff --git a/workflows/compression_aware_training/config/qat.yaml b/workflows/compression_aware_training/config/qat.yaml index faf0416ed2f..be783e839bf 100644 --- a/workflows/compression_aware_training/config/qat.yaml +++ b/workflows/compression_aware_training/config/qat.yaml @@ -24,6 +24,6 @@ overwrite_output_dir: true perf_tol: 0.03 quantization: true -quantization_approach: "QuantizationAwareTraining" +quantization_approach: "qat" is_relative: true int8: false diff --git a/workflows/compression_aware_training/config/sat.yaml b/workflows/compression_aware_training/config/sat.yaml index 7731f0dfb69..439828b0f1f 100755 --- a/workflows/compression_aware_training/config/sat.yaml +++ b/workflows/compression_aware_training/config/sat.yaml @@ -16,7 +16,7 @@ model_name_or_path: "Intel/distilbert-base-uncased-sparse-90-unstructured-pruneo teacher_model_name_or_path: "distilbert-base-uncased-finetuned-sst-2-english" task_name: "sst2" sat: true -quantization_approach: 
"QuantizationAwareTraining" +quantization_approach: "qat" learning_rate: 0.000012 num_train_epochs: 6 do_train: true diff --git a/workflows/compression_aware_training/src/itrex_opt.py b/workflows/compression_aware_training/src/itrex_opt.py index b727d22c412..fcfd5f7eab7 100755 --- a/workflows/compression_aware_training/src/itrex_opt.py +++ b/workflows/compression_aware_training/src/itrex_opt.py @@ -28,14 +28,19 @@ # Need to use itrex domain toolkit from intel_extension_for_transformers.transformers import ( - DistillationConfig, - PrunerConfig, - PruningConfig, OptimizedModel, - QuantizationConfig, metrics, objectives, ) +from neural_compressor.config import ( + WeightPruningConfig, + DistillationConfig, + KnowledgeDistillationLossConfig, + QuantizationAwareTrainingConfig, + PostTrainingQuantConfig, + TuningCriterion, + AccuracyCriterion +) from intel_extension_for_transformers.transformers.trainer import NLPTrainer from torch.utils.data import DataLoader from tqdm.auto import tqdm @@ -529,7 +534,7 @@ def compute_metrics(p: EvalPrediction): # Initialize and setup our itrexTrainer from neural_compressor.adaptor.torch_utils.symbolic_trace import symbolic_trace - self.model = symbolic_trace(self.model, self.optim_args.quantization_approach=="QuantizationAwareTraining") + self.model = symbolic_trace(self.model, self.optim_args.quantization_approach=="qat") self.trainer = NLPTrainer( model=self.model, @@ -746,30 +751,38 @@ def _do_quantization_aware_training(self): raise ValueError("do_eval must be set to True for quantization.") self.trainer.save_model(self.training_args.output_dir) - if self.optim_args.quantization_approach != "PostTrainingDynamic": + if self.optim_args.quantization_approach != "dynamic": if not self.training_args.do_train: raise ValueError( "do_train must be set to True for static and aware training quantization." 
) - elif self.optim_args.quantization_approach == "QuantizationAwareTraining": - early_stopping_patience = 6 - early_stopping_threshold = 0.001 # optional - # trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, - # early_stopping_threshold)) tune_metric = metrics.Metric( - name=metric_name, - is_relative=self.optim_args.is_relative, - criterion=self.optim_args.perf_tol, + name=metric_name, is_relative=self.optim_args.is_relative, criterion=self.optim_args.perf_tol ) + self.trainer.metrics = tune_metric objective = objectives.performance - quantization_config = QuantizationConfig( - approach=self.optim_args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective], - sampling_size=len(self.train_dataset) // 20, - ) + tuning_criterion = TuningCriterion(max_trials=600, objective=[objective.name]) + accuracy_criterion = AccuracyCriterion( + higher_is_better=True, # optional. + criterion="relative" if self.optim_args.is_relative else "absolute", # optional. Available values are "relative" and "absolute". + tolerable_loss=self.optim_args.perf_tol, # optional. 
+ ) + if self.optim_args.quantization_approach != "qat": + quantization_config = PostTrainingQuantConfig( + approach=self.optim_args.quantization_approach, + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + else: + quantization_config = QuantizationAwareTrainingConfig( + tuning_criterion=tuning_criterion, + accuracy_criterion=accuracy_criterion + ) + early_stopping_patience = 2 + early_stopping_threshold = 0.001 # optional + self.trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, \ + early_stopping_threshold)) model = self.trainer.quantize(quant_config=quantization_config) if self.optim_args.benchmark or self.optim_args.accuracy_only: @@ -939,23 +952,15 @@ def get_logits(teacher_model, train_dataset, teacher_train_dataset): tune_metric = metrics.Metric( name=metric_name, is_relative=self.optim_args.is_relative, criterion=self.optim_args.perf_tol ) - prune_type = 'PatternLock' \ - if self.optim_args.pruning_approach else self.optim_args.pruning_approach - target_sparsity_ratio = self.optim_args.target_sparsity_ratio \ - if self.optim_args.target_sparsity_ratio else None - pruner_config = PrunerConfig(prune_type=prune_type, target_sparsity_ratio=target_sparsity_ratio) - pruning_conf = PruningConfig(framework="pytorch_fx",pruner_config=[pruner_config], metrics=tune_metric) - distillation_conf = DistillationConfig(framework="pytorch_fx", metrics=tune_metric) - - objective = objectives.performance - quantization_conf = QuantizationConfig( - approach=self.optim_args.quantization_approach, - max_trials=600, - metrics=[tune_metric], - objectives=[objective] - ) + self.trainer.metrics = tune_metric + pruning_conf = WeightPruningConfig([{"start_step": 0, "end_step": 2}], + target_sparsity=self.optim_args.target_sparsity_ratio, + pruning_scope="local") + distillation_criterion = KnowledgeDistillationLossConfig(loss_types=["CE", "KL"]) + distillation_conf = DistillationConfig(teacher_model=self.teacher_model, 
criterion=distillation_criterion) + quantization_conf = QuantizationAwareTrainingConfig() conf_list = [pruning_conf, distillation_conf, quantization_conf] - model = self.trainer.orchestrate_optimizations(config_list=conf_list, teacher_model=self.teacher_model) + model = self.trainer.orchestrate_optimizations(config_list=conf_list) # ############################################################ print( diff --git a/workflows/compression_aware_training/src/utils.py b/workflows/compression_aware_training/src/utils.py index 46467c2b6ab..2ce1a4c819e 100755 --- a/workflows/compression_aware_training/src/utils.py +++ b/workflows/compression_aware_training/src/utils.py @@ -187,7 +187,7 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply prune."}, ) pruning_approach: Optional[str] = field( - default="BasicMagnitude", + default="magnitude", metadata={"help": "Pruning approach. Supported approach is basic_magnite."}, ) target_sparsity_ratio: Optional[float] = field( @@ -207,9 +207,9 @@ class OptimizationArguments: metadata={"help": "Whether or not to apply quantization."}, ) quantization_approach: Optional[str] = field( - default="PostTrainingStatic", - metadata={"help": "Quantization approach. Supported approach are PostTrainingStatic, " - "PostTrainingDynamic and QuantizationAwareTraining."}, + default="static", + metadata={"help": "Quantization approach. 
Supported approach are static, " + "dynamic and qat."}, ) metric_name: Optional[str] = field( default=None, diff --git a/workflows/dlsa/run_dlsa.py b/workflows/dlsa/run_dlsa.py index 583e37847ea..92d6827998d 100644 --- a/workflows/dlsa/run_dlsa.py +++ b/workflows/dlsa/run_dlsa.py @@ -39,10 +39,10 @@ ) from intel_extension_for_transformers.transformers import ( OptimizedModel, - QuantizationConfig, metrics, objectives, ) +from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion from intel_extension_for_transformers.transformers.trainer import NLPTrainer hf_logging.set_verbosity_info() @@ -288,12 +288,12 @@ def preprocess(examples): if args.do_quantize: with track("Quantize"): metric = metrics.Metric(name="eval_acc", is_relative=True, criterion=0.01) - q_config = QuantizationConfig( - framework="pytorch_ipex", - approach="PostTrainingStatic", - max_trials=200, # set the Max tune times - metrics=[metric], - objectives=[objectives.performance], + trainer.metrics = metric + tuning_criterion = TuningCriterion(max_trials=600) + q_config = PostTrainingQuantConfig( + backend="ipex", + approach="static", + tuning_criterion=tuning_criterion ) def eval_func(model): diff --git a/workflows/hf_finetuning_and_inference_nlp/src/finetune_itrex.py b/workflows/hf_finetuning_and_inference_nlp/src/finetune_itrex.py index 9e3ba13c89d..3cdab98655f 100644 --- a/workflows/hf_finetuning_and_inference_nlp/src/finetune_itrex.py +++ b/workflows/hf_finetuning_and_inference_nlp/src/finetune_itrex.py @@ -24,9 +24,6 @@ Trainer, ) from intel_extension_for_transformers.transformers import ( - QuantizationConfig, - PruningConfig, - PrunerConfig, metrics, objectives, ) diff --git a/workflows/hf_finetuning_and_inference_nlp/src/infer_itrex.py b/workflows/hf_finetuning_and_inference_nlp/src/infer_itrex.py index b666c6f8bbc..3b6c743c485 100644 --- a/workflows/hf_finetuning_and_inference_nlp/src/infer_itrex.py +++ b/workflows/hf_finetuning_and_inference_nlp/src/infer_itrex.py @@ -17,7 
+17,6 @@ import torch from os import path from intel_extension_for_transformers.transformers import ( - QuantizationConfig, metrics, objectives, ) @@ -29,7 +28,7 @@ DataCollatorWithPadding, Trainer, ) - +from neural_compressor.config import PostTrainingQuantConfig, TuningCriterion from infer import DlsaInference from utils import PredsLabels, compute_metrics, save_performance_metrics @@ -76,12 +75,11 @@ def _load_model(self): ) metric = metrics.Metric(name="eval_acc", is_relative=True, criterion=0.03) - q_config = QuantizationConfig( - framework="pytorch", - approach="PostTrainingStatic", - max_trials=200, # set the Max tune times - metrics=[metric], - objectives=[objectives.performance], + self.trainer.metrics = metric + tuning_criterion = TuningCriterion(max_trials=200) + q_config = PostTrainingQuantConfig( + approach="static", + tuning_criterion=tuning_criterion, ) eval_dataloader = self.trainer.get_eval_dataloader() self.model = self.trainer.quantize(