diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c414901c4f5..ecd7066931a9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -13,6 +13,7 @@ jobs: check_circleci_user: docker: - image: python:3.10-slim + resource_class: small parallelism: 1 steps: - run: echo $CIRCLE_PROJECT_USERNAME @@ -58,14 +59,14 @@ jobs: name: "Prepare pipeline parameters" command: | python utils/process_test_artifacts.py - + # To avoid too long generated_config.yaml on the continuation orb, we pass the links to the artifacts as parameters. # Otherwise the list of tests was just too big. Explicit is good but for that it was a limitation. # We used: # https://circleci.com/docs/api/v2/index.html#operation/getJobArtifacts : to get the job artifacts # We could not pass a nested dict, which is why we create the test_file_... parameters for every single job - + - store_artifacts: path: test_preparation/transformed_artifacts.json - store_artifacts: diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 7ccf5ec96cec..71c75dac2ff0 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -32,7 +32,7 @@ "RUN_PT_FLAX_CROSS_TESTS": False, } # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical -COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsf":None} +COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsfE":None} DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}] @@ -40,9 +40,23 @@ class EmptyJob: job_name = "empty" def to_dict(self): + steps = [{"run": 'ls -la'}] + if self.job_name == "collection_job": + steps.extend( + [ + "checkout", + {"run": "pip install requests || true"}, + {"run": """while [[ $(curl --location --request GET "https://circleci.com/api/v2/workflow/$CIRCLE_WORKFLOW_ID/job" --header "Circle-Token: $CCI_TOKEN"| jq -r '.items[]|select(.name != "collection_job")|.status' | grep -c "running") -gt 0 ]]; do sleep 5; done || true"""}, + {"run": 'python utils/process_circleci_workflow_test_reports.py --workflow_id $CIRCLE_WORKFLOW_ID || true'}, + {"store_artifacts": {"path": "outputs"}}, + {"run": 'echo "All required jobs have now completed"'}, + ] + ) + return { "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE), - "steps":["checkout"], + "resource_class": "small", + "steps": steps, } @@ -54,9 +68,9 @@ class CircleCIJob: install_steps: List[str] = None marker: Optional[str] = None parallelism: Optional[int] = 0 - pytest_num_workers: int = 12 + pytest_num_workers: int = 8 pytest_options: Dict[str, Any] = None - resource_class: Optional[str] = "2xlarge" + resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None num_test_files_per_worker: Optional[int] = 10 # This should be only used for doctest job! 
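For context on the comment above about passing artifact links as flat pipeline parameters: the sketch below shows, under stated assumptions, how the CircleCI API v2 `getJobArtifacts` endpoint (linked in that comment) could be turned into one `<job>_test_list` string parameter per job. This is not the actual `utils/process_test_artifacts.py` (its implementation is not part of this diff); the helper name, project slug, job number, and parameter-naming scheme are assumptions for illustration only.

```python
# Illustrative sketch only -- the real logic lives in utils/process_test_artifacts.py,
# which is not shown in this diff. Names, slug, job number, and key scheme are assumptions.
import os

import requests


def artifact_links_as_parameters(project_slug: str, job_number: int) -> dict:
    """Map each `<job>_test_list.txt` artifact of a CI job to a flat `<job>_test_list` parameter."""
    # CircleCI API v2 `getJobArtifacts` endpoint, as referenced in the comment above.
    url = f"https://circleci.com/api/v2/project/{project_slug}/{job_number}/artifacts"
    response = requests.get(url, headers={"Circle-Token": os.environ["CIRCLE_TOKEN"]}, timeout=30)
    response.raise_for_status()
    parameters = {}
    for item in response.json().get("items", []):
        name = os.path.basename(item["path"])
        if name.endswith("_test_list.txt"):
            # One flat string parameter per job: nested dicts cannot be passed to the
            # continuation orb, so the generated config later interpolates these URLs
            # into each job's `curl -L -o <job>_test_list.txt <<url>>` step.
            parameters[name.removesuffix(".txt")] = item["url"]
    return parameters


if __name__ == "__main__":
    # Hypothetical project slug and job number, for illustration only.
    print(artifact_links_as_parameters("gh/huggingface/transformers", 12345))
```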
@@ -133,7 +147,7 @@ def to_dict(self): "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""} }, {"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}, - {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <>' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, + {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <> --header "Circle-Token: $CIRCLE_TOKEN"' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, {"run": {"name": "Split tests across parallel nodes: show current parallel tests", "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt" } @@ -185,7 +199,6 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="not generate", parallelism=6, - pytest_num_workers=8 ) generate_job = CircleCIJob( @@ -193,28 +206,24 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="generate", parallelism=6, - pytest_num_workers=8 ) tokenization_job = CircleCIJob( "tokenization", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, - pytest_num_workers=16 ) processor_job = CircleCIJob( "processors", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, - pytest_num_workers=6 ) tf_job = CircleCIJob( "tf", docker_image=[{"image":"huggingface/transformers-tf-light"}], parallelism=6, - pytest_num_workers=16, ) @@ -222,7 +231,8 @@ def job_name(self): "flax", docker_image=[{"image":"huggingface/transformers-jax-light"}], parallelism=6, - pytest_num_workers=16 + pytest_num_workers=16, + resource_class="2xlarge", ) @@ -231,7 +241,7 @@ def job_name(self): additional_env={"RUN_PIPELINE_TESTS": True}, docker_image=[{"image":"huggingface/transformers-torch-light"}], marker="is_pipeline_test", - parallelism=4 + parallelism=4, ) @@ -240,7 +250,7 @@ def job_name(self): additional_env={"RUN_PIPELINE_TESTS": True}, docker_image=[{"image":"huggingface/transformers-tf-light"}], marker="is_pipeline_test", - parallelism=4 + parallelism=4, ) @@ -257,7 +267,6 @@ def job_name(self): docker_image=[{"image":"huggingface/transformers-examples-torch"}], # TODO @ArthurZucker remove this once docker is easier to build install_steps=["uv venv && uv pip install . 
&& uv pip install -r examples/pytorch/_tests_requirements.txt"], - pytest_num_workers=8, ) @@ -265,7 +274,6 @@ def job_name(self): "examples_tensorflow", additional_env={"OMP_NUM_THREADS": 8}, docker_image=[{"image":"huggingface/transformers-examples-tf"}], - pytest_num_workers=16, ) @@ -280,6 +288,7 @@ def job_name(self): ], marker="is_staging_test", pytest_num_workers=2, + resource_class="medium", ) @@ -292,13 +301,13 @@ def job_name(self): ], pytest_options={"k onnx": None}, pytest_num_workers=1, + resource_class="small", ) exotic_models_job = CircleCIJob( "exotic_models", docker_image=[{"image":"huggingface/transformers-exotic-models"}], - pytest_num_workers=12, parallelism=4, pytest_options={"durations": 100}, ) @@ -317,7 +326,6 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="not generate", parallelism=6, - pytest_num_workers=8, ) @@ -352,6 +360,7 @@ def job_name(self): DOC_TESTS = [doc_test_job] ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip + def create_circleci_config(folder=None): if folder is None: folder = os.getcwd() @@ -361,7 +370,13 @@ def create_circleci_config(folder=None): if len(jobs) == 0: jobs = [EmptyJob()] - print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}) + else: + print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}) + # Add a job waiting all the test jobs and aggregate their test summary files at the end + collection_job = EmptyJob() + collection_job.job_name = "collection_job" + jobs = [collection_job] + jobs + config = { "version": "2.1", "parameters": { @@ -371,9 +386,14 @@ def create_circleci_config(folder=None): **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}, **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs}, }, - "jobs" : {j.job_name: j.to_dict() for j in jobs}, - "workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} + "jobs": {j.job_name: j.to_dict() for j in jobs} } + if "CIRCLE_TOKEN" in os.environ: + # For private forked repo. (e.g. new model addition) + config["workflows"] = {"version": 2, "run_tests": {"jobs": [{j.job_name: {"context": ["TRANSFORMERS_CONTEXT"]}} for j in jobs]}} + else: + # For public repo. (e.g. 
`transformers`) + config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} with open(os.path.join(folder, "generated_config.yml"), "w") as f: f.write(yaml.dump(config, sort_keys=False, default_flow_style=False).replace("' << pipeline", " << pipeline").replace(">> '", " >>")) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index eaa4b3b2f824..1bbd1c1e94d0 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -63,7 +63,7 @@ jobs: commit_id=$GITHUB_SHA fi commit_msg=$(git show -s --format=%s | cut -c1-70) - python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" + python3 benchmark/benchmarks_entrypoint.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" env: HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} # Enable this to see debug logs diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml index 1887af0f4c5b..7294777655e1 100644 --- a/.github/workflows/push-important-models.yml +++ b/.github/workflows/push-important-models.yml @@ -134,10 +134,3 @@ jobs: slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} waitForSSH: true - - benchmark: - name: Benchmark workflow - needs: get_modified_models - if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }} - uses: ./.github/workflows/benchmark.yml - secrets: inherit diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml new file mode 100644 index 000000000000..3f2b637e047c --- /dev/null +++ b/.github/workflows/self-comment-ci.yml @@ -0,0 +1,313 @@ +name: PR comment GitHub CI + +on: + issue_comment: + types: + - created + branches-ignore: + - main +concurrency: + group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow') }} + cancel-in-progress: true +permissions: read-all + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. + # This token is created under the bot `hf-transformers-bot`. 
+ HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + RUN_PT_TF_CROSS_TESTS: 1 + CUDA_VISIBLE_DEVICES: 0,1 + +jobs: + get-pr-number: + runs-on: ubuntu-22.04 + name: Get PR number + # For security: only allow team members to run + if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} + outputs: + PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }} + steps: + - name: Get PR number + shell: bash + run: | + if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then + echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV + else + echo "PR_NUMBER=" >> $GITHUB_ENV + fi + + - name: Check PR number + shell: bash + run: | + echo "${{ env.PR_NUMBER }}" + + - name: Set PR number + id: set_pr_number + run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT" + + get-sha: + runs-on: ubuntu-22.04 + needs: get-pr-number + if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}} + outputs: + PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }} + PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: "0" + ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge" + + - name: Get SHA (and verify timestamps against the issue comment date) + id: get_sha + env: + PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }} + COMMENT_DATE: ${{ github.event.comment.created_at }} + run: | + git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head + git checkout refs/remotes/pull/$PR_NUMBER/head + echo "PR_HEAD_SHA: $(git log -1 --format=%H)" + echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT" + git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge + git checkout refs/remotes/pull/$PR_NUMBER/merge + echo "PR_MERGE_SHA: $(git log -1 --format=%H)" + echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT" + PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd) + echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP" + COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s") + echo "COMMENT_DATE: $COMMENT_DATE" + echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP" + if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then + echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!"; + exit -1; + fi + + # use a python script to handle this complex logic + # case 1: `run-slow` (auto. 
infer with limited number of models, but in particular, new model) + # case 2: `run-slow model_1, model_2` + get-tests: + runs-on: ubuntu-22.04 + needs: [get-pr-number, get-sha] + if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}} + outputs: + models: ${{ steps.models_to_run.outputs.models }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: "0" + ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge" + + - name: Verify merge commit SHA + env: + VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }} + run: | + PR_MERGE_SHA=$(git log -1 --format=%H) + if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then + echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!"; + exit -1; + fi + + - name: Get models to test + env: + PR_COMMENT: ${{ github.event.comment.body }} + run: | + python -m pip install GitPython + python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt + echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV + + - name: Show models to test + id: models_to_run + run: | + echo "${{ env.models }}" + echo "models=${{ env.models }}" >> $GITHUB_ENV + echo "models=${{ env.models }}" >> $GITHUB_OUTPUT + + reply_to_comment: + name: Reply to the comment + if: ${{ needs.get-tests.outputs.models != '[]' }} + needs: [get-pr-number, get-tests] + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - name: Reply to the comment + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + MODELS: ${{ needs.get-tests.outputs.models }} + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \ + -f "body=This comment contains run-slow, running the specified jobs: ${{ env.MODELS }} ..." + + create_run: + name: Create run + if: ${{ needs.get-tests.outputs.models != '[]' }} + needs: [get-sha, get-tests, reply_to_comment] + permissions: + statuses: write + runs-on: ubuntu-22.04 + steps: + - name: Create Run + id: create_run + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`. + # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \ + -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests" + + run_models_gpu: + name: Run all tests for the model + if: ${{ needs.get-tests.outputs.models != '[]' }} + needs: [get-pr-number, get-sha, get-tests, create_run] + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.get-tests.outputs.models) }} + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Echo input and matrix info + shell: bash + run: | + echo "${{ matrix.folders }}" + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. 
var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Checkout to PR merge commit + working-directory: /transformers + run: | + git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge + git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge + git log -1 --format=%H + + - name: Verify merge commit SHA + env: + VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }} + working-directory: /transformers + run: | + PR_MERGE_SHA=$(git log -1 --format=%H) + if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then + echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!"; + exit -1; + fi + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: | + export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})" + echo $CUDA_VISIBLE_DEVICES + python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: Make sure report directory exists + shell: bash + run: | + mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + + update_run_status: + name: Update Check Run Status + needs: [get-sha, create_run, run_models_gpu] + permissions: + statuses: write + if: ${{ always() && needs.create_run.result == 'success' }} + 
runs-on: ubuntu-22.04 + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + steps: + - name: Get `run_models_gpu` job status + run: | + echo "${{ needs.run_models_gpu.result }}" + if [ "${{ needs.run_models_gpu.result }}" = "cancelled" ]; then + echo "STATUS=failure" >> $GITHUB_ENV + elif [ "${{ needs.run_models_gpu.result }}" = "skipped" ]; then + echo "STATUS=success" >> $GITHUB_ENV + else + echo "STATUS=${{ needs.run_models_gpu.result }}" >> $GITHUB_ENV + fi + + - name: Update PR commit statuses + run: | + echo "${{ needs.run_models_gpu.result }}" + echo "${{ env.STATUS }}" + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \ + -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests" diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml index 142399a6366c..46d811d4a433 100644 --- a/.github/workflows/self-nightly-past-ci-caller.yml +++ b/.github/workflows/self-nightly-past-ci-caller.yml @@ -21,39 +21,6 @@ jobs: echo "$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" echo "run_number=$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" >> $GITHUB_OUTPUT - run_past_ci_pytorch_1-13: - name: PyTorch 1.13 - needs: get_number - if: needs.get_number.outputs.run_number == 0 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.13" - sha: ${{ github.sha }} - secrets: inherit - - run_past_ci_pytorch_1-12: - name: PyTorch 1.12 - needs: get_number - if: needs.get_number.outputs.run_number == 1 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.12" - sha: ${{ github.sha }} - secrets: inherit - - run_past_ci_pytorch_1-11: - name: PyTorch 1.11 - needs: get_number - if: needs.get_number.outputs.run_number == 2 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.11" - sha: ${{ github.sha }} - secrets: inherit - run_past_ci_tensorflow_2-11: name: TensorFlow 2.11 needs: get_number diff --git a/.github/workflows/self-pr-slow-ci.yml b/.github/workflows/self-pr-slow-ci.yml deleted file mode 100644 index 43fcecd8def2..000000000000 --- a/.github/workflows/self-pr-slow-ci.yml +++ /dev/null @@ -1,151 +0,0 @@ -name: PR slow CI - -on: - pull_request: - paths: - - "src/transformers/models/*/modeling_*.py" - - "tests/**/test_*.py" - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -env: - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - OMP_NUM_THREADS: 8 - MKL_NUM_THREADS: 8 - RUN_SLOW: yes - # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. - # This token is created under the bot `hf-transformers-bot`. 
- HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 - CUDA_VISIBLE_DEVICES: 0,1 - -jobs: - find_models_to_run: - runs-on: ubuntu-22.04 - name: Find models to run slow tests - # Triggered only if the required label `run-slow` is added - if: ${{ contains(github.event.pull_request.labels.*.name, 'run-slow') }} - outputs: - models: ${{ steps.models_to_run.outputs.models }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: "0" - ref: ${{ github.event.pull_request.head.sha }} - - - name: Get commit message - run: | - echo "commit_message=$(git show -s --format=%s)" >> $GITHUB_ENV - - - name: Get models to run slow tests - run: | - echo "${{ env.commit_message }}" - python -m pip install GitPython - python utils/pr_slow_ci_models.py --commit_message "${{ env.commit_message }}" | tee output.txt - echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV - - - name: Models to run slow tests - id: models_to_run - run: | - echo "${{ env.models }}" - echo "models=${{ env.models }}" >> $GITHUB_OUTPUT - - run_models_gpu: - name: Run all tests for the model - # Triggered only `find_models_to_run` is triggered (label `run-slow` is added) which gives the models to run - # (either a new model PR or via a commit message) - if: ${{ needs.find_models_to_run.outputs.models != '[]' }} - needs: find_models_to_run - strategy: - fail-fast: false - matrix: - folders: ${{ fromJson(needs.find_models_to_run.outputs.models) }} - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] - runs-on: - group: '${{ matrix.machine_type }}' - container: - image: huggingface/transformers-all-latest-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Echo input and matrix info - shell: bash - run: | - echo "${{ matrix.folders }}" - - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/merge && git checkout pull/${{ github.event.pull_request.number }}/merge - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
&& python3 -m pip install --upgrade torch torchaudio torchvision - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Set `machine_type` for report and artifact names - working-directory: /transformers - shell: bash - run: | - echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then - machine_type=single-gpu - elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then - machine_type=multi-gpu - else - machine_type=${{ matrix.machine_type }} - fi - echo "$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: | - export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})" - echo $CUDA_VISIBLE_DEVICES - python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt - - - name: Make sure report directory exists - shell: bash - run: | - mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" - - - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports diff --git a/.github/workflows/self-push-amd-mi210-caller.yml b/.github/workflows/self-push-amd-mi210-caller.yml index 3f5c74303d00..267e8413d67e 100644 --- a/.github/workflows/self-push-amd-mi210-caller.yml +++ b/.github/workflows/self-push-amd-mi210-caller.yml @@ -1,28 +1,28 @@ -name: Self-hosted runner (AMD mi210 CI caller) - -on: - workflow_run: - workflows: ["Self-hosted runner (push-caller)"] - branches: ["main"] - types: [completed] - push: - branches: - - run_amd_push_ci_caller* - paths: - - "src/**" - - "tests/**" - - ".github/**" - - "templates/**" - - "utils/**" - pull_request: - types: [opened, reopened, synchronize] - branches: ["main"] - -jobs: - run_amd_ci: - name: AMD mi210 - if: (cancelled() != true) && (github.event_name != 'schedule') && (github.event_name == 'pull_request') - uses: ./.github/workflows/self-push-amd.yml - with: - gpu_flavor: mi210 - secrets: inherit +name: Self-hosted runner (AMD mi210 CI caller) + +on: + #workflow_run: + # workflows: ["Self-hosted runner (push-caller)"] + # branches: ["main"] + # types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + pull_request: + types: [opened, reopened, synchronize] + branches: ["main"] + +jobs: + run_amd_ci: + name: AMD mi210 + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') 
&& startsWith(github.ref_name, 'run_amd_push_ci_caller'))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi210 + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/self-push-amd-mi250-caller.yml b/.github/workflows/self-push-amd-mi250-caller.yml index a5442c38c16e..84d04874de70 100644 --- a/.github/workflows/self-push-amd-mi250-caller.yml +++ b/.github/workflows/self-push-amd-mi250-caller.yml @@ -1,28 +1,28 @@ -name: Self-hosted runner (AMD mi250 CI caller) - -on: - workflow_run: - workflows: ["Self-hosted runner (push-caller)"] - branches: ["main"] - types: [completed] - push: - branches: - - run_amd_push_ci_caller* - paths: - - "src/**" - - "tests/**" - - ".github/**" - - "templates/**" - - "utils/**" - pull_request: - types: [opened, reopened, synchronize] - branches: ["main"] - -jobs: - run_amd_ci: - name: AMD mi250 - if: (cancelled() != true) && (github.event_name != 'schedule') && (github.event_name == 'pull_request') - uses: ./.github/workflows/self-push-amd.yml - with: - gpu_flavor: mi250 - secrets: inherit +name: Self-hosted runner (AMD mi250 CI caller) + +on: + #workflow_run: + # workflows: ["Self-hosted runner (push-caller)"] + # branches: ["main"] + # types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + pull_request: + types: [opened, reopened, synchronize] + branches: ["main"] + +jobs: + run_amd_ci: + name: AMD mi250 + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi250 + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/self-push-amd-mi300-caller.yml b/.github/workflows/self-push-amd-mi300-caller.yml index a8ee4e540ecf..797916125a24 100644 --- a/.github/workflows/self-push-amd-mi300-caller.yml +++ b/.github/workflows/self-push-amd-mi300-caller.yml @@ -1,10 +1,10 @@ name: Self-hosted runner (AMD mi300 CI caller) on: - workflow_run: - workflows: ["Self-hosted runner (push-caller)"] - branches: ["main"] - types: [completed] + #workflow_run: + # workflows: ["Self-hosted runner (push-caller)"] + # branches: ["main"] + # types: [completed] push: branches: - run_amd_push_ci_caller* diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml index 0e0a4560043b..6e558ed0e341 100644 --- a/.github/workflows/self-push-amd.yml +++ b/.github/workflows/self-push-amd.yml @@ -151,13 +151,7 @@ jobs: runs-on: rocm container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - env: - # For the meaning of these environment variables, see the job `Setup` - CI_BRANCH_PUSH: ${{ github.event.ref }} - CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} - CI_SHA_PUSH: ${{ github.event.head_commit.id }} - CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Remove transformers repository (installed during docker image build) diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml 
b/.github/workflows/self-scheduled-amd-mi210-caller.yml index 1c79b38a314e..6109faca0093 100644 --- a/.github/workflows/self-scheduled-amd-mi210-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -1,55 +1,55 @@ -name: Self-hosted runner (AMD mi210 scheduled CI caller) - -on: - workflow_run: - workflows: ["Self-hosted runner (AMD scheduled CI caller)"] - branches: ["main"] - types: [completed] - push: - branches: - - run_amd_scheduled_ci_caller* - -jobs: - model-ci: - name: Model CI - uses: ./.github/workflows/self-scheduled-amd.yml - with: - job: run_models_gpu - slack_report_channel: "#transformers-ci-daily-amd" - runner: mi210 - docker: huggingface/transformers-pytorch-amd-gpu - ci_event: Scheduled CI (AMD) - mi210 - secrets: inherit - - torch-pipeline: - name: Torch pipeline CI - uses: ./.github/workflows/self-scheduled-amd.yml - with: - job: run_pipelines_torch_gpu - slack_report_channel: "#transformers-ci-daily-amd" - runner: mi210 - docker: huggingface/transformers-pytorch-amd-gpu - ci_event: Scheduled CI (AMD) - mi210 - secrets: inherit - - example-ci: - name: Example CI - uses: ./.github/workflows/self-scheduled-amd.yml - with: - job: run_examples_gpu - slack_report_channel: "#transformers-ci-daily-amd" - runner: mi210 - docker: huggingface/transformers-pytorch-amd-gpu - ci_event: Scheduled CI (AMD) - mi210 - secrets: inherit - - deepspeed-ci: - name: DeepSpeed CI - uses: ./.github/workflows/self-scheduled-amd.yml - with: - job: run_torch_cuda_extensions_gpu - slack_report_channel: "#transformers-ci-daily-amd" - runner: mi210 - docker: huggingface/transformers-pytorch-deepspeed-amd-gpu - ci_event: Scheduled CI (AMD) - mi210 - secrets: inherit +name: Self-hosted runner (AMD mi210 scheduled CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (AMD scheduled CI caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_scheduled_ci_caller* + +jobs: + model-ci: + name: Model CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_models_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi210 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi210 + secrets: inherit + + torch-pipeline: + name: Torch pipeline CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_pipelines_torch_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi210 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi210 + secrets: inherit + + example-ci: + name: Example CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_examples_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi210 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi210 + secrets: inherit + + deepspeed-ci: + name: DeepSpeed CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_torch_cuda_extensions_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi210 + docker: huggingface/transformers-pytorch-deepspeed-amd-gpu + ci_event: Scheduled CI (AMD) - mi210 + secrets: inherit diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index fd1513057163..a33b6e579c0e 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml 
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ -1,55 +1,55 @@ -name: Self-hosted runner (AMD mi250 scheduled CI caller) - -on: - workflow_run: - workflows: ["Self-hosted runner (AMD scheduled CI caller)"] - branches: ["main"] - types: [completed] - push: - branches: - - run_amd_scheduled_ci_caller* - -jobs: - model-ci: - name: Model CI - uses: ./.github/workflows/self-scheduled-amd.yml - with: - job: run_models_gpu - slack_report_channel: "#transformers-ci-daily-amd" - runner: mi250 - docker: huggingface/transformers-pytorch-amd-gpu - ci_event: Scheduled CI (AMD) - mi250 - secrets: inherit - - torch-pipeline: - name: Torch pipeline CI - uses: ./.github/workflows/self-scheduled-amd.yml - with: - job: run_pipelines_torch_gpu - slack_report_channel: "#transformers-ci-daily-amd" - runner: mi250 - docker: huggingface/transformers-pytorch-amd-gpu - ci_event: Scheduled CI (AMD) - mi250 - secrets: inherit - - example-ci: - name: Example CI - uses: ./.github/workflows/self-scheduled-amd.yml - with: - job: run_examples_gpu - slack_report_channel: "#transformers-ci-daily-amd" - runner: mi250 - docker: huggingface/transformers-pytorch-amd-gpu - ci_event: Scheduled CI (AMD) - mi250 - secrets: inherit - - deepspeed-ci: - name: DeepSpeed CI - uses: ./.github/workflows/self-scheduled-amd.yml - with: - job: run_torch_cuda_extensions_gpu - slack_report_channel: "#transformers-ci-daily-amd" - runner: mi250 - docker: huggingface/transformers-pytorch-deepspeed-amd-gpu - ci_event: Scheduled CI (AMD) - mi250 - secrets: inherit +name: Self-hosted runner (AMD mi250 scheduled CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (AMD scheduled CI caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_scheduled_ci_caller* + +jobs: + model-ci: + name: Model CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_models_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi250 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi250 + secrets: inherit + + torch-pipeline: + name: Torch pipeline CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_pipelines_torch_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi250 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi250 + secrets: inherit + + example-ci: + name: Example CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_examples_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi250 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi250 + secrets: inherit + + deepspeed-ci: + name: DeepSpeed CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_torch_cuda_extensions_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi250 + docker: huggingface/transformers-pytorch-deepspeed-amd-gpu + ci_event: Scheduled CI (AMD) - mi250 + secrets: inherit diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml deleted file mode 100644 index 47f92cd6a2b0..000000000000 --- a/.github/workflows/self-scheduled-amd.yml +++ /dev/null @@ -1,349 +0,0 @@ -name: Self-hosted runner (scheduled-amd) - -# Note: For the AMD CI, we rely on a caller workflow and on the workflow_call event to trigger the -# 
CI in order to run it on both MI210 and MI250, without having to use matrix here which pushes -# us towards the limit of allowed jobs on GitHub Actions. - -on: - workflow_call: - inputs: - job: - required: true - type: string - slack_report_channel: - required: true - type: string - runner: - required: true - type: string - docker: - required: true - type: string - ci_event: - required: true - type: string - -env: - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - OMP_NUM_THREADS: 8 - MKL_NUM_THREADS: 8 - RUN_SLOW: yes - HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - NUM_SLICES: 2 - -# Important note: each job (run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu, run_pipelines_torch_gpu) requires all the previous jobs before running. -# This is done so that we avoid parallelizing the scheduled tests, to leave available -# runners for the push CI that is running on the same machine. -jobs: - check_runner_status: - name: Check Runner Status - runs-on: ubuntu-22.04 - steps: - - name: Checkout transformers - uses: actions/checkout@v4 - with: - fetch-depth: 2 - - - name: Check Runner Status - run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - - check_runners: - name: Check Runners - needs: check_runner_status - strategy: - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: ROCM-SMI - run: | - rocm-smi - - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - setup: - if: contains(fromJSON('["run_models_gpu"]'), inputs.job) - name: Setup - needs: check_runners - strategy: - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}'] - container: - image: huggingface/transformers-pytorch-amd-gpu - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - outputs: - folder_slices: ${{ steps.set-matrix.outputs.folder_slices }} - slice_ids: ${{ steps.set-matrix.outputs.slice_ids }} - steps: - - name: Update clone - working-directory: /transformers - run: | - git fetch && git checkout ${{ github.sha }} - - - name: Cleanup - working-directory: /transformers - run: | - rm -rf tests/__pycache__ - rm -rf tests/models/__pycache__ - rm -rf reports - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - id: set-matrix - name: Identify models to test - working-directory: /transformers/tests - run: | - echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT - echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT - - - name: ROCM-SMI - run: | - rocm-smi - - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - 
run_models_gpu: - if: ${{ inputs.job == 'run_models_gpu' }} - name: Single GPU tests - needs: setup - strategy: - max-parallel: 1 # For now, not to parallelize. Can change later if it works well. - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} - uses: ./.github/workflows/model_jobs_amd.yml - with: - folder_slices: ${{ needs.setup.outputs.folder_slices }} - machine_type: ${{ matrix.machine_type }} - slice_id: ${{ matrix.slice_id }} - runner: ${{ inputs.runner }} - docker: ${{ inputs.docker }} - secrets: inherit - - run_pipelines_torch_gpu: - if: ${{ inputs.job == 'run_pipelines_torch_gpu' }} - name: PyTorch pipelines - needs: check_runners - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}'] - container: - image: ${{ inputs.docker }} - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: ROCM-SMI - run: | - rocm-smi - - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test" - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports - - run_examples_gpu: - if: ${{ inputs.job == 'run_examples_gpu' }} - name: Examples directory - needs: check_runners - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}'] - container: - image: ${{ inputs.docker }} - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
- - - name: ROCM-SMI - run: | - rocm-smi - - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run examples tests on GPU - working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test" - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports - - run_torch_cuda_extensions_gpu: - if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }} - name: Torch ROCm deepspeed tests - needs: check_runners - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}'] - container: - image: ${{ inputs.docker }} - options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
- - - name: ROCM-SMI - run: | - rocm-smi - - - name: ROCM-INFO - run: | - rocminfo | grep "Agent" -A 14 - - - name: Show ROCR environment - run: | - echo "ROCR: $ROCR_VISIBLE_DEVICES" - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test" - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports - - send_results: - name: Slack Report - needs: [ - check_runner_status, - check_runners, - setup, - run_models_gpu, - run_pipelines_torch_gpu, - run_examples_gpu, - run_torch_cuda_extensions_gpu - ] - if: ${{ always() }} - uses: ./.github/workflows/slack-report.yml - with: - job: ${{ inputs.job }} - # This would be `skipped` if `setup` is skipped. - setup_status: ${{ needs.setup.result }} - slack_report_channel: ${{ inputs.slack_report_channel }} - # This would be an empty string if `setup` is skipped. - folder_slices: ${{ needs.setup.outputs.folder_slices }} - quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} - ci_event: ${{ inputs.ci_event }} - - secrets: inherit diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index ee2962ba89c3..cbea37ff567a 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -70,7 +70,7 @@ jobs: with: name: ci_results_${{ inputs.job }} path: ci_results_${{ inputs.job }} - + - uses: actions/checkout@v4 - uses: actions/download-artifact@v4 - name: Send message to Slack for quantization workflow @@ -90,7 +90,7 @@ jobs: pip install huggingface_hub pip install slack_sdk pip show slack_sdk - python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" + python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - name: Failure table artifacts @@ -98,4 +98,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: ci_results_${{ inputs.job }} - path: ci_results_${{ inputs.job }} \ No newline at end of file + path: ci_results_${{ inputs.job }} diff --git a/README.md b/README.md index c748e6750662..8ab5ceaf7e68 100644 --- a/README.md +++ b/README.md @@ -249,23 +249,43 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta ### With pip -This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+. +This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, and TensorFlow 2.6+. You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). 
If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). First, create a virtual environment with the version of Python you're going to use and activate it. -Then, you will need to install at least one of Flax, PyTorch, or TensorFlow. -Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific installation command for your platform. +**macOS/Linux** + +``` +python -m venv env +source env/bin/activate +``` + +**Windows** + +``` +python -m venv env +env\Scripts\activate +``` + +To use 🤗 Transformers, you must install at least one of Flax, PyTorch, or TensorFlow. Refer to the official installation guides for platform-specific commands: + +[TensorFlow installation page](https://www.tensorflow.org/install/), +[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation). When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows: -```bash +``` pip install transformers ``` If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source). +``` +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install . +``` + ### With conda 🤗 Transformers can be installed using conda as follows: diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 000000000000..a827da444f08 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,49 @@ +# Benchmarks + +You might want to add new benchmarks. + +To do so, define a Python function named `run_benchmark` in a Python file located in this `benchmark/` directory. + +The expected function signature is the following: + +```py +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): +``` + +## Writing metrics to the database + +`MetricsRecorder` is thread-safe, in the sense of the Python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to take the device measurements without blocking the main thread that runs the model measurements. + +See [`llama.py`](./llama.py) for an example of this in practice.
+ +```py +from benchmarks_entrypoint import MetricsRecorder +import psycopg2 + +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): + metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg) + benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id}) + # To collect device measurements + metrics_recorder.collect_device_measurements( + benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes + ) + # To collect your model measurements + metrics_recorder.collect_model_measurements( + benchmark_id, + { + "model_load_time": model_load_time, + "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, + "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, + "first_eager_generate_time_secs": first_eager_generate_time, + "second_eager_generate_time_secs": second_eager_generate_time, + "time_to_first_token_secs": time_to_first_token, + "time_to_second_token_secs": time_to_second_token, + "time_to_third_token_secs": time_to_third_token, + "time_to_next_token_mean_secs": mean_time_to_next_token, + "first_compile_generate_time_secs": first_compile_generate_time, + "second_compile_generate_time_secs": second_compile_generate_time, + "third_compile_generate_time_secs": third_compile_generate_time, + "fourth_compile_generate_time_secs": fourth_compile_generate_time, + }, + ) +``` diff --git a/benchmark/benchmarks_entrypoint.py b/benchmark/benchmarks_entrypoint.py new file mode 100644 index 000000000000..7925e2902834 --- /dev/null +++ b/benchmark/benchmarks_entrypoint.py @@ -0,0 +1,144 @@ +import argparse +import importlib.util +import logging +import os +from typing import Dict +import psycopg2 +import sys + +from psycopg2.extras import Json +from psycopg2.extensions import register_adapter + + +register_adapter(dict, Json) + + +class ImportModuleException(Exception): + pass + + +class MetricsRecorder: + def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str): + self.conn = connection + self.conn.autocommit = True + self.logger = logger + self.branch = branch + self.commit_id = commit_id + self.commit_msg = commit_msg + + def initialise_benchmark(self, metadata: Dict[str, str]) -> int: + """ + Creates a new benchmark, returns the benchmark id + """ + # gpu_name: str, model_id: str + with self.conn.cursor() as cur: + cur.execute( + "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", + (self.branch, self.commit_id, self.commit_msg, metadata), + ) + benchmark_id = cur.fetchone()[0] + logger.debug(f"initialised benchmark #{benchmark_id}") + return benchmark_id + + def collect_device_measurements(self, benchmark_id: int, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes): + """ + Collect device metrics, such as CPU & GPU usage. These are "static", as in you cannot pass arbitrary arguments to the function. 
+        """
+        with self.conn.cursor() as cur:
+            cur.execute(
+                "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
+                (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
+            )
+        self.logger.debug(
+            f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
+        )
+
+    def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]):
+        with self.conn.cursor() as cur:
+            cur.execute(
+                """
+                INSERT INTO model_measurements (
+                    benchmark_id,
+                    measurements
+                ) VALUES (%s, %s)
+                """,
+                (
+                    benchmark_id,
+                    measurements,
+                ),
+            )
+        self.logger.debug(f"inserted model measurements for benchmark #{benchmark_id}: {measurements}")
+
+    def close(self):
+        self.conn.close()
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+handler = logging.StreamHandler(sys.stdout)
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+
+def parse_arguments():
+    """
+    Parse command line arguments for the benchmarking CLI.
+    """
+    parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")
+
+    parser.add_argument(
+        "branch",
+        type=str,
+        help="The branch name on which the benchmarking is performed.",
+    )
+
+    parser.add_argument(
+        "commit_id",
+        type=str,
+        help="The commit hash on which the benchmarking is performed.",
+    )
+
+    parser.add_argument(
+        "commit_msg",
+        type=str,
+        help="The commit message associated with the commit, truncated to 70 characters.",
+    )
+
+    args = parser.parse_args()
+
+    return args.branch, args.commit_id, args.commit_msg
+
+
+def import_from_path(module_name, file_path):
+    try:
+        spec = importlib.util.spec_from_file_location(module_name, file_path)
+        module = importlib.util.module_from_spec(spec)
+        sys.modules[module_name] = module
+        spec.loader.exec_module(module)
+        return module
+    except Exception as e:
+        raise ImportModuleException(f"failed to load python module: {e}")
+
+
+if __name__ == "__main__":
+    benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))
+
+    branch, commit_id, commit_msg = parse_arguments()
+
+    for entry in os.scandir(benchmarks_folder_path):
+        try:
+            if not entry.name.endswith(".py"):
+                continue
+            if entry.path == __file__:
+                continue
+            logger.debug(f"loading: {entry.name}")
+            module = import_from_path(entry.name.split(".")[0], entry.path)
+            logger.info(f"running benchmarks in: {entry.name}")
+            module.run_benchmark(logger, branch, commit_id, commit_msg)
+        except ImportModuleException as e:
+            logger.error(e)
+        except Exception as e:
+            logger.error(f"error running benchmarks for {entry.name}: {e}")
diff --git a/benchmark/default.yml b/benchmark/default.yml
new file mode 100644
index 000000000000..f3f02cab34d1
--- /dev/null
+++ b/benchmark/default.yml
@@ -0,0 +1,10 @@
+apiVersion: 1
+
+providers:
+  - name: 'Transformers Benchmarks'
+    orgId: 1
+    type: file
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/dashboards
diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json
index 3d579f7b3687..caaec78a5223 100644
--- a/benchmark/grafana_dashboard.json
+++ b/benchmark/grafana_dashboard.json
@@ -30,7 +30,7 @@
      "title": "Go to data",
      "tooltip": "Go to data",
      "type": "link",
-      "url":
"http://transformers-benchmarks.huggingface.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" + "url": "http://transformers-benchmarks.hf.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" } ], "liveNow": true, @@ -77,7 +77,7 @@ "properties": [ { "id": "custom.width", - "value": 196 + "value": 202 } ] }, @@ -101,7 +101,7 @@ "properties": [ { "id": "custom.width", - "value": 581 + "value": 524 } ] }, @@ -113,7 +113,19 @@ "properties": [ { "id": "custom.width", - "value": 379 + "value": 353 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "model_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 216 } ] } @@ -143,12 +155,14 @@ "targets": [ { "datasource": { - "type": "grafana-postgresql-datasource" + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT commit_id, commit_message, metadata->>'gpu_name' as gpu_name, metadata->>'model_id' as model_id, created_at AS date FROM benchmarks WHERE branch = '${branch}' AND metadata->>'gpu_name' = '${gpu_name}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -306,13 +320,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -431,13 +446,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -565,13 +581,14 @@ "targets": [ { "datasource": { + "default": true, "type": 
"grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -686,13 +703,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -807,13 +825,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -928,13 +947,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, 
left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1062,13 +1082,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1183,13 +1204,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1304,13 +1326,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1425,13 +1448,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT 
CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1480,11 +1504,7 @@ "id": 15, "panels": [ { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1528,8 +1548,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1563,8 +1582,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -1665,11 +1685,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1713,8 +1729,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1748,8 +1763,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -1850,11 +1866,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1898,8 +1910,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1933,8 +1944,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -2035,11 +2047,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -2083,8 +2091,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2118,8 +2125,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -2224,7 +2232,6 @@ "type": "row" } ], - "refresh": "", "schemaVersion": 39, "tags": [], "templating": { @@ -2236,6 +2243,7 @@ "value": "main" }, "datasource": { + "default": true, "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2248,7 +2256,7 @@ "name": "branch", "options": [], "query": "SELECT DISTINCT branch FROM benchmarks;", - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2261,6 +2269,7 @@ "value": "1729701492845" }, "datasource": { + "default": true, "type": 
"grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2281,10 +2290,11 @@ { "current": { "selected": false, - "text": "1730120430069", - "value": "1730120430069" + "text": "1730393397577", + "value": "1730393397577" }, "datasource": { + "default": true, "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2312,15 +2322,16 @@ "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, - "definition": "SELECT DISTINCT gpu_name FROM benchmarks;", + "definition": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;", + "description": "", "hide": 0, "includeAll": false, "label": "GPU", "multi": false, "name": "gpu_name", "options": [], - "query": "SELECT DISTINCT gpu_name FROM benchmarks;", - "refresh": 2, + "query": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;", + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2328,7 +2339,7 @@ }, { "current": { - "selected": false, + "selected": true, "text": "10", "value": "10" }, @@ -2359,6 +2370,6 @@ "timezone": "browser", "title": "Transformers benchmarks", "uid": "fdz33iyzln9c0a", - "version": 4, + "version": 10, "weekStart": "" } diff --git a/benchmark/grafana_datasource.yaml b/benchmark/grafana_datasource.yaml new file mode 100644 index 000000000000..25f36254104a --- /dev/null +++ b/benchmark/grafana_datasource.yaml @@ -0,0 +1,17 @@ +apiVersion: 1 +datasources: + - name: grafana-postgresql-datasource + uid: be28nkzirtb0gd + type: postgres + url: $GRAFANA_POSTGRES_DATASOURCE_URL + user: $GRAFANA_POSTGRES_DATASOURCE_USER + secureJsonData: + password: $GRAFANA_POSTGRES_DATASOURCE_PWD + jsonData: + database: metrics + maxOpenConns: 100 + maxIdleConns: 100 + maxIdleConnsAuto: true + connMaxLifetime: 14400 + postgresVersion: 1000 + timescaledb: false diff --git a/benchmark/init_db.sql b/benchmark/init_db.sql index 573cc11518e8..a7864c4af183 100644 --- a/benchmark/init_db.sql +++ b/benchmark/init_db.sql @@ -3,7 +3,7 @@ CREATE TABLE IF NOT EXISTS benchmarks ( branch VARCHAR(255), commit_id VARCHAR(72), commit_message VARCHAR(70), - gpu_name VARCHAR(255), + metadata jsonb, created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') ); diff --git a/benchmark/llama.py b/benchmark/llama.py index 4a2c57422e6f..bbe1afefd5ef 100644 --- a/benchmark/llama.py +++ b/benchmark/llama.py @@ -1,71 +1,25 @@ -import argparse -import json -import logging +from logging import Logger import os -import sys -from statistics import mean from threading import Event, Thread from time import perf_counter, sleep from typing import Optional +from benchmarks_entrypoint import MetricsRecorder import gpustat import psutil import psycopg2 import torch from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache -from psycopg2.extras import Json -from psycopg2.extensions import register_adapter os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -handler = logging.StreamHandler(sys.stdout) -handler.setLevel(logging.INFO) -formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) - os.environ["TOKENIZERS_PARALLELISM"] = "1" torch.set_float32_matmul_precision("high") -register_adapter(dict, Json) - - -def parse_arguments(): - """ - Parse command line arguments for the benchmarking CLI. 
- """ - parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") - - parser.add_argument( - "branch", - type=str, - help="The branch name on which the benchmarking is performed.", - ) - - parser.add_argument( - "commit_id", - type=str, - help="The commit hash on which the benchmarking is performed.", - ) - parser.add_argument( - "commit_msg", - type=str, - help="The commit message associated with the commit, truncated to 70 characters.", - ) - args = parser.parse_args() - - return args.branch, args.commit_id, args.commit_msg - - -def collect_metrics(benchmark_id, continue_metric_collection): +def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder): p = psutil.Process(os.getpid()) - conn = psycopg2.connect("dbname=metrics") - cur = conn.cursor() while not continue_metric_collection.is_set(): with p.oneshot(): cpu_util = p.cpu_percent() @@ -73,47 +27,41 @@ def collect_metrics(benchmark_id, continue_metric_collection): gpu_stats = gpustat.GPUStatCollection.new_query() gpu_util = gpu_stats[0]["utilization.gpu"] gpu_mem_megabytes = gpu_stats[0]["memory.used"] - cur.execute( - "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)", - (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes), + metrics_recorder.collect_device_measurements( + benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes ) sleep(0.01) - conn.commit() - conn.close() -def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): continue_metric_collection = Event() metrics_thread = None + model_id = "meta-llama/Llama-2-7b-hf" + metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg) try: gpu_stats = gpustat.GPUStatCollection.new_query() gpu_name = gpu_stats[0]["name"] - conn = psycopg2.connect("dbname=metrics") - cur = conn.cursor() - cur.execute( - "INSERT INTO benchmarks (branch, commit_id, commit_message, gpu_name) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", - (branch, commit_id, commit_msg, gpu_name), + benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id}) + logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}") + metrics_thread = Thread( + target=collect_metrics, + args=[benchmark_id, continue_metric_collection, metrics_recorder], ) - conn.commit() - benchmark_id = cur.fetchone()[0] - logger.info(f"running benchmark #{benchmark_id} on {gpu_name}") - metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection]) metrics_thread.start() logger.info("started background thread to fetch device metrics") os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling device = "cuda" - ckpt = "meta-llama/Llama-2-7b-hf" logger.info("downloading weights") # This is to avoid counting download in model load time measurement - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16) gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) logger.info("loading model") start = perf_counter() model = AutoModelForCausalLM.from_pretrained( - ckpt, torch_dtype=torch.float16, generation_config=gen_config + model_id, torch_dtype=torch.float16, 
generation_config=gen_config ).eval() model.to(device) torch.cuda.synchronize() @@ -121,7 +69,7 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge model_load_time = end - start logger.info(f"loaded model in: {model_load_time}s") - tokenizer = AutoTokenizer.from_pretrained(ckpt) + tokenizer = AutoTokenizer.from_pretrained(model_id) prompt = "Why dogs are so cute?" inputs = tokenizer(prompt, return_tensors="pt").to(device) @@ -368,41 +316,27 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s") logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") - cur.execute( - """ - INSERT INTO model_measurements ( - benchmark_id, - measurements - ) VALUES (%s, %s) - """, - ( - benchmark_id, - { - "model_load_time": model_load_time, - "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, - "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, - "first_eager_generate_time_secs": first_eager_generate_time, - "second_eager_generate_time_secs": second_eager_generate_time, - "time_to_first_token_secs": time_to_first_token, - "time_to_second_token_secs": time_to_second_token, - "time_to_third_token_secs": time_to_third_token, - "time_to_next_token_mean_secs": mean_time_to_next_token, - "first_compile_generate_time_secs": first_compile_generate_time, - "second_compile_generate_time_secs": second_compile_generate_time, - "third_compile_generate_time_secs": third_compile_generate_time, - "fourth_compile_generate_time_secs": fourth_compile_generate_time, - }, - ), + metrics_recorder.collect_model_measurements( + benchmark_id, + { + "model_load_time": model_load_time, + "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, + "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, + "first_eager_generate_time_secs": first_eager_generate_time, + "second_eager_generate_time_secs": second_eager_generate_time, + "time_to_first_token_secs": time_to_first_token, + "time_to_second_token_secs": time_to_second_token, + "time_to_third_token_secs": time_to_third_token, + "time_to_next_token_mean_secs": mean_time_to_next_token, + "first_compile_generate_time_secs": first_compile_generate_time, + "second_compile_generate_time_secs": second_compile_generate_time, + "third_compile_generate_time_secs": third_compile_generate_time, + "fourth_compile_generate_time_secs": fourth_compile_generate_time, + }, ) - conn.commit() - conn.close() except Exception as e: logger.error(f"Caught exception: {e}") continue_metric_collection.set() if metrics_thread is not None: metrics_thread.join() - - -if __name__ == "__main__": - branch, commit_id, commit_msg = parse_arguments() - run_benchmark(branch, commit_id, commit_msg, num_tokens_to_generate=20) + metrics_recorder.close() diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index b597f5a73fb5..b40ba3f35ff8 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -65,6 +65,9 @@ RUN python3 -m pip install --no-cache-dir python-Levenshtein # For `FastSpeech2ConformerTokenizer` tokenizer RUN python3 -m pip install --no-cache-dir g2p-en +# For Some bitsandbytes tests +RUN python3 -m pip install --no-cache-dir einops + # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. 
RUN cd transformers && python3 setup.py develop
diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile
index da91906d6214..88763ca07f42 100644
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@@ -1,5 +1,4 @@
-FROM rocm/dev-ubuntu-22.04:6.0.2
-# rocm/pytorch has no version with 2.1.0
+FROM rocm/dev-ubuntu-22.04:6.3
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive
@@ -9,9 +8,11 @@ RUN apt update && \
    apt clean && \
    rm -rf /var/lib/apt/lists/*

+RUN export PATH="${PATH:+${PATH}:}~/opt/rocm/bin"
+
RUN python3 -m pip install --no-cache-dir --upgrade pip numpy

-RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0
+RUN python3 -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/

RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"

@@ -30,5 +31,5 @@ RUN python3 -m pip uninstall -y tensorflow flax
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

-# Remove nvml as it is not compatible with ROCm. apex is not tested on NVIDIA either.
-RUN python3 -m pip uninstall py3nvml pynvml apex -y
+# Remove nvml and nvidia-ml-py as they are not compatible with ROCm. apex is not tested on NVIDIA either.
+RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
index d31e1cae5534..b67793dc9592 100644
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -1,11 +1,11 @@
-FROM rocm/dev-ubuntu-22.04:5.6
+FROM rocm/dev-ubuntu-22.04:6.3
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive
-ARG PYTORCH='2.1.1'
-ARG TORCH_VISION='0.16.1'
-ARG TORCH_AUDIO='2.1.1'
-ARG ROCM='5.6'
+ARG PYTORCH='2.5.1'
+ARG TORCH_VISION='0.20.0'
+ARG TORCH_AUDIO='2.5.0'
+ARG ROCM='6.3'

RUN apt update && \
    apt install -y --no-install-recommends \
diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile
index 53e66662f9ee..44d1ceb2bfdd 100755
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
-ARG PYTORCH='2.4.1'
+ARG PYTORCH='2.5.1'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu118'

@@ -36,15 +36,23 @@ RUN python3 -m pip install --no-cache-dir einops
# Add bitsandbytes for mixed int8 testing
RUN python3 -m pip install --no-cache-dir bitsandbytes

-# Add auto-gptq for gtpq quantization testing
-RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
+# Add auto-gptq for gptq quantization testing, installed from source for pytorch==2.5.1 compatibility
+# TORCH_CUDA_ARCH_LIST="7.5+PTX" is added to make the package compile for Tesla T4 gpus available for the CI.
+RUN pip install gekko +RUN git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ && TORCH_CUDA_ARCH_LIST="7.5+PTX" python3 setup.py install # Add optimum for gptq quantization testing RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum +# Add PEFT +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft + # Add aqlm for quantization testing RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 +# Add vptq for quantization testing +RUN python3 -m pip install --no-cache-dir vptq + # Add hqq for quantization testing RUN python3 -m pip install --no-cache-dir hqq @@ -52,8 +60,8 @@ RUN python3 -m pip install --no-cache-dir hqq RUN python3 -m pip install --no-cache-dir gguf # Add autoawq for quantization testing -# >=v0.2.3 needed for compatibility with torch 2.2.1 -RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp310-cp310-linux_x86_64.whl +# >=v0.2.7 needed for compatibility with transformers > 4.46 +RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl # Add quanto for quantization testing RUN python3 -m pip install --no-cache-dir optimum-quanto @@ -61,6 +69,10 @@ RUN python3 -m pip install --no-cache-dir optimum-quanto # Add eetq for quantization testing RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git +# Add flute-kernel and fast_hadamard_transform for quantization testing +RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118 +RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1 + # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. 
RUN cd transformers && python3 setup.py develop diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index d9523eaf5da5..30e247eb54e1 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -30,26 +30,26 @@ - local: conversations title: الدردشة مع المحولات title: البرامج التعليمية -# - sections: -# - isExpanded: false -# sections: -# - local: tasks/sequence_classification -# title: تصنيف النصوص -# - local: tasks/token_classification -# title: تصنيف الرموز -# - local: tasks/question_answering -# title: الإجابة على الأسئلة -# - local: tasks/language_modeling -# title: نمذجة اللغة السببية -# - local: tasks/masked_language_modeling -# title: نمذجة اللغة المقنعة -# - local: tasks/translation -# title: الترجمة -# - local: tasks/summarization -# title: التلخيص -# - local: tasks/multiple_choice -# title: الاختيار المتعدد -# title: معالجة اللغات الطبيعية +- sections: + - isExpanded: false + sections: + - local: tasks/sequence_classification + title: تصنيف النصوص + - local: tasks/token_classification + title: تصنيف الرموز + - local: tasks/question_answering + title: الإجابة على الأسئلة + - local: tasks/language_modeling + title: نمذجة اللغة السببية + - local: tasks/masked_language_modeling + title: نمذجة اللغة المقنعة + - local: tasks/translation + title: الترجمة + - local: tasks/summarization + title: التلخيص + - local: tasks/multiple_choice + title: الاختيار المتعدد + title: معالجة اللغات الطبيعية # - isExpanded: false # sections: # - local: tasks/audio_classification @@ -107,10 +107,10 @@ # - local: tasks/prompting # title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة # title: الإرشاد -# title: أدلة المهام + title: أدلة المهام - sections: - local: fast_tokenizers - title: استخدم مجزئيات النصوص السريعة من 🤗 Tokenizers + title: استخدم مجزئيات النصوص السريعة من 🤗 Tokenizers - local: multilingual title: الاستدلال باستخدام نماذج متعددة اللغات - local: create_a_model @@ -129,16 +129,20 @@ title: التصدير إلى TFLite - local: torchscript title: التصدير إلى TorchScript -# - local: benchmarks -# title: المعايير -# - local: notebooks -# title: دفاتر الملاحظات مع الأمثلة -# - local: community -# title: موارد المجتمع + - local: notebooks + title: دفاتر الملاحظات مع الأمثلة + - local: community + title: موارد المجتمع - local: troubleshooting title: استكشاف الأخطاء وإصلاحها - local: gguf title: التوافق مع ملفات GGUF + - local: tiktoken + title: التوافق مع ملفات TikToken + - local: modular_transformers + title: الوحدات النمطية في `transformers` + - local: how_to_hack_models + title: اختراق النموذج (الكتابة فوق فئة لاستخدامك) title: أدلة المطورين # - sections: # - local: quantization/overview @@ -151,6 +155,8 @@ # title: AWQ # - local: quantization/aqlm # title: AQLM +# - local: quantization/vptq +# title: VPTQ # - local: quantization/quanto # title: Quanto # - local: quantization/eetq @@ -875,7 +881,7 @@ # - local: internal/pipelines_utils # title: مرافق خطوط الأنابيب # - local: internal/tokenization_utils -# title: مرافق مقسم النصوص +# title: مرافق مقسم النصوص # - local: internal/trainer_utils # title: مرافق المدرب # - local: internal/generation_utils diff --git a/docs/source/ar/community.md b/docs/source/ar/community.md new file mode 100644 index 000000000000..5a1c31de0aaa --- /dev/null +++ b/docs/source/ar/community.md @@ -0,0 +1,66 @@ +# مجتمع المطورين + +هذه الصفحة تجمع الموارد حول 🤗 Transformers التي طورها المجتمع. 
+ +## موارد المجتمع: + +| المصدر | الوصف | المؤلف | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | مجموعة من البطاقات التعليمية القائمة على [Transformers Docs Glossary](glossary) والتي تم وضعها في شكل يمكن تعلمه/مراجعته بسهولة باستخدام [Anki](https://apps.ankiweb.net/) وهو تطبيق مفتوح المصدر متعدد المنصات مصمم خصيصًا للاحتفاظ بالمعرفة على المدى الطويل. شاهد هذا [فيديو تمهيدي حول كيفية استخدام البطاقات التعليمية](https://www.youtube.com/watch?v=Dji_7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## دفاتر ملاحظات المجتمع: + +| الدفتر | الوصف | المؤلف | | +|:----------|:-------------|:-------------|------:| +| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | كيفية توليد كلمات الأغاني على غرار فنانك المفضل من خلال ضبط نموذج GPT-2 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Train T5 in Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | كيفية تدريب T5 لأي مهمة باستخدام Tensorflow 2. يوضح هذا الدفتر مهمة السؤال والجواب المنفذة في Tensorflow 2 باستخدام SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | كيفية تدريب T5 على SQUAD مع Transformers و Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | كيفية ضبط نموذج T5 للتصنيف والمهام متعددة الخيارات باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | كيفية ضبط نموذج DialoGPT على مجموعة بيانات جديدة لروبوتات الدردشة المحادثية المفتوحة | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | كيفية التدريب على تسلسلات طويلة تصل إلى 500,000 رمز باستخدام Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tune BART for 
Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | كيفية ضبط نموذج BART للتلخيص باستخدام fastai باستخدام blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | كيفية توليد تغريدات على غرار حساب Twitter المفضل لديك من خلال ضبط نموذج GPT-2 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | دليل كامل لعرض تكامل W&B مع Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | كيفية بناء نسخة "طويلة" من النماذج المسبقة التدريب الموجودة | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | كيفية ضبط نموذج Longformer لمهمة QA | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | كيفية تقييم نموذج Longformer على TriviaQA مع `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | كيفية ضبط نموذج T5 لاستخراج المشاعر باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | كيفية ضبط نموذج DistilBert للتصنيف متعدد الفئات باستخدام PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|كيفية ضبط نموذج BERT للتصنيف متعدد التصنيفات باستخدام PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|كيفية ضبط نموذج T5 للتلخيص في PyTorch وتتبع التجارب باستخدام WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|كيفية تسريع الضبط الدقيق بعامل 2 باستخدام الضبط الديناميكي/التقسيم|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| كيفية تدريب نموذج Reformer مع طبقات الانتباه ثنائية الاتجاه | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| كيفية زيادة مفردات نموذج SciBERT المسبق التدريب من AllenAI على مجموعة بيانات CORD وإنشاء خط أنابيب لها. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| كيفية ضبط نموذج BlenderBotSmall للتلخيص على مجموعة بيانات مخصصة، باستخدام واجهة برمجة التطبيقات Trainer. 
| [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | كيفية ضبط نموذج Electra للتحليل العاطفي وتفسير التنبؤات باستخدام Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | كيفية ضبط نموذج GPT-2 غير الإنجليزي باستخدام فئة Trainer | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | كيفية ضبط نموذج DistilBERT لمهمة التصنيف متعدد التصنيفات | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | كيفية ضبط نموذج ALBERT أو أي نموذج آخر قائم على BERT لمهمة التصنيف المزدوج للجمل | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | كيفية ضبط نموذج Roberta للتحليل العاطفي | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | ما مدى دقة الإجابات على الأسئلة التي يولدها نموذجك التحويلي seq2seq؟ | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | كيفية ضبط نموذج DistilBERT للتصنيف النصي في TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Leverage BERT for Encoder-Decoder Summarization on 
CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* مع نقطة تفتيش *google-bert/bert-base-uncased* للتلخيص على CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* المشترك مع نقطة تفتيش *FacebookAI/roberta-base* للتلخيص على BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | كيفية ضبط نموذج *TapasForQuestionAnswering* مع نقطة تفتيش *tapas-base* على مجموعة بيانات Sequential Question Answering (SQA) | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | كيفية تقييم نموذج *TapasForSequenceClassification* المضبوط مسبقًا مع نقطة تفتيش *tapas-base-finetuned-tabfact* باستخدام مزيج من مكتبتي 🤗 datasets و 🤗 transformers | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | كيفية ضبط نموذج mBART باستخدام Seq2SeqTrainer للترجمة من الهندية إلى الإنجليزية | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | كيفية ضبط نموذج *LayoutLMForTokenClassification* على مجموعة بيانات FUNSD لاستخراج المعلومات من المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | كيفية ضبط نموذج DistilGPT2 وتوليد النص | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | كيفية ضبط نموذج LED على pubmed للتلخيص طويل المدى | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | كيفية تقييم نموذج LED للتلخيص طويل المدى بشكل فعال | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | كيفية ضبط نموذج *LayoutLMForSequenceClassification* على مجموعة بيانات RVL-CDIP لتصنيف المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | كيفية فك تشفير تسلسل CTC مع تعديل نموذج اللغة | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_zQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | كيفية ضبط نموذج BART للتلخيص بلغتين باستخدام فئة Trainer | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | كيفية تقييم نموذج BigBird للأسئلة والأجوبة على وثائق طويلة على Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | كيفية إنشاء تعليقات توضيحية على YouTube من أي فيديو من خلال تفريغ الصوت باستخدام Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 
using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | كيفية تقييم نموذج *LukeForEntityClassification* على مجموعة بيانات Open Entity | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | كيفية تقييم نموذج *LukeForEntityPairClassification* على مجموعة بيانات TACRED | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | كيفية تقييم نموذج *LukeForEntitySpanClassification* على مجموعة بيانات CoNLL-2003 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | كيفية تقييم نموذج *BigBirdPegasusForConditionalGeneration* على مجموعة بيانات PubMed | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | كيفية استخدام نموذج Wav2Vec2 المسبق التدريب لتصنيف المشاعر على مجموعة بيانات MEGA | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | كيفية استخدام نموذج *DetrForObjectDetection* المدرب للكشف عن الأجسام في صورة وتصوير الانتباه | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | كيفية ضبط نموذج *DetrForObjectDetection* على مجموعة بيانات الكشف عن الأجسام المخصصة | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | كيفية ضبط نموذج *T5* على مهمة التعرف على الكيانات المسماة | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | +| [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | كيفية استخدام [QLoRA](https://github.com/artidoro/qlora) و [PEFT](https://huggingface.co/docs/peft/en/index) لضبط نموذج LLM بطريقة فعالة من حيث الذاكرة، مع استخدام [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html) لإدارة تتبع التجارب | [Yuki Watanabe](https://github.com/B-Step62) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | diff --git a/docs/source/ar/how_to_hack_models.md b/docs/source/ar/how_to_hack_models.md new file mode 100644 index 000000000000..8ce3589732f0 --- /dev/null +++ b/docs/source/ar/how_to_hack_models.md @@ -0,0 +1,163 @@ +# كيفية تعديل أي نموذج من نماذج Transformers + +توفر مكتبة [🤗 Transformers](https://github.com/huggingface/transformers) مجموعة من النماذج المسبقة التدريب والأدوات لمعالجة اللغات الطبيعية، والرؤية، وما إلى ذلك. على الرغم من أن هذه النماذج تغطي مجموعة واسعة من التطبيقات، فقد تواجه حالات استخدام لا تدعمها المكتبة بشكل افتراضي. يُمكن للتخصيص أن يفتح إمكانيات جديدة، مثل إضافة طبقات جديدة، أو تعديل البنية المعمارية، أو تحسين آليات الانتباه. سيُوضح لك هذا الدليل كيفية تعديل نماذج Transformers الموجودة لتلبية احتياجاتك المحددة. الشيء الرائع هو أنك لست بحاجة إلى الخروج من إطار عمل Transformers لإجراء هذه التغييرات. 
يمكنك تعديل النماذج مباشرةً في Transformers والاستفادة من الميزات مثل [واجهة برمجة التطبيقات Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer)، و [PreTrainedModel](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel)، والضبط الدقيق الفعال باستخدام أدوات مثل [PEFT](https://huggingface.co/docs/peft/index). + +سنرشدك في هذا الدليل لكيفية تخصيص نماذج Transformers الموجودة لتلبية متطلباتك، دون فقدان مزايا الإطار. ستتعلم كيفية: + +- تعديل بنية نموذج ما من خلال تغيير آلية الانتباه الخاصة به. +- تطبيق تقنيات مثل Low-Rank Adaptation (LoRA) على مكونات نموذج محددة. + +نحن نشجعك على المساهمة باختراقاتك الخاصة ومشاركتها هنا مع المجتمع! + +## مثال: تعديل آلية الانتباه في نموذج Segment Anything (SAM) + +نموذج **Segment Anything (SAM)** هو نموذج رائد في مجال تجزئة الصور. في تنفيذه الافتراضي، يستخدم SAM إسقاطًا مجمعًا للاستعلام والمفتاح والقيمة (`qkv`) في آلية الانتباه الخاصة به. ومع ذلك، قد ترغب في ضبط مكونات محددة فقط من آلية الانتباه، مثل إسقاطات الاستعلام (`q`) والقيمة (`v`)، لتقليل عدد المعلمات القابلة للتدريب والموارد الحسابية المطلوبة. + +### الدافع + +من خلال تقسيم الإسقاط المجمع `qkv` إلى إسقاطات منفصلة `q` و `k` و `v`، يمكنك تطبيق تقنيات مثل **LoRA** (Low-Rank Adaptation) على إسقاطي `q` و `v` فقط. يسمح لك هذا بما يلي: + +- ضبط عدد أقل من المعلمات، مما يقلل من العبء الحسابي. +- تحقيق أداء أفضل من خلال التركيز على مكونات محددة. +- تجربة استراتيجيات تعديل مختلفة في آلية الانتباه. + +### التنفيذ + +#### **الخطوة 1: إنشاء فئة انتباه مخصصة** + +بعد ذلك، قم بإنشاء فئة فرعية من فئة `SamVisionAttention` الأصلية وعدلها لتضم إسقاطات `q` و `k` و `v` منفصلة. + +```python +import torch +import torch.nn as nn +from transformers.models.sam.modeling_sam import SamVisionAttention + +class SamVisionAttentionSplit(SamVisionAttention, nn.Module): + def __init__(self, config, window_size): + super().__init__(config, window_size) + del self.qkv + # إسقاطات منفصلة q و k و v + self.q = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.k = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.v = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self._register_load_state_dict_pre_hook(self.split_q_k_v_load_hook) + + def split_q_k_v_load_hook(self, state_dict, prefix, *args): + keys_to_delete = [] + for key in list(state_dict.keys()): + if "qkv." 
in key: + # تقسيم q و k و v من الإسقاط المجمع + q, k, v = state_dict[key].chunk(3, dim=0) + # استبدال الإسقاطات الفردية q و k و v + state_dict[key.replace("qkv.", "q.")] = q + state_dict[key.replace("qkv.", "k.")] = k + state_dict[key.replace("qkv.", "v.")] = v + # وضع علامة على مفتاح qkv القديم للحذف + keys_to_delete.append(key) + + # حذف مفاتيح qkv القديمة + for key in keys_to_delete: + del state_dict[key] + + def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor: + batch_size, height, width, _ = hidden_states.shape + qkv_shapes = (batch_size * self.num_attention_heads, height * width, -1) + query = self.q(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + key = self.k(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + value = self.v(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + + attn_weights = (query * self.scale) @ key.transpose(-2, -1) + + if self.use_rel_pos: + attn_weights = self.add_decomposed_rel_pos( + attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) + ) + + attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1) + attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1) + attn_output = self.proj(attn_output) + + if output_attentions: + outputs = (attn_output, attn_weights) + else: + outputs = (attn_output, None) + return outputs +``` + +**الشرح:** + +- **الإسقاطات المنفصلة:** تتم إزالة الإسقاط المُجمع `qkv`، وإنشاء إسقاطات خطية منفصلة `q` و `k` و `v`. +- **دالة استدعاء تحميل الأوزان:** تقوم طريقة `split_q_k_v_load_hook` بتقسيم أوزان `qkv` المسبقة التدريب إلى أوزان `q` و `k` و `v` منفصلة عند تحميل النموذج. يضمن هذا التوافق مع أي نموذج مسبق التدريب. +- **التنفيذ الأمامي:** يتم حساب الاستعلامات والمفاتيح والقيم بشكل منفصل، وتستمر آلية الانتباه كالمعتاد. + +#### **الخطوة 2: استبدال فئة الانتباه الأصلية** + +استبدل فئة `SamVisionAttention` الأصلية بفئتك المخصصة بحيث يستخدم النموذج آلية الانتباه المعدلة. + +```python +from transformers import SamModel +from transformers.models.sam import modeling_sam + +# استبدال فئة الانتباه في وحدة modeling_sam +modeling_sam.SamVisionAttention = SamVisionAttentionSplit + +# تحميل نموذج SAM المسبق التدريب +model = SamModel.from_pretrained("facebook/sam-vit-base") +``` + +**الشرح:** + +- **استبدال الفئة:** من خلال تعيين فئتك المخصصة إلى `modeling_sam.SamVisionAttention`، فإن أي حالات من فئة `SamVisionAttention` في النموذج ستستخدم النسخة المعدلة. وبالتالي، عند استدعاء `SamModel`، سيتم استخدام `SamVisionAttentionSplit` المحددة حديثًا. +- **تحميل النموذج:** يتم تحميل النموذج باستخدام `from_pretrained`، ويتم دمج آلية الانتباه المخصصة. + +#### **الخطوة 3: تطبيق LoRA على إسقاطات محددة** + +مع وجود إسقاطات `q` و `k` و `v` منفصلة، يمكنك الآن تطبيق LoRA على مكونات محددة، مثل إسقاطات `q` و `v`. 
+ +```python +from peft import LoraConfig, get_peft_model + +config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q", "v"], # تطبيق LoRA على إسقاطات q و v + lora_dropout=0.1, + task_type="mask-generation" +) + +# تطبيق LoRA على النموذج +model = get_peft_model(model, config) +``` + +**الشرح:** + +- **تكوين LoRA:** تحدد `LoraConfig` المرتبة `r`، وعامل القياس `lora_alpha`، والوحدات المستهدفة (`"q"` و `"v"`)، ومعدل التخلي (dropout)، ونوع المهمة. +- **تطبيق LoRA:** تقوم دالة `get_peft_model` بتطبيق LoRA على الوحدات المحددة في النموذج. +- **تقليل المعلمات:** من خلال التركيز على `q` و `v`، فإنك تقلل عدد المعلمات القابلة للتدريب، مما يؤدي إلى تسريع التدريب وتقليل استخدام الذاكرة. + +#### **الخطوة 4: التحقق من عدد المعلمات القابلة للتدريب** + +من السهل التحقق من عدد المعلمات القابلة للتدريب ومعرفة تأثير تعديلك. + +```python +model.print_trainable_parameters() +``` + +**الناتج المتوقع:** + +``` +trainable params: 608,256 || all params: 94,343,728 || trainable%: 0.6447 +trainable params: 912,384 || all params: 94,647,856 || trainable%: 0.9640 # مع k +``` + +## المساهمة بإبداعاتك الخاصة + +يمكن لتعديل النماذج المسبقة التدريب أن يفتح آفاقًا جديدة للبحث والتطبيق. من خلال فهم وتعديل الآليات الداخلية للنماذج مثل SAM، يمكنك تخصيصها لتلبية احتياجاتك المحددة، وتحسين الأداء، وتجربة أفكار جديدة. + +إذا قمت بتطوير تعديلاتك الخاصة لنماذج Transformers وترغب في مشاركتها، ففكر في المساهمة في هذه الوثيقة. + +- **إنشاء طلب سحب (Pull Request):** شارك تغييراتك وتحسيناتك في التعليمات البرمجية مباشرة في المستودع. +- **كتابة التوثيق:** قدم تفسيرات وأمثلة واضحة لتعديلاتك. +- **التفاعل مع المجتمع:** ناقش أفكارك واحصل على تعليقات من المطورين والباحثين الآخرين من خلال فتح مشكلة. diff --git a/docs/source/ar/installation.md b/docs/source/ar/installation.md index ac5962ec8589..d3bd4c655b60 100644 --- a/docs/source/ar/installation.md +++ b/docs/source/ar/installation.md @@ -144,7 +144,7 @@ conda install conda-forge::transformers تُحمّل النماذج المُسبقة التدريب وتُخزّن مؤقتًا في: `~/.cache/huggingface/hub`. هذا هو المجلد الافتراضي الذي يُحدده متغير البيئة `TRANSFORMERS_CACHE`. على Windows، يكون دليل ذاكرة التخزين المؤقت الافتراضي هو `C:\Users\username\.cache\huggingface\hub`. يمكنك تغيير متغيرات البيئة shell الموضحة أدناه - حسب الأولوية - لتحديد دليل ذاكرة تخزين مؤقت مختلف: -1. متغير البيئة (افتراضي): `HUGGINGFACE_HUB_CACHE` أو `TRANSFORMERS_CACHE`. +1. متغير البيئة (افتراضي): `HF_HUB_CACHE` أو `TRANSFORMERS_CACHE`. 2. متغير البيئة: `HF_HOME`. 3. متغير البيئة: `XDG_CACHE_HOME` + `/huggingface`. diff --git a/docs/source/ar/modular_transformers.md b/docs/source/ar/modular_transformers.md new file mode 100644 index 000000000000..b500fec1c92d --- /dev/null +++ b/docs/source/ar/modular_transformers.md @@ -0,0 +1,184 @@ +# المحولات النمطية + +مكتبة `transformers` هي إطار عمل ذو فلسفة محددة؛ يتم تعريف فلسفتنا في [الدليل المفاهيمي](./philosophy). + +جوهر هذه الفلسفة يتمثل في مبدأ [نموذج واحد، ملف واحد](https://huggingface.co/blog/transformers-design-philosophy) +في المكتبة. الجانب السلبي لهذا المبدأ هو تقييده لوراثة واستيراد مكونات الملفات. + +نتيجة لذلك، تتكرر مكونات النموذج عبر العديد من الملفات. يحتوي `transformers` على عدد كبير من طبقات الانتباه، يقارب عدد النماذج، والكثير منها متطابق. يتسبب هذا في تباعد عمليات التنفيذ المستقلة مع تطبيق الإصلاحات والتغييرات. +على أجزاء محددة من التعليمات البرمجية. + +ولمعالجة ذلك، اعتمدنا مفهوم "النسخ" في المكتبة. فبإضافة تعليق يُشير إلى أن التعليمات البرمجية هي نسخة من أخرى، نضمن من خلال أنظمة CI والأوامر المحلية عدم تباعد النسخ. 
لكن هذه العملية، رغم بساطتها، تُسبب إرهاقاً. كما أنها تزيد العبء على المساهمين، وهو ما نهدف إلى تجاوزه. + +غالباً ما تتطلب مساهمات النماذج إضافة تعليمات برمجية (حوالي 1000 سطر)، ومعالج (حوالي 500 سطر)، واختبارات، ووثائق، إلخ. ونادراً ما تقل مساهمات النماذج عن 3000-5000 سطر من التعليمات البرمجية، معظمها أكواد نمطية. هذا يرفع مستوى المساهمات، + +ونهدف مع المحولات النمطية إلى خفض هذا المستوى إلى حدّ مقبول. + +## ما هو؟ + +تقدم المحولات النمطية مفهوم ملف "نمطي" لمجلد نموذج. يقبل هذا الملف النمطي تعليمات برمجية +غير مقبولة عادة في ملفات النمذجة/المعالجة، حيث يسمح بالاستيراد من نماذج مجاورة وكذلك +الوراثة من الفئات إلى فئات أخرى. + +يعرّف هذا الملف النمطي النماذج والمعالجات وفئة التكوين التي سيتم تعريفها في وحداتها +المقابلة. + +وأخيرًا، تقدم هذه الميزة أداة `linter` جديدة ستعمل على "تفكيك" الملف النمطي إلى بنية "نموذج واحد، ملف واحد" داخل +هيكل الدليل. سيتم إنشاء هذه الملفات تلقائيًا في كل مرة يتم فيها تشغيل البرنامج النصي؛ مما يقلل من المساهمات المطلوبة +إلى الملف النمطي، وبالتالي فقط إلى التغييرات بين النموذج المساهم والنماذج الأخرى. + +سيقوم مستخدمو النموذج في النهاية باستيراد واستخدام واجهة الملف الواحد، لذا لا يتوقع حدوث أي تغيير هنا. من خلال القيام بذلك، +نأمل في الجمع بين أفضل ما في العالمين: تمكين المساهمات البسيطة مع الالتزام بفلسفتنا. + +لذلك، هذا بديل لعلامات `# Copied from`، ويمكن توقع انتقال النماذج المساهمة سابقًا إلى +تنسيق المحولات النمطية الجديد في الأشهر المقبلة. + +### التفاصيل + +تُبسط أداة "linter" الوراثة، مُنشئةً جميع الملفات المفردة من الملف النمطي، مع الحفاظ على شفافيتها أمام مستخدمي Python. حاليًا، تُبسط الأداة مستوىً واحدًا من الوراثة. + +على سبيل المثال: +- إذا ورثت فئة التكوين من فئة أخرى وأضافت/حذفت معاملًا، فسيتم إما إدراجه مباشرةً في الملف المولد + (في حالة الإضافة) أو إزالته تمامًا (في حالة الحذف). +- إذا ورثت فئة من فئة أخرى، على سبيل المثال: `class GemmaModel(LlamaModel):`، تُستنتج التبعيات تلقائيًا، إذ + سيتم استنتاج جميع الوحدات الفرعية تلقائيًا من الفئة الأصلية. +- إذا قمت بتعريف وظائف جديدة في الملف `modular` واستخدمتها داخل الفئات، فستستنتج أداة linter ذلك تلقائيًا. + +يجب أن تكون قادرًا على كتابة كل شيء (المجزىء اللغوي، ومُعالِج الصور، والنموذج، والتكوين) في الملف `modular`، وسيتم إنشاء الملفات المُقابلة تلقائيًا. + +### التطبيق + +[TODO] نقدم اختبارًا جديدًا، للتأكد من أن المحتوى المولد يتطابق مع ما هو موجود في `modular_xxxx.py` + +### الأمثلة + +هنا مثال سريع باستخدام BERT و RoBERTa. النموذجان مرتبطان ارتباطًا وثيقًا: يختلف تنفيذهما النموذجي في طبقة التضمين فقط. + +بدلاً من إعادة تعريف النموذج بالكامل، إليك كيف يبدو ملف `modular_roberta.py` لفئات النمذجة والتكوين (لأغراض المثال، يتم تجاهل المجزىء اللغوي في هذا الوقت حيث أنه مختلف جدًا). + +```python +from torch import nn +from ..bert.configuration_bert import BertConfig +from ..bert.modeling_bert import ( + BertModel, + BertEmbeddings, + BertForMaskedLM +) + +# تكوين RoBERTa مطابق لتكوين BERT +class RobertaConfig(BertConfig): + model_type = 'roberta' + +# نعيد تعريف الإضافات هنا لتسليط الضوء على اختلاف معرف الحشو، ونعيد تعريف الإضافات الموضعية +class RobertaEmbeddings(BertEmbeddings): + def __init__(self, config): + super().__init__(config) + + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + +# نموذج RoBERTa مطابق لنموذج BERT، باستثناء طبقة الإضافات. 
+# نعيد تعريف الإضافات أعلاه، لذا هنا لا توجد حاجة لعمل إضافي +class RobertaModel(BertModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = RobertaEmbeddings(config) + + +# الرؤوس الآن تحتاج فقط إلى إعادة تعريف النموذج داخل `RobertaModel` الصحيح +class RobertaForMaskedLM(BertForMaskedLM): + def __init__(self, config): + super().__init__(config) + self.model = RobertaModel(config) +``` + +لاحظ أنه إذا لم تستخدم الاعتماد الذي حددته، فستحصل على الخطأ التالي: + +```bash +ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used + when you define `BertModel`, as it is one of it's direct dependencies. Make sure + you use it in the `__init__` function. +``` + +بالإضافة إلى ذلك، قد تجد قائمة بالأمثلة هنا: + +## ما هو ليس كذلك + +ليس بديلاً لتعليمات برمجة النمذجة (بعد؟)، وإذا لم يكن نموذجك يعتمد على أي شيء آخر موجود من قبل، فيمكنك إضافة ملف `نمذجة` كالعادة. + + +## الاستخدام المتقدم + +### إزالة السمات والوظائف +لإزالة السمات التي لا تستخدم في نموذجك النمطي، والتي لا تريد رؤيتها في النمذجة المفككة: + +```python +class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel): + def __init__(self, config): | def __init__(self, config): + super().__init__(self, eos_token) | super().__init__(config) + del self.embed_tokens | self.padding_idx = config.pad_token_id + | self.vocab_size = config.vocab_size + | + | self.layers = nn.ModuleList( + | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + | ) + | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + | self.rotary_emb = LlamaRotaryEmbedding(config=config) + | self.gradient_checkpointing = False + | + | # Initialize weights and apply final processing + | self.post_init() +``` +إذا قمت بالتحقق من `LlamaModel` الأصلي، فستجد `embed_tokens` الذي تمت إزالته هنا (كما هو متوقع!) + +إزالة وظيفة مشابهة، تحتاج فقط إلى كتابتها مع `raise ValueError("")` لمحاكاة السلوك الذي تريده فعليًا عند إزالة وظيفة أصلية في بايثون. + +```python +class GemmaTokenizer(LlamaTokenizer): + ... + + def get_spm_processor(self): + raise AttributeError("Not needed for Gemma") + + def unk_token_length(self): + raise AttributeError("Not needed for Gemma") +``` + +### تعريف وظائف جديدة + +إذا قمت بتعريف وظيفة جديدة في الملف `modular` لاستخدامها داخل فئة، على سبيل المثال + +```python +def my_new_function(*args, **kwargs): + # Do something here + pass + +class GemmaModel(LlamaModel): + def forward(*args, **kwargs): + # Call the function + example = my_new_function(*args, **kwargs) + # continue here +``` + +سيتم نسخ وظيفة `my_new_function` (وبشكل متكرر، أي وظائف أخرى جديدة يتم استدعاؤها في جسمها) تلقائيًا +في الملف الذي يتم استخدامه. + +### استدعاء `super()` +قمنا مؤخرًا بشحن بعض الميزات التي تسمح لك بالانتقال من: +```python +class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module): + def __init__(self, eos_token=""): | def __init__(self): + eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token) + PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token) +``` +هذا مفيد عندما لا تريد تفكيك استدعاء `super()`، وتريد التمييز بين أي استدعاء super init تقوم به! + +### التسمية الخاصة +ندعم الآن أيضًا حالات خاصة مثل +```python +class GemmaVisionModel(CLIPModel): + pass +``` +حيث اسم فئة `GemmaVision` الخاصة بك ليس هو نفسه `Gemma` النمطي. هذا مفيد للغاية للنماذج المركبة. 
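+
+### مثال تجميعي مبسّط
+
+لتجميع الأفكار السابقة، إليك مخططًا افتراضيًا مبسّطًا لملف نمطي يجمع بين وراثة فئة التكوين مع إضافة معامل جديد، وتعريف دالة جديدة تُستخدم داخل النموذج. جميع الأسماء هنا (`MyModel` و `residual_scale` و `scale_hidden_states`) افتراضية لأغراض الشرح فقط وليست جزءًا من المكتبة؛ الهدف هو توضيح الشكل العام للملف النمطي لا تقديم تنفيذ نهائي.
+
+```python
+# ملف افتراضي للتوضيح فقط: modular_my_model.py
+from ..llama.configuration_llama import LlamaConfig
+from ..llama.modeling_llama import LlamaModel
+
+
+# وراثة التكوين مع إضافة معامل جديد؛ سيظهر المعامل مباشرة في ملف التكوين المولد
+class MyModelConfig(LlamaConfig):
+    model_type = "my_model"
+
+    def __init__(self, residual_scale=1.0, **super_kwargs):
+        super().__init__(**super_kwargs)
+        self.residual_scale = residual_scale
+
+
+# دالة جديدة معرفة في الملف النمطي؛ بما أنها مستخدمة داخل الفئة أدناه،
+# ستُنسخ تلقائيًا إلى ملف النمذجة المولد
+def scale_hidden_states(hidden_states, scale):
+    return hidden_states * scale
+
+
+# وراثة النموذج؛ تُستنتج بقية الوحدات الفرعية تلقائيًا من LlamaModel
+class MyModelModel(LlamaModel):
+    def forward(self, *args, **kwargs):
+        outputs = super().forward(*args, **kwargs)
+        outputs.last_hidden_state = scale_hidden_states(
+            outputs.last_hidden_state, self.config.residual_scale
+        )
+        return outputs
+```
+
+بعد تشغيل أداة التوليد على هذا الملف، يُفترض أن تحصل على ملفي `configuration_my_model.py` و `modeling_my_model.py` كاملين يمكن لمستخدمي المكتبة استيرادهما كالمعتاد، تمامًا كما هو موضح في الأمثلة أعلاه.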
diff --git a/docs/source/ar/notebooks.md b/docs/source/ar/notebooks.md new file mode 100644 index 000000000000..0591204d602c --- /dev/null +++ b/docs/source/ar/notebooks.md @@ -0,0 +1,141 @@ +# دفاتر ملاحظات 🤗 Transformers + +يمكنك أن تجد هنا قائمة بدفاتر الملاحظات الرسمية التي تقدمها Hugging Face. + +كما نود أن ندرج هنا محتوى مثيرًا للاهتمام تم إنشاؤه بواسطة المجتمع. +إذا كتبت دفتر ملاحظات يستفيد من 🤗 Transformers وتود إدراجه هنا، فيُرجى فتح طلب سحب حتى يمكن تضمينه ضمن دفاتر ملاحظات المجتمع. + + +## دفاتر ملاحظات Hugging Face 🤗 + +### دفاتر ملاحظات التوثيق + +يمكنك فتح أي صفحة من صفحات التوثيق كدفتر ملاحظات في Colab (يوجد زر مباشرة على تلك الصفحات) ولكنها مدرجة هنا أيضًا إذا كنت بحاجة إليها: + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [جولة سريعة في المكتبة](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb) | عرض لمختلف واجهات برمجة التطبيقات في Transformers |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/en/transformers_doc/quicktour.ipynb)| +| [ملخص المهام](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb) | كيفية تشغيل نماذج مكتبة Transformers مهمة تلو الأخرى |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| +| [معالجة البيانات مسبقًا](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb) | كيفية استخدام محلل لغوي لمعالجة بياناتك مسبقًا |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)| +| [الضبط الدقيق لنموذج مُدرَّب مسبقًا](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb) | كيفية استخدام المدرب لضبط نموذج مُدرَّب مسبقًا بدقة |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| +| [ملخص للمحللات اللغوية](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb) | الاختلافات بين خوارزمية المحلل اللغوي |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| +| [النماذج 
متعددة اللغات](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb) | كيفية استخدام النماذج متعددة اللغات للمكتبة |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| + + +### أمثلة PyTorch + +#### معالجة اللغة الطبيعية[[pytorch-nlp]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [تدريب محللك اللغوي](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb) | كيفية تدريب واستخدام محللك اللغوي الخاص بك |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| +| [تدريب نموذج لغتك](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb) | كيفية البدء بسهولة في استخدام المحولات |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على أي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على النمذجة اللغوية](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة LM سببية أو مقنعة. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الرموز المميزة](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة تصنيف الرموز المميزة (NER، PoS). 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على الإجابة على الأسئلة](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| +| [كيفية ضبط نموذج بدقة على الاختيار من متعدد](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| +| [كيفية ضبط نموذج بدقة على الترجمة](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على XSUM. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| +| [كيفية تدريب نموذج لغة من البداية](https://github.com/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| تسليط الضوء على جميع الخطوات لتدريب نموذج Transformer بشكل فعال على بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| +| [كيفية إنشاء نص](https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| كيفية استخدام أساليب فك التشفير المختلفة لإنشاء اللغة باستخدام المحولات | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| +| [كيفية إنشاء نص (مع قيود)](https://github.com/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| كيفية توجيه إنشاء اللغة باستخدام القيود التي يوفرها المستخدم | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| +| [Reformer](https://github.com/huggingface/blog/blob/main/notebooks/03_reformer.ipynb)| كيف يدفع Reformer حدود النمذجة اللغوية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| + +#### رؤية الكمبيوتر[[pytorch-cv]] + +| دفتر الملاحظات | الوصف | | | +|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------:| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Torchvision)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Torchvision وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb) | [![Open in 
AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Albumentations)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Albumentations وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Kornia)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Kornia وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)| +| [كيفية إجراء الكشف عن الأشياء بدون لقطات مع OWL-ViT](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb) | يوضح كيفية إجراء الكشف عن الأشياء بدون لقطات على الصور باستخدام استعلامات نصية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| +| [كيفية ضبط نموذج وصف الصور بدقة](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) | يوضح كيفية ضبط BLIP بدقة لوصف الصور على مجموعة بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)| +| [كيفية بناء نظام تشابه الصور مع Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_similarity.ipynb) | يوضح كيفية بناء نظام تشابه الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)| +| [كيفية ضبط نموذج SegFormer بدقة على التجزئة الدلالية](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج SegFormer مُدرَّب مسبقًا بدقة على التجزئة الدلالية | [![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)| +| [كيفية ضبط نموذج VideoMAE بدقة على تصنيف الفيديو](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج VideoMAE مُدرَّب مسبقًا بدقة على تصنيف الفيديو | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)| + + +#### الصوت[[pytorch-audio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية ضبط نموذج التعرف على الكلام باللغة الإنجليزية بدقة](https://github.com/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا بدقة على TIMIT | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| +| [كيفية ضبط نموذج التعرف على الكلام بأي لغة بدقة](https://github.com/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا متعدد اللغات بدقة على Common Voice | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصوت](https://github.com/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا بدقة على Keyword Spotting | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| + + +#### التسلسلات البيولوجية[[pytorch-bio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:----------------------------------------------------------------------------------------|:-------------|------:| +| [كيفية ضبط نموذج بروتين مُدرَّب مسبقًا بدقة](https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | شاهد كيفية ترميز البروتينات وضبط نموذج "لغة" بروتين مُدرَّب مسبقًا كبير بدقة | [![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | +| [كيفية إنشاء طيات بروتينية](https://github.com/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | شاهد كيفية الانتقال من تسلسل البروتين إلى نموذج بروتين كامل وملف PDB | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | +| [كيفية ضبط نموذج محول النيوكليوتيدات بدقة](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | شاهد كيفية ترميز الحمض النووي وضبط نموذج "لغة" الحمض النووي مُدرَّب مسبقًا كبير بدقة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | +| [ضبط نموذج محول النيوكليوتيدات بدقة باستخدام LoRA](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | تدريب نماذج DNA أكبر بكثير بطريقة فعالة من حيث الذاكرة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | + + +#### طرائق أخرى[[pytorch-other]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:----------------------------------------------------------------------------------------|:-------------|------:| +| [التنبؤ الاحتمالي بالسلاسل الزمنية](https://github.com/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | شاهد كيفية تدريب Time Series Transformer على مجموعة بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | + +#### دفاتر ملاحظات الأدوات المساعدة [[pytorch-utility]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تصدير النموذج إلى ONNX](https://github.com/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| تسليط الضوء على كيفية التصدير وتشغيل أعباء عمل الاستدلال من خلال ONNX | [![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| +| [كيفية استخدام المعايير](https://github.com/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| كيفية قياس أداء النماذج باستخدام المحولات | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| + +### أمثلة TensorFlow + +#### معالجة اللغة الطبيعية[[tensorflow-nlp]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [تدريب محللك اللغوي](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb) | كيفية تدريب واستخدام محللك اللغوي الخاص بك |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| +| [تدريب نموذج لغتك](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb) | كيفية البدء بسهولة في استخدام المحولات |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على أي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على النمذجة اللغوية](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة LM سببية أو مقنعة. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الرموز المميزة](https://github.com/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة تصنيف الرموز المميزة (NER، PoS). 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الإجابة على الأسئلة](https://github.com/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الاختيار من متعدد](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الترجمة](https://github.com/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص](https://github.com/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على XSUM. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| + +#### رؤية الكمبيوتر[[tensorflow-cv]] + +| دفتر الملاحظات | الوصف | | | +|:---------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------|:-------------|------:| +| [كيفية ضبط نموذج بدقة على تصنيف الصور](https://github.com/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb)| +| [كيفية ضبط نموذج SegFormer بدقة على التجزئة الدلالية](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج SegFormer مُدرَّب مسبقًا بدقة على التجزئة الدلالية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb)| + +#### التسلسلات البيولوجية[[tensorflow-bio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية ضبط نموذج بروتين مُدرَّب مسبقًا بدقة](https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | شاهد كيفية ترميز البروتينات وضبط نموذج "لغة" بروتين مُدرَّب مسبقًا كبير بدقة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | + +#### دفاتر ملاحظات الأدوات المساعدة [[tensorflow-utility]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تدريب نماذج TF/Keras على TPU](https://github.com/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | شاهد كيفية التدريب بسرعة عالية على أجهزة TPU من Google | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | + +### دفاتر ملاحظات Optimum + +🤗 [Optimum](https://github.com/huggingface/optimum) هو امتداد لـ 🤗 Transformers، يوفر مجموعة من أدوات تحسين 
الأداء التي تمكن من تحقيق أقصى قدر من الكفاءة لتدريب وتشغيل النماذج على الأجهزة المستهدفة. + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تكميم نموذج باستخدام ONNX Runtime لتصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| يوضح كيفية تطبيق التكميم الثابت والديناميكي على نموذج باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime) لأي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| +| [كيفية تكميم نموذج باستخدام Intel Neural Compressor لتصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| يوضح كيفية تطبيق التكميم الثابت والديناميكي والتدريبي على نموذج باستخدام [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) لأي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص باستخدام ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج بدقة على أي مهمة GLUE باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص باستخدام ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج بدقة على XSUM باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| + + +## دفاتر ملاحظات المجتمع: + +تتوفر المزيد من دفاتر الملاحظات التي طورها المجتمع [هنا](https://hf.co/docs/transformers/community#community-notebooks). 
+ diff --git a/docs/source/ar/quicktour.md b/docs/source/ar/quicktour.md index 9a99c28287d6..1795c3a5d74f 100644 --- a/docs/source/ar/quicktour.md +++ b/docs/source/ar/quicktour.md @@ -347,8 +347,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ```py >>> from transformers import AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) ``` @@ -356,8 +356,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ```py >>> from transformers import TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) ``` diff --git a/docs/source/ar/tasks/language_modeling.md b/docs/source/ar/tasks/language_modeling.md new file mode 100644 index 000000000000..24f2db00a7a6 --- /dev/null +++ b/docs/source/ar/tasks/language_modeling.md @@ -0,0 +1,422 @@ + + +# نمذجة اللغة السببية (Causal language modeling) + +[[open-in-colab]] + +هناك نوعان من نمذجة اللغة، السببية والمقنعة. يوضح هذا الدليل نمذجة اللغة السببية. +تُستخدم نماذج اللغة السببية غالبًا لتوليد النص. يمكنك استخدام هذه النماذج للتطبيقات الإبداعية مثل +اختيار مغامرة النص الخاصة بك أو مساعد ترميز ذكي مثل Copilot أو CodeParrot. + + + +تتنبأ نمذجة اللغة السببية بالرمز التالي في تسلسل من الرموز، ولا يمكن للنموذج سوى الاهتمام بالرموز على +اليسار. هذا يعني أن النموذج لا يمكنه رؤية الرموز المستقبلية. GPT-2 هو مثال على نموذج اللغة السببية. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط دقيق [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) على مجموعة فرعية [r/askscience](https://www.reddit.com/r/askscience/) من مجموعة بيانات [ELI5](https://huggingface.co/datasets/eli5). +2. استخدام النموذج المدرب الخاص بك للاستنتاج. + + + +لرؤية جميع العمارات ونقاط التحقق المتوافقة مع هذه المهمة، نوصي بالتحقق من [task-page](https://huggingface.co/tasks/text-generation) + + + +قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate +``` + +نحن نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عند المطالبة، أدخل رمزك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات ELI5 + +ابدأ بتحميل أول 5000 مثال من [ELI5-Category](https://huggingface.co/datasets/eli5_category) مجموعة البيانات مع مكتبة 🤗 Datasets. سيعطيك هذا فرصة للتجربة والتأكد من أن كل شيء يعمل قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة. + +```py +>>> from datasets import load_dataset + +>>> eli5 = load_dataset("eli5_category", split="train[:5000]") +``` + +قم بتقسيم مجموعة بيانات `train` إلى مجموعتي تدريب واختبار باستخدام الخاصية [`~datasets.Dataset.train_test_split`]: + +```py +>>> eli5 = eli5.train_test_split(test_size=0.2) +``` + +ثم ألق نظرة على مثال: + +```py +>>> eli5["train"][0] +{'q_id': '7h191n', + 'title': 'What does the tax bill that was passed today mean? 
How will it affect Americans in each tax bracket?', + 'selftext': '', + 'category': 'Economics', + 'subreddit': 'explainlikeimfive', + 'answers': {'a_id': ['dqnds8l', 'dqnd1jl', 'dqng3i1', 'dqnku5x'], + 'text': ["The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.", + 'None yet. It has to be reconciled with a vastly different house bill and then passed again.', + 'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?', + 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'], + 'score': [21, 19, 5, 3], + 'text_urls': [[], + [], + [], + ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']]}, + 'title_urls': ['url'], + 'selftext_urls': ['url']} +``` + +على الرغم من أن هذا قد يبدو معقدًا، إلا أنك مهتم حقًا بحقل `text`. ما هو رائع حول مهام نمذجة اللغة +أنت لا تحتاج إلى تسميات (تُعرف أيضًا باسم المهمة غير الخاضعة للإشراف) لأن الكلمة التالية تعمل كتسمية. + +## معالجة مسبقة (Preprocess) + + + +الخطوة التالية هي تحميل مجزء النص DistilGPT2 لمعالجة حقل `text` الفرعي: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2") +``` + +ستلاحظ من المثال أعلاه، الحقل `text` هو في الواقع متداخل داخل `answers`. هذا يعني أنك ستحتاج إلى +استخراج حقل `text` الفرعي من بنيته المتداخلة باستخدام الدالة [`flatten`](https://huggingface.co/docs/datasets/process#flatten): + +```py +>>> eli5 = eli5.flatten() +>>> eli5["train"][0] +{'q_id': '7h191n', + 'title': 'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?', + 'selftext': '', + 'category': 'Economics', + 'subreddit': 'explainlikeimfive', + 'answers.a_id': ['dqnds8l', 'dqnd1jl', 'dqng3i1', 'dqnku5x'], + 'answers.text': ["The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.", + 'None yet. It has to be reconciled with a vastly different house bill and then passed again.', + 'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?', + 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. 
URL_0'], + 'answers.score': [21, 19, 5, 3], + 'answers.text_urls': [[], + [], + [], + ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']], + 'title_urls': ['url'], + 'selftext_urls': ['url']} +``` + +كل حقل فرعي هو الآن عموداً منفصلاً مسبوقاً بـ `answers`، وحقل `text` هو قائمة الآن. بدلاً من ذلك +من تجزائة نص كل جملة بشكل منفصل، قم بتحويل القائمة إلى سلسلة حتى تتمكن من تجزئة نصها بشكل مجمّع. + +هنا أول دالة معالجة مسبقة لدمج قائمة السلاسل لكل مثال ومجزىء النتيجة: + +```py +>>> def preprocess_function(examples): +... return tokenizer([" ".join(x) for x in examples["answers.text"]]) +``` + +لتطبيق دالة المعالجة المسبقة هذه على مجموعة البيانات بأكملها، استخدم الدالة 🤗 Datasets [`~datasets.Dataset.map`]. يمكنك تسريع هذه العملية `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد، وزيادة عدد العمليات مع `num_proc`. احذف أي أعمدة لا تحتاجها: + +```py +>>> tokenized_eli5 = eli5.map( +... preprocess_function, +... batched=True, +... num_proc=4, +... remove_columns=eli5["train"].column_names, +... ) +``` + +تحتوي هذه المجموعة من البيانات على تسلسلات الرموز، ولكن بعضها أطول من الطول الأقصى للمدخلات للنموذج. + +يمكنك الآن استخدام دالة ما قبل المعالجة ثانية لـ: + +- تجميع كل التسلسلات. +- تقسيم التسلسلات المجمّعة إلى أجزاء أقصر محددة، بحجم `block_size`، والتي يجب أن تكون أقصر من الطول الأقصى للمدخلات ومناسبة لذاكرة GPU. + +```py +>>> block_size = 128 + +>>> def group_texts(examples): +... # ربط جميع النصوص. +... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} +... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... # نتجاهل الباقي الصغير، يمكننا إضافة الحشو إذا كان النموذج يدعمه بدلاً من هذا الإسقاط، يمكنك +... # تخصيص هذا الجزء حسب احتياجاتك. +... if total_length >= block_size: +... total_length = (total_length // block_size) * block_size +... # التقسيم إلى أجزاء بحجم block_size. +... result = { +... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] +... for k, t in concatenated_examples.items() +... } +... result["labels"] = result["input_ids"].copy() +... return result +``` + +طبق دالة `group_texts` على كامل المجموعة من البيانات: + +```py +>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) +``` + +الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorForLanguageModeling`]. من الأفضل أن تقوم بـ *الحشو الديناميكي* للجمل إلى الطول الأطول في الدفعة أثناء التجميع، بدلاً من حشو كامل المجموعة من البيانات إلى الطول الأقصى. + + + +استخدم رمز نهاية التسلسل كرمز للحشو، وحدد `mlm_probability` لحجب الرموز بشكل عشوائي عند كل تكرار للبيانات: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) +``` + + + +استخدم رمز نهاية التسلسل كرمز للحشو، وحدد `mlm_probability` لحجب الرموز بشكل عشوائي عند كل تكرار للبيانات: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") +``` + + + + +## التدريب (Train) + + + + + + +إذا لم تكن على دراية بتدريب نموذج باستخدام [`Trainer`], اطلع على [البرنامج التعليمي الأساسي](../training#train-with-pytorch-trainer)! + + + +أنت جاهز الآن لبدء تدريب نموذجك! 
قم بتحميل DistilGPT2 باستخدام [`AutoModelForCausalLM`]: + +```py +>>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + +>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد أين سيتم حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub بتحديد `push_to_hub=True` (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك). +2. قم بتمرير معاملات التدريب إلى [`Trainer`] إلى جانب النموذج، والمجموعات من البيانات، ومجمّع البيانات. +3. قم باستدعاء [`~Trainer.train`] لتدريب نموذجك. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_eli5_clm-model", +... eval_strategy="epoch", +... learning_rate=2e-5, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... tokenizer=tokenizer, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، استخدم طريقة [`~transformers.Trainer.evaluate`] لتقييم نموذجك والحصول على احتمالية الارتباك: + +```py +>>> import math + +>>> eval_results = trainer.evaluate() +>>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") +Perplexity: 49.61 +``` + +ثم شارك نموذجك على Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن على دراية بتدريب نموذج باستخدام Keras، اطلع على [البرنامج التعليمي الأساسي](../training#train-a-tensorflow-model-with-keras)! + + +لتدريب نموذج في TensorFlow، ابدأ بإعداد دالة المحسن، وجدول معدل التعلم، وبعض معاملات التدريب: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +ثم يمكنك تحميل DistilGPT2 باستخدام [`TFAutoModelForCausalLM`]: + +```py +>>> from transformers import TFAutoModelForCausalLM + +>>> model = TFAutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") +``` + +حول مجموعات بياناتك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... lm_dataset["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... lm_dataset["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers لديها دالة خسارة ذات صلة بالمهمة الافتراضية، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # لا يوجد حجة للخسارة! +``` + +يمكن القيام بذلك عن طريق تحديد مكان دفع نموذجك ومجمّع البيانات في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_eli5_clm-model", +... tokenizer=tokenizer, +... ) +``` + +أخيراً، أنت جاهز لبدء تدريب نموذجك! 
قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة، وعدد العصور، والتعليقات الخاصة بك لتدريب النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تعمقًا حول كيفية تدريب نموذج للنمذجة اللغوية السببية، اطلع على الدفتر المقابل +[دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) +أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). + + + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بتدريب نموذج، يمكنك استخدامه للاستدلال! + +قم بابتكار سؤال تود توليد نص منه: + +```py +>>> prompt = "Somatic hypermutation allows the immune system to" +``` + +أبسط طريقة لتجربة نموذجك المدرب للاستدلال هي استخدامه في [`pipeline`]. قم بتنفيذ `pipeline` لتوليد النص مع نموذجك، ومرر نصك إليه: + +```py +>>> from transformers import pipeline + +>>> generator = pipeline("text-generation", model="username/my_awesome_eli5_clm-model") +>>> generator(prompt) +[{'generated_text': "Somatic hypermutation allows the immune system to be able to effectively reverse the damage caused by an infection.\n\n\nThe damage caused by an infection is caused by the immune system's ability to perform its own self-correcting tasks."}] +``` + + + +قسم النص وإرجع `input_ids` كتنسورات PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_clm-model") +>>> inputs = tokenizer(prompt, return_tensors="pt").input_ids +``` + +استخدم طريقة [`~generation.GenerationMixin.generate`] لتوليد النص. +للمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والبارامترات للتحكم في التوليد، راجع صفحة [استراتيجيات توليد النص](../generation_strategies). + +```py +>>> from transformers import AutoModelForCausalLM + +>>> model = AutoModelForCausalLM.from_pretrained("username/my_awesome_eli5_clm-model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) +``` + +فك ترميز الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +["Somatic hypermutation allows the immune system to react to drugs with the ability to adapt to a different environmental situation. In other words, a system of 'hypermutation' can help the immune system to adapt to a different environmental situation or in some cases even a single life. In contrast, researchers at the University of Massachusetts-Boston have found that 'hypermutation' is much stronger in mice than in humans but can be found in humans, and that it's not completely unknown to the immune system. A study on how the immune system"] +``` + + +قم بتقسيم النص وإرجاع `input_ids` كـ TensorFlow tensors: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_clm-model") +>>> inputs = tokenizer(prompt, return_tensors="tf").input_ids +``` + +استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء الملخص. للمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والبارامترات للتحكم في التوليد، راجع صفحة [استراتيجيات توليد النص](../generation_strategies). 
+ +```py +>>> from transformers import TFAutoModelForCausalLM + +>>> model = TFAutoModelForCausalLM.from_pretrained("username/my_awesome_eli5_clm-model") +>>> outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) +``` + +فك ترميز الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +['Somatic hypermutation allows the immune system to detect the presence of other viruses as they become more prevalent. Therefore, researchers have identified a high proportion of human viruses. The proportion of virus-associated viruses in our study increases with age. Therefore, we propose a simple algorithm to detect the presence of these new viruses in our samples as a sign of improved immunity. A first study based on this algorithm, which will be published in Science on Friday, aims to show that this finding could translate into the development of a better vaccine that is more effective for'] +``` + + \ No newline at end of file diff --git a/docs/source/ar/tasks/masked_language_modeling.md b/docs/source/ar/tasks/masked_language_modeling.md new file mode 100644 index 000000000000..e8382927d1e6 --- /dev/null +++ b/docs/source/ar/tasks/masked_language_modeling.md @@ -0,0 +1,442 @@ + + +# نمذجة اللغة المقنعة (Masked language modeling) + +[[open-in-colab]] + + + +تتنبأ نمذجة اللغة المقنعة برمز مقنع في تسلسل، ويمكن للنموذج الانتباه إلى الرموز بشكل ثنائي الاتجاه. هذا +يعني أن النموذج لديه إمكانية الوصول الكاملة إلى الرموز الموجودة على اليسار واليمين. تعد نمذجة اللغة المقنعة ممتازة للمهام التي +تتطلب فهمًا سياقيًا جيدًا لتسلسل كامل. BERT هو مثال على نموذج لغة مقنع. + +سيوضح لك هذا الدليل كيفية: + +1. تكييف [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) على مجموعة فرعية [r/askscience](https://www.reddit.com/r/askscience/) من مجموعة بيانات [ELI5](https://huggingface.co/datasets/eli5). +2. استخدام نموذج المدرب الخاص بك للاستدلال. + + + +لمعرفة جميع البنى والنسخ المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/fill-mask) + + + +قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate +``` + +نحن نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عندما تتم مطالبتك، أدخل رمزك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات ELI5 + +ابدأ بتحميل أول 5000 مثال من مجموعة بيانات [ELI5-Category](https://huggingface.co/datasets/eli5_category) باستخدام مكتبة 🤗 Datasets. سيعطيك هذا فرصة للتجربة والتأكد من أن كل شيء يعمل قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة. + +```py +>>> from datasets import load_dataset + +>>> eli5 = load_dataset("eli5_category", split="train[:5000]") +``` + +قم بتقسيم مجموعة البيانات `train` إلى مجموعتي تدريب واختبار باستخدام الدالة [`~datasets.Dataset.train_test_split`]: + +```py +>>> eli5 = eli5.train_test_split(test_size=0.2) +``` + +ثم ألق نظرة على مثال: + +```py +>>> eli5["train"][0] +{'q_id': '7h191n', + 'title': 'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?', + 'selftext': '', + 'category': 'Economics', + 'subreddit': 'explainlikeimfive', + 'answers': {'a_id': ['dqnds8l', 'dqnd1jl', 'dqng3i1', 'dqnku5x'], + 'text': ["The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. 
As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.", + 'None yet. It has to be reconciled with a vastly different house bill and then passed again.', + 'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?', + 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'], + 'score': [21, 19, 5, 3], + 'text_urls': [[], + [], + [], + ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']]}, + 'title_urls': ['url'], + 'selftext_urls': ['url']} +``` + +على الرغم من أن هذا قد يبدو كثيرًا، إلا أنك مهتم حقًا بحقل `text`. ما هو رائع حول مهام نمذجة اللغة هو أنك لا تحتاج إلى تسميات (تُعرف أيضًا باسم المهمة غير الخاضعة للإشراف) لأن الكلمة التالية *هي* التسمية. + +## معالجة مسبقة (Preprocess) + + + +بالنسبة لنمذجة اللغة المقنعة، فإن الخطوة التالية هي تحميل معالج DistilRoBERTa لمعالجة حقل `text` الفرعي: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base") +``` + +ستلاحظ من المثال أعلاه، أن حقل `text` موجود بالفعل داخل `answers`. هذا يعني أنك ستحتاج إلى استخراج حقل `text` الفرعي من بنيته المضمنة باستخدام الدالة [`flatten`](https://huggingface.co/docs/datasets/process#flatten): + +```py +>>> eli5 = eli5.flatten() +>>> eli5["train"][0] +{'q_id': '7h191n', + 'title': 'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?', + 'selftext': '', + 'category': 'Economics', + 'subreddit': 'explainlikeimfive', + 'answers.a_id': ['dqnds8l', 'dqnd1jl', 'dqng3i1', 'dqnku5x'], + 'answers.text': ["The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.", + 'None yet. It has to be reconciled with a vastly different house bill and then passed again.', + 'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?', + 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'], + 'answers.score': [21, 19, 5, 3], + 'answers.text_urls': [[], + [], + [], + ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']], + 'title_urls': ['url'], + 'selftext_urls': ['url']} +``` + +كل حقل فرعي هو الآن عمود منفصل كما هو موضح بواسطة بادئة `answers`، وحقل `text` هو قائمة الآن. 
بدلاً من +معالجة كل جملة بشكل منفصل، قم بتحويل القائمة إلى سلسلة حتى تتمكن من معالجتها بشكل مشترك. + +هنا أول دالة معالجة مسبقة لربط قائمة السلاسل لكل مثال ومعالجة النتيجة: + +```py +>>> def preprocess_function(examples): +... return tokenizer([" ".join(x) for x in examples["answers.text"]]) +``` + +لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم الدالة 🤗 Datasets [`~datasets.Dataset.map`]. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عدة عناصر في وقت واحد، وزيادة عدد العمليات باستخدام `num_proc`. احذف أي أعمدة غير ضرورية: + +```py +>>> tokenized_eli5 = eli5.map( +... preprocess_function, +... batched=True, +... num_proc=4, +... remove_columns=eli5["train"].column_names, +... ) +``` + + +تحتوي مجموعة البيانات هذه على تسلسلات رمزية، ولكن بعضها أطول من الطول الأقصى للمدخلات للنموذج. + +يمكنك الآن استخدام دالة معالجة مسبقة ثانية لـ: +- تجميع جميع التسلسلات +- تقسيم التسلسلات المجمّعة إلى أجزاء أقصر محددة بـ `block_size`، والتي يجب أن تكون أقصر من الحد الأقصى لطول المدخلات ومناسبة لذاكرة GPU. + +```py +>>> block_size = 128 + +>>> def group_texts(examples): +... # تجميع جميع النصوص. +... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} +... total_length = len(concatenated_examples[list(examples.keys())[0]]) +... # نتجاهل الجزء المتبقي الصغير، يمكننا إضافة الحشو إذا كان النموذج يدعمه بدلاً من هذا الإسقاط، يمكنك +... # تخصيص هذا الجزء حسب احتياجاتك. +... if total_length >= block_size: +... total_length = (total_length // block_size) * block_size +... # تقسيمها إلى أجزاء بحجم block_size. +... result = { +... k: [t[i : i + block_size] for i in range(0, total_length, block_size)] +... for k, t in concatenated_examples.items() +... } +... return result +``` + +طبق دالة `group_texts` على مجموعة البيانات بأكملها: + +```py +>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) +``` + +الآن، قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorForLanguageModeling`]. من الأكثر كفاءة أن تقوم بـ *الحشو الديناميكي* ليصل طولها إلى أطول جملة في الدفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الطول الأقصى. + + + + +استخدم رمز نهاية التسلسل كرمز الحشو وحدد `mlm_probability` لحجب الرموز عشوائياً كل مرة تكرر فيها البيانات: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> tokenizer.pad_token = tokenizer.eos_token +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) +``` + + + +استخدم رمز نهاية التسلسل كرمز الحشو وحدد `mlm_probability` لحجب الرموز عشوائياً كل مرة تكرر فيها البيانات: + +```py +>>> from transformers import DataCollatorForLanguageModeling + +>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf") +``` + + + +## التدريب (Train) + + + + + + +إذا لم تكن على دراية بتعديل نموذج باستخدام [`Trainer`], ألق نظرة على الدليل الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilRoBERTa باستخدام [`AutoModelForMaskedLM`]: + +```py +>>> from transformers import AutoModelForMaskedLM + +>>> model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base") +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعلمة الوحيدة المطلوبة هي `output_dir` والتي تحدد مكان حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك). +2. 
قم بتمرير معلمات التدريب إلى [`Trainer`] مع النموذج، ومجموعات البيانات، ومجمّع البيانات. +3. قم باستدعاء [`~Trainer.train`] لتعديل نموذجك. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_eli5_mlm_model", +... eval_strategy="epoch", +... learning_rate=2e-5, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=lm_dataset["train"], +... eval_dataset=lm_dataset["test"], +... data_collator=data_collator, +... tokenizer=tokenizer, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، استخدم طريقة [`~transformers.Trainer.evaluate`] لتقييم النموذج والحصول على مقياس + الحيرة: + +```py +>>> import math + +>>> eval_results = trainer.evaluate() +>>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") +Perplexity: 8.76 +``` + +ثم شارك نموذجك على Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن على دراية بتعديل نموذج باستخدام Keras، ألق نظرة على الدليل الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! + + +لتعديل نموذج في TensorFlow، ابدأ بإعداد دالة محسن، وجدول معدل التعلم، وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +ثم يمكنك تحميل DistilRoBERTa باستخدام [`TFAutoModelForMaskedLM`]: + +```py +>>> from transformers import TFAutoModelForMaskedLM + +>>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base") +``` + +قم بتحويل مجموعات بياناتك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... lm_dataset["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... lm_dataset["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن نماذج Transformers لديها جميعها دالة خسارة افتراضية ذات صلة بالمهمة، لذلك لا تحتاج إلى تحديد واحدة ما لم تكن تريد ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # لا توجد حجة للخسارة! +``` + +يمكن القيام بذلك عن طريق تحديد مكان دفع نموذجك ومعالج الرموز في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_eli5_mlm_model", +... tokenizer=tokenizer, +... ) +``` + +أخيراً، أنت مستعد لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق، وعدد العصور، والتعليقات الخاصة بك لتعديل النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائياً إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +لمثال أكثر تفصيلاً حول كيفية تعديل نموذج للنمذجة اللغوية المقنعة، ألق نظرة على الدفتر المقابل +[دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) +أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb). 
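+
+If you trained with the Keras path above and also want a rough perplexity figure (the PyTorch path reports one via `trainer.evaluate`), a small sketch along these lines can be run after `fit` finishes. It assumes the compiled model exposes its built-in masked-LM loss through Keras `evaluate`; the exact number will vary between runs because the data collator masks tokens randomly.
+
+```py
+>>> import math
+
+>>> # Evaluate on the held-out split and turn the loss into a (pseudo-)perplexity.
+>>> eval_loss = model.evaluate(tf_test_set, return_dict=True)["loss"]
+>>> print(f"Perplexity: {math.exp(eval_loss):.2f}")
+```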
+ + + +## الاستدلال + +رائع، الآن بعد أن قمت بتعديل نموذج، يمكنك استخدامه للاستدلال! + +جهّز بعض النصوص التي تريد أن يملأ النموذج الفراغات فيها، واستخدم الرمز الخاص `` للإشارة إلى الفراغ: + +```py +>>> text = "The Milky Way is a galaxy." +``` + +أبسط طريقة لتجربة نموذجك المعدل للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء كائن `pipeline` لملء الفراغ مع نموذجك، ومرر نصك إليه. إذا أردت، يمكنك استخدام معلمة `top_k` لتحديد عدد التنبؤات التي تريد إرجاعها: + +```py +>>> from transformers import pipeline + +>>> mask_filler = pipeline("fill-mask", "username/my_awesome_eli5_mlm_model") +>>> mask_filler(text, top_k=3) +[{'score': 0.5150994658470154, + 'token': 21300, + 'token_str': ' spiral', + 'sequence': 'The Milky Way is a spiral galaxy.'}, + {'score': 0.07087188959121704, + 'token': 2232, + 'token_str': ' massive', + 'sequence': 'The Milky Way is a massive galaxy.'}, + {'score': 0.06434620916843414, + 'token': 650, + 'token_str': ' small', + 'sequence': 'The Milky Way is a small galaxy.'}] +``` + + + +قم بتجزئة النص وإرجاع `input_ids` كمتجهات PyTorch. ستحتاج أيضًا إلى تحديد موضع رمز ``: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_mlm_model") +>>> inputs = tokenizer(text, return_tensors="pt") +>>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] +``` + +قم بتمرير المدخلات إلى النموذج وإرجاع `logits` للرمز المقنع: + +```py +>>> from transformers import AutoModelForMaskedLM + +>>> model = AutoModelForMaskedLM.from_pretrained("username/my_awesome_eli5_mlm_model") +>>> logits = model(**inputs).logits +>>> mask_token_logits = logits[0, mask_token_index, :] +``` + +ثم قم بإرجاع الرموز الثلاثة المقنعة ذات الاحتمالية الأعلى وطباعتها: + +```py +>>> top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist() + +>>> for token in top_3_tokens: +... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) +The Milky Way is a spiral galaxy. +The Milky Way is a massive galaxy. +The Milky Way is a small galaxy. +``` + + +قم بتقسيم النص إلى رموز وإرجاع `input_ids` كـ TensorFlow tensors. ستحتاج أيضًا إلى تحديد موضع رمز ``: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_mlm_model") +>>> inputs = tokenizer(text, return_tensors="tf") +>>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1] +``` + +قم بتمرير المدخلات إلى النموذج وإرجاع `logits` للرمز المقنع: + +```py +>>> from transformers import TFAutoModelForMaskedLM + +>>> model = TFAutoModelForMaskedLM.from_pretrained("username/my_awesome_eli5_mlm_model") +>>> logits = model(**inputs).logits +>>> mask_token_logits = logits[0, mask_token_index, :] +``` + +ثم قم بإرجاع الرموز الثلاثة المقنعة ذات الاحتمالية الأعلى وطباعتها: + +```py +>>> top_3_tokens = tf.math.top_k(mask_token_logits, 3).indices.numpy() + +>>> for token in top_3_tokens: +... print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) +The Milky Way is a spiral galaxy. +The Milky Way is a massive galaxy. +The Milky Way is a small galaxy. 
+``` + + \ No newline at end of file diff --git a/docs/source/ar/tasks/multiple_choice.md b/docs/source/ar/tasks/multiple_choice.md new file mode 100644 index 000000000000..78f98560754f --- /dev/null +++ b/docs/source/ar/tasks/multiple_choice.md @@ -0,0 +1,452 @@ + + +# الاختيار من متعدد (Multiple choice) + +[[open-in-colab]] + +مهمة الاختيار من متعدد مشابهة لمهمة الإجابة على الأسئلة، ولكن مع توفير عدة إجابات محتملة مع سياق، ويُدرّب النموذج على تحديد الإجابة الصحيحة. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط نموذج [BERT](https://huggingface.co/google-bert/bert-base-uncased) باستخدام الإعداد `regular` لمجموعة بيانات [SWAG](https://huggingface.co/datasets/swag) لاختيار الإجابة الأفضل من بين الخيارات المتعددة المتاحة مع السياق. +2. استخدام النموذج المضبوط للاستدلال. + +قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate +``` + +نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات SWAG + +ابدأ بتحميل تهيئة `regular` لمجموعة بيانات SWAG من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +ثم ألق نظرة على مثال: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +على الرغم من أن الحقول تبدو كثيرة، إلا أنها في الواقع بسيطة جداً: + +- `sent1` و `sent2`: يعرض هذان الحقلان بداية الجملة، وبدمجهما معًا، نحصل على حقل `startphrase`. +- `ending`: يقترح نهاية محتملة للجملة، واحدة منها فقط هي الصحيحة. +- `label`: يحدد نهاية الجملة الصحيحة. + +## المعالجة المسبقة (Preprocess) + +الخطوة التالية هي استدعاء مُجزئ BERT لمعالجة بدايات الجمل والنهايات الأربع المحتملة: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +``` + +تحتاج دالة المعالجة المسبقة التي تريد إنشاءها إلى: + +1. إنشاء أربع نسخ من حقل `sent1` ودمج كل منها مع `sent2` لإعادة إنشاء كيفية بدء الجملة. +2. دمج `sent2` مع كل من نهايات الجمل الأربع المحتملة. +3. تتجميع هاتين القائمتين لتتمكن من تجزئتهما، ثم إعادة ترتيبها بعد ذلك بحيث يكون لكل مثال حقول `input_ids` و `attention_mask` و `labels` مقابلة. + + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... 
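+... # Un-flatten: regroup the tokenized pairs into chunks of 4, one group of candidate choices per original example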
return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] الخاصة بـ 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +لا يحتوي 🤗 Transformers على مجمع بيانات للاختيار من متعدد، لذلك ستحتاج إلى تكييف [`DataCollatorWithPadding`] لإنشاء دفعة من الأمثلة. من الأكفأ إضافة حشو (padding) ديناميكي للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول. + +يقوم `DataCollatorForMultipleChoice` بتجميع جميع مدخلات النموذج، ويطبق الحشو، ثم يعيد تجميع النتائج في شكلها الأصلي: + + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... 
return batch +``` + + + +## التقييم (Evaluate) + +يُفضل غالبًا تضمين مقياس أثناء التدريب لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل مقياس [الدقة](https://huggingface.co/spaces/evaluate-metric/accuracy) (انظر إلى [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل المقياس وحسابه): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +ثم أنشئ دالة لتمرير التنبؤات والتسميات إلى [`~evaluate.EvaluationModule.compute`] لحساب الدقة: + +```py +>>> import numpy as np + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +دالتك `compute_metrics` جاهزة الآن، وستعود إليها عند إعداد تدريبك. + +## التدريب (Train) + + + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`], فراجع الدرس الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل BERT باستخدام [`AutoModelForMultipleChoice`]: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased") +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعلمة الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم الدقة وحفظ نقطة فحص التدريب. +2. مرر معلمات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج ومُجمِّع البيانات والمعالج ودالة تجميع البيانات ودالة `compute_metrics`. +3. استدعي [`~Trainer.train`] لضبط نموذجك. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_swag_model", +... eval_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... processing_class=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فراجع الدرس الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! 
+ + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل التعلم وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +ثم يمكنك تحميل BERT باستخدام [`TFAutoModelForMultipleChoice`]: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased") +``` + +حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة مناسبة للمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك: + +```py +>>> model.compile(optimizer=optimizer) # لا توجد وسيطة خسارة! +``` + +الخطوتان الأخيرتان قبل بدء التدريب هما: حساب دقة التنبؤات، وتوفير طريقة لرفع النموذج إلى Hub. ويمكن تحقيق ذلك باستخدام [استدعاءات Keras](../main_classes/keras_callbacks) + +مرر دالتك `compute_metrics` إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +حدد مكان دفع نموذجك ومعالجك في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +ثم قم بتضمين الاستدعاءات معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ[`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب والاستدعاءات لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للاختيار من متعدد، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) +أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb) المقابل. + + + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! + +قم بإنشاء نص واقتراح إجابتين محتملتين: + +```py +>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." +>>> candidate1 = "The law does not apply to croissants and brioche." +>>> candidate2 = "The law applies to baguettes." +``` + + + +قم بتحليل كل مطالبة وزوج إجابة مرشح وأعد تنسورات PyTorch. 
يجب عليك أيضًا إنشاء بعض `العلامات`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True) +>>> labels = torch.tensor(0).unsqueeze(0) +``` + +مرر مدخلاتك والعلامات إلى النموذج وأرجع`logits`: + +```py +>>> from transformers import AutoModelForMultipleChoice + +>>> model = AutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model") +>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels) +>>> logits = outputs.logits +``` + +استخرج الفئة ذات الاحتمالية الأكبر: + +```py +>>> predicted_class = logits.argmax().item() +>>> predicted_class +0 +``` + + +قم بتحليل كل مطالبة وزوج إجابة مرشح وأعد موترات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True) +``` + +مرر مدخلاتك إلى النموذج وأعد القيم logits: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model") +>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()} +>>> outputs = model(inputs) +>>> logits = outputs.logits +``` + +استخرج الفئة ذات الاحتمالية الأكبر: + +```py +>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) +>>> predicted_class +0 +``` + + diff --git a/docs/source/ar/tasks/question_answering.md b/docs/source/ar/tasks/question_answering.md new file mode 100644 index 000000000000..0c4b66443d81 --- /dev/null +++ b/docs/source/ar/tasks/question_answering.md @@ -0,0 +1,432 @@ + + +# الإجابة على الأسئلة (Question answering) + +[[open-in-colab]] + + + +تُقدّم مهام الإجابة على الأسئلة إجابةً بناءً على سؤال. إذا سبق لك أن سألت مساعدًا افتراضيًا مثل Alexa أو Siri أو Google عن حالة الطقس، فأنت قد استخدمت نموذج للإجابة على الأسئلة من قبل. هناك نوعان شائعان لمهام الإجابة على الأسئلة: + +- الاستخراجية: استخراج الإجابة من السياق المحدد. +- التلخيصية: إنشاء إجابة من السياق تجيب على السؤال بشكل صحيح. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [SQuAD](https://huggingface.co/datasets/squad) للإجابة على الأسئلة الاستخراجية. +2. استخدام النموذج المضبوط للاستدلال. + + + +لمشاهدة جميع الهياكل والنسخ المتوافقة مع هذه المهمة، نوصي بالرجوع إلى [صفحة المهمة](https://huggingface.co/tasks/question-answering) + + + +قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate +``` + +نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات SQuAD + +ابدأ بتحميل جزء أصغر من مجموعة بيانات SQuAD من مكتبة 🤗 Datasets. سيتيح لك ذلك فرصة للتجربة والتحقق من عمل كل شيء بشكل صحيح قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة. 
+ +```py +>>> from datasets import load_dataset + +>>> squad = load_dataset("squad", split="train[:5000]") +``` + +قم بتقسيم تقسيم `train` لمجموعة البيانات إلى مجموعة تدريب واختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]: + +```py +>>> squad = squad.train_test_split(test_size=0.2) +``` + +ثم ألق نظرة على مثال: + +```py +>>> squad["train"][0] +{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}, + 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', + 'id': '5733be284776f41900661182', + 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', + 'title': 'University_of_Notre_Dame' +} +``` + +هناك العديد من الحقول المهمة هنا: + +- `answers`: موقع بداية الرمز المميز للإجابة ونص الإجابة. +- `context`: معلومات أساسية يحتاج النموذج إلى استخراج الإجابة منها. +- `question`: السؤال الذي يجب على النموذج الإجابة عليه. + +## المعالجة المسبقة (Preprocess) + + + +الخطوة التالية هي تحميل المحلل اللغوى DistilBERT لمعالجة حقلي `question` و `context`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +``` + +هناك بعض خطوات المعالجة المسبقة الخاصة بمهام الإجابة على الأسئلة التي يجب أن تكون على دراية بها: + +1. قد تحتوي بعض الأمثلة في مجموعة البيانات على `context` طويلًا يتجاوز الحد الأقصى لطول مدخل النموذج. للتعامل مع النصوص الأطول، يتم اقتطاع `context` فقط عن طريق تعيين `truncation="only_second"`. +2. بعد ذلك، يتم تحديد مواضع بداية ونهاية الإجابة في `context` الأصلي عن طريق تعيين + `return_offset_mapping=True`. +3. باستخدام التعيين، يمكن الآن تحديد رموز بداية ونهاية الإجابة. استخدم طريقة [`~tokenizers.Encoding.sequence_ids`] + لتحديد أجزاء الإزاحة التي تتوافق مع `question` و `context`. + +فيما يلي كيفية إنشاء دالة لقص وتعيين رموز البداية والنهاية لـ `answer` إلى `context`: + +```py +>>> def preprocess_function(examples): +... questions = [q.strip() for q in examples["question"]] +... inputs = tokenizer( +... questions, +... examples["context"], +... max_length=384, +... truncation="only_second", +... return_offsets_mapping=True, +... padding="max_length", +... ) + +... offset_mapping = inputs.pop("offset_mapping") +... answers = examples["answers"] +... start_positions = [] +... end_positions = [] + +... for i, offset in enumerate(offset_mapping): +... answer = answers[i] +... start_char = answer["answer_start"][0] +... end_char = answer["answer_start"][0] + len(answer["text"][0]) +... sequence_ids = inputs.sequence_ids(i) + +... # Find the start and end of the context +... idx = 0 +... while sequence_ids[idx] != 1: +... idx += 1 +... context_start = idx +... while sequence_ids[idx] == 1: +... idx += 1 +... context_end = idx - 1 + +... # If the answer is not fully inside the context, label it (0, 0) +... if offset[context_start][0] > end_char or offset[context_end][1] < start_char: +... 
start_positions.append(0) +... end_positions.append(0) +... else: +... # Otherwise it's the start and end token positions +... idx = context_start +... while idx <= context_end and offset[idx][0] <= start_char: +... idx += 1 +... start_positions.append(idx - 1) + +... idx = context_end +... while idx >= context_start and offset[idx][1] >= end_char: +... idx -= 1 +... end_positions.append(idx + 1) + +... inputs["start_positions"] = start_positions +... inputs["end_positions"] = end_positions +... return inputs +``` + +لتطبيق المعالجة المسبقة على كامل مجموعة البيانات، استخدم [`~datasets.Dataset.map`] من مكتبة 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات دفعة واحدة. قم بإزالة أي أعمدة لا تحتاجها: + +```py +>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) +``` + +الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DefaultDataCollator`]. بخلاف مجمّعات البيانات الأخرى في 🤗 Transformers، لا يطبق [`DefaultDataCollator`] أي معالجة مسبقة إضافية مثل الحشو. + + + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator() +``` + + + +```py +>>> from transformers import DefaultDataCollator + +>>> data_collator = DefaultDataCollator(return_tensors="tf") +``` + + + +## التدريب (Train) + + + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`], ألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل DistilBERT باستخدام [`AutoModelForQuestionAnswering`]: + +```py +>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer + +>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد المعاملات الفائقة للتدريب في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). +2. مرر معاملات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج، ومجموعة البيانات، والمُحلّل النصي، ومُجمّع البيانات. +3. استدعِ ـ [`~Trainer.train`] لضبط النموذج. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_qa_model", +... eval_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_squad["train"], +... eval_dataset=tokenized_squad["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك في Hub باستخدام الدالة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! + + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن، وجدول معدل التعلم، وبعض المعاملات الفائقة للتدريب: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_epochs = 2 +>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs +>>> optimizer, schedule = create_optimizer( +... init_lr=2e-5, +... num_warmup_steps=0, +... 
num_train_steps=total_train_steps, +... ) +``` + +ثم يمكنك تحميل DistilBERT باستخدام [`TFAutoModelForQuestionAnswering`]: + +```py +>>> from transformers import TFAutoModelForQuestionAnswering + +>>> model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") +``` + +حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_squad["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_squad["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method): + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) +``` + +آخر شيء يجب إعداده قبل بدء التدريب هو توفير طريقة لدفع نموذجك إلى Hub. يمكن القيام بذلك عن طريق تحديد مكان دفع نموذجك ومعالجك المعجمي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> callback = PushToHubCallback( +... output_dir="my_awesome_qa_model", +... tokenizer=tokenizer, +... ) +``` + +أخيرًا، أنت جاهز لبدء تدريب نموذجك! اتصل بـ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة، وعدد العهود، ومعاودة الاتصال الخاصة بك لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback]) +``` +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + + +للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للإجابة على الأسئلة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) المقابل +أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb). + + + +## التقييم (Evaluate) + +يتطلب التقييم للإجابة على الأسئلة قدرًا كبيرًا من المعالجة اللاحقة. لتوفير وقتك، يتخطى هذا الدليل خطوة التقييم. لا يزال [`Trainer`] يحسب خسارة التقييم أثناء التدريب، مما يعني أنك لست تجهل تمامًا أداء نموذجك. + +إذا كان لديك المزيد من الوقت وتهتم بكيفية تقييم نموذجك للإجابة على الأسئلة، فألق نظرة على فصل [الإجابة على الأسئلة](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) من دورة 🤗 Hugging Face! + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! + +حدد سؤالًا وسياقًا ليقوم النموذج بالتنبؤ بالإجابة عليه: + +```py +>>> question = "How many programming languages does BLOOM support?" +>>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." +``` + +أبسط طريقة لتجربة نموذجك المُدرَّب للاستدلال هي استخدامه في [`pipeline`]. 
قم بإنشاء كائن لـ `pipeline` للإجابة على الأسئلة باستخدام نموذجك، ومرِّر النص إليه: + +```py +>>> from transformers import pipeline + +>>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model") +>>> question_answerer(question=question, context=context) +{'score': 0.2058267742395401, + 'start': 10, + 'end': 95, + 'answer': '176 مليار معامل ويمكنه إنشاء نصوص بـ 46 لغة طبيعية و 13'} +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + + + قسّم النص وأرجع تنسورات PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") +>>> inputs = tokenizer(question, context, return_tensors="pt") +``` + +مرر مدخلاتك إلى النموذج وأرجع `logits`: + +```py +>>> import torch +>>> from transformers import AutoModelForQuestionAnswering + +>>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") +>>> with torch.no_grad(): +... outputs = model(**inputs) +``` + +احصل على أعلى احتمال من مخرجات النموذج لموضعي البداية والنهاية: + +```py +>>> answer_start_index = outputs.start_logits.argmax() +>>> answer_end_index = outputs.end_logits.argmax() +``` + +استخلاص الإجابة من الرموز المتوقعة: + +```py +>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] +>>> tokenizer.decode(predict_answer_tokens) +'176 billion parameters and can generate text in 46 languages natural languages and 13' +``` + + +قم بتحليل النص المعجمي وأعد موترات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") +>>> inputs = tokenizer(question, context, return_tensors="tf") +``` + +مرر مدخلاتك إلى النموذج وأعد `logits`: + +```py +>>> from transformers import TFAutoModelForQuestionAnswering + +>>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") +>>> outputs = model(**inputs) +``` + +احصل على أعلى احتمال من مخرجات النموذج لموضعي البداية والنهاية: + +```py +>>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0]) +>>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0]) +``` + +استخلاص الإجابة من الرموز المتوقعة: + +```py +>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] +>>> tokenizer.decode(predict_answer_tokens) +'176 billion parameters and can generate text in 46 languages natural languages and 13' +``` + + diff --git a/docs/source/ar/tasks/sequence_classification.md b/docs/source/ar/tasks/sequence_classification.md new file mode 100644 index 000000000000..a98964957b47 --- /dev/null +++ b/docs/source/ar/tasks/sequence_classification.md @@ -0,0 +1,387 @@ + + +# تصنيف النص(Text classification) + +[[open-in-colab]] + + + +تصنيف النص هو مهمة NLP شائعة حيث يُعيّن تصنيفًا أو فئة للنص. تستخدم بعض أكبر الشركات تصنيف النصوص في الإنتاج لمجموعة واسعة من التطبيقات العملية. أحد أكثر أشكال تصنيف النص شيوعًا هو تحليل المشاعر، والذي يقوم بتعيين تسمية مثل 🙂 إيجابية، 🙁 سلبية، أو 😐 محايدة لتسلسل نصي. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [IMDb](https://huggingface.co/datasets/imdb) لتحديد ما إذا كانت مراجعة الفيلم إيجابية أو سلبية. +2. استخدام نموذج الضبط الدقيق للتنبؤ. + + + +لرؤية جميع البنى ونقاط التحقق المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/text-classification). 
+ + + +قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate accelerate +``` + +نحن نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عند المطالبة، أدخل رمزك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات IMDb + +ابدأ بتحميل مجموعة بيانات IMDb من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> imdb = load_dataset("imdb") +``` + +ثم ألق نظرة على مثال: + +```py +>>> imdb["test"][0] +{ + "label": 0, + "text": "I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.", +} +``` + +هناك حقولان في هذه المجموعة من البيانات: + +- `text`: نص مراجعة الفيلم. +- `label`: قيمة إما `0` لمراجعة سلبية أو `1` لمراجعة إيجابية. + +## المعالجة المسبقة(Preprocess) + +الخطوة التالية هي تحميل المُجزِّئ النص DistilBERT لتهيئة لحقل `text`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +``` + +أنشئ دالة لتهيئة حقل `text` وتقصير السلاسل النصية بحيث لا يتجاوز طولها الحد الأقصى لإدخالات DistilBERT: + +```py +>>> def preprocess_function(examples): +... return tokenizer(examples["text"], truncation=True) +``` + +لتطبيق دالة التهيئة على مجموعة البيانات بأكملها، استخدم دالة 🤗 Datasets [`~datasets.Dataset.map`] . يمكنك تسريع `map` باستخدام `batched=True` لمعالجة دفعات من البيانات: + +```py +tokenized_imdb = imdb.map(preprocess_function, batched=True) +``` + +الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorWithPadding`]. الأكثر كفاءة هو استخدام الحشو الديناميكي لجعل الجمل متساوية في الطول داخل كل دفعة، بدلًا من حشو كامل البيانات إلى الحد الأقصى للطول. + + + + +```py +>>> from transformers import DataCollatorWithPadding + +>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) +``` + + + +```py +>>> from transformers import DataCollatorWithPadding + +>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## التقييم(Evaluate) + +يُعدّ تضمين مقياس أثناء التدريب مفيدًا لتقييم أداء النموذج. 
يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) . بالنسبة لهذه المهمة، قم بتحميل مقياس [الدقة](https://huggingface.co/spaces/evaluate-metric/accuracy) (راجع جولة 🤗 Evaluate [السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لمعرفة المزيد حول كيفية تحميل وحساب مقياس): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +ثم أنشئ دالة تقوم بتمرير تنبؤاتك وتصنيفاتك إلى [`~evaluate.EvaluationModule.compute`] لحساب الدقة: + +```py +>>> import numpy as np + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +دالة `compute_metrics` جاهزة الآن، وستعود إليها عند إعداد التدريب. + +## التدريب(Train) + +قبل أن تبدأ في تدريب نموذجك، قم بإنشاء خريطة من المعرفات المتوقعة إلى تسمياتها باستخدام `id2label` و `label2id`: + +```py +>>> id2label = {0: "NEGATIVE", 1: "POSITIVE"} +>>> label2id = {"NEGATIVE": 0, "POSITIVE": 1} +``` + + + + + +إذا لم تكن على دراية بضبط نموذج دقيق باستخدام [`Trainer`], فالق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilBERT مع [`AutoModelForSequenceClassification`] جنبًا إلى جنب مع عدد التصنيفات المتوقعة، وتصنيفات الخرائط: + +```py +>>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer + +>>> model = AutoModelForSequenceClassification.from_pretrained( +... "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id +... ) +``` + +في هذه المرحلة، هناك ثلاث خطوات فقط متبقية: + +1. حدد مُعامِلات التدريب في [`TrainingArguments`]. المُعامل المطلوب الوحيد هو `output_dir`، لتحديد مكان حفظ النموذج. يمكنك رفع النموذج إلى Hub بتعيين `push_to_hub=True` (يجب تسجيل الدخول إلى Hugging Face لرفع النموذج). سيقوم `Trainer` بتقييم الدقة وحفظ نقاط التحقق في نهاية كل حقبة. +2. مرر مُعامِلات التدريب إلى `Trainer` مع النموذج، ومجموعة البيانات، والمحلل اللغوي، ومُجمِّع البيانات، ووظيفة `compute_metrics`. +3. استدعِ [`~Trainer.train`] لضبط النموذج. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... eval_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_imdb["train"], +... eval_dataset=tokenized_imdb["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + + + +يستخدم [`Trainer`] الحشو الديناميكي افتراضيًا عند تمرير `tokenizer` إليه. في هذه الحالة، لا تحتاج لتحديد مُجمِّع البيانات صراحةً. + + + +بعد اكتمال التدريب، شارك نموذجك على Hub باستخدام الطريقة [`~transformers.Trainer.push_to_hub`] ليستخدمه الجميع: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن على دراية بضبط نموذج باستخدام Keras، قم بالاطلاع على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! 
+ + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة المحسن، وجدول معدل التعلم، وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer +>>> import tensorflow as tf + +>>> batch_size = 16 +>>> num_epochs = 5 +>>> batches_per_epoch = len(tokenized_imdb["train"]) // batch_size +>>> total_train_steps = int(batches_per_epoch * num_epochs) +>>> optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +ثم يمكنك تحميل DistilBERT مع [`TFAutoModelForSequenceClassification`] بالإضافة إلى عدد التصنيفات المتوقعة، وتعيينات التسميات: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained( +... "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id +... ) +``` + +قم بتحويل مجموعات بياناتك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_imdb["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_imdb["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers لديها دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +آخر أمرين يجب إعدادهما قبل بدء التدريب هو حساب الدقة من التوقعات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم ذلك باستخدام [Keras callbacks](../main_classes/keras_callbacks). + +قم بتمرير دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +حدد مكان دفع نموذجك والمجزئ اللغوي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +ثم اجمع الاستدعاءات معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت مستعد لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق، وعدد الحقبات، واستدعاءاتك لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر عمقًا حول كيفية ضبط نموذج لتصنيف النصوص، قم بالاطلاع على الدفتر المقابل +[دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb) +أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb). + + + +## الاستدلال(Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! + +احصل على بعض النصوص التي ترغب في إجراء الاستدلال عليها: + +```py +>>> text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three." 
+``` + +أسهل طريقة لتجربة النموذج المضبوط للاستدلال هي استخدامه ضمن [`pipeline`]. قم بإنشاء `pipeline` لتحليل المشاعر مع نموذجك، ومرر نصك إليه: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model") +>>> classifier(text) +[{'label': 'POSITIVE', 'score': 0.9994940757751465}] +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + +قم يتجزئة النص وإرجاع تنسورات PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +مرر المدخلات إلى النموذج واسترجع `logits`: + +```py +>>> from transformers import AutoModelForSequenceClassification + +>>> model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +>>> with torch.no_grad(): +... logits = model(**inputs).logits +``` + +استخرج الفئة ذات الاحتمالية الأعلى، واستخدم `id2label` لتحويلها إلى تصنيف نصي: + +```py +>>> predicted_class_id = logits.argmax().item() +>>> model.config.id2label[predicted_class_id] +'POSITIVE' +``` + + +قم بتحليل النص وإرجاع تنسيقات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +قم بتمرير مدخلاتك إلى النموذج وإرجاع `logits`: + +```py +>>> from transformers import TFAutoModelForSequenceClassification + +>>> model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +>>> logits = model(**inputs).logits +``` + +استخرج الفئة ذات الاحتمالية الأعلى، واستخدم `id2label` لتحويلها إلى تصنيف نصي: + +```py +>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) +>>> model.config.id2label[predicted_class_id] +'POSITIVE' +``` + + diff --git a/docs/source/ar/tasks/summarization.md b/docs/source/ar/tasks/summarization.md new file mode 100644 index 000000000000..17dbcb42e837 --- /dev/null +++ b/docs/source/ar/tasks/summarization.md @@ -0,0 +1,397 @@ + + +# التلخيص (Summarization) + +[[open-in-colab]] + + + +يقوم التلخيص بإنشاء نسخة مختصرة من مستند أو مقال، حيث يلتقط جميع المعلومات المهمة. بالإضافة إلى الترجمة، يعتبر التلخيص مثالاً آخر على مهمة يمكن صياغتها كتسلسل إلى تسلسل. يمكن أن يكون التلخيص: + +- استخراجي: استخراج أهم المعلومات من مستند. +- تجريدي: إنشاء نص جديد يلخص أهم المعلومات. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط دقيق [T5](https://huggingface.co/google-t5/t5-small) على مجموعة فرعية من مشاريع قوانين ولاية كاليفورنيا من مجموعة بيانات [BillSum](https://huggingface.co/datasets/billsum) للتلخيص التجريدي. +2. استخدام النموذج المضبوط بدقة للتنبؤ. + + + +لمشاهدة جميع البنى ونقاط التفتيش المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/summarization) + + + +قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate rouge_score +``` + +نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. 
عند المطالبة، أدخل الرمز المميز لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات BillSum + +ابدأ بتحميل جزء صغير من بيانات مشاريع القوانين الخاصة بولاية كاليفورنيا من مجموعة بيانات BillSum في مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> billsum = load_dataset("billsum", split="ca_test") +``` + +قسّم مجموعة البيانات إلى مجموعتي تدريب واختبار باستخدام الدالة [`~datasets.Dataset.train_test_split`]: + +```py +>>> billsum = billsum.train_test_split(test_size=0.2) +``` + +ثم ألقِ نظرة على مثال: + +```py +>>> billsum["train"][0] +{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. 
Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.', + 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the 
benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.', + 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'} +``` + +هناك مُدخلان سترغب في استخدامهما: + +- `text`: نص القانون الذي سيكون مُدخلًا للنموذج. +- `summary`: نسخة مُختصرة من `text` والتي ستكون هدف النموذج. + +## المعالجة المسبقة (Preprocess) + +الخطوة التالية هي تحميل مجزء النصوص T5 لمعالجة `text` و `summary`: + +```py +>>> from transformers import AutoTokenizer + +>>> checkpoint = "google-t5/t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +``` + +وظيفة المعالجة المسبقة التي تريد إنشاءها تحتاج إلى: + +1. إضافة بادئة للمُدخل باستخدام توجيه حتى يعرف T5 أن هذه مهمة تلخيص. تتطلب بعض النماذج القادرة على مهام البرمجة اللغوية العصبية المتعددة توجيهات لمهام مُحددة. +2. استخدام مُعامل الكلمة الرئيسية `text_target` عند ترميز التصنيفات. +3. قصّ التسلسلات بحيث لا يزيد طولها عن الحد الأقصى الذي تم تعيينه بواسطة مُعامل `max_length`. 
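+
+قبل كتابة الدالة، قد يكون من المفيد تقدير أثر القصّ المذكور في النقطة الثالثة. المقتطف التالي مثال توضيحي اختياري (بافتراض أنك حمّلت `tokenizer` و`billsum` كما في الأعلى) لعدّ رموز أول مثال قبل القصّ:
+
+```py
+>>> # مقتطف توضيحي: عدّ الرموز في أول مثال لتقدير ما سيقصّه `max_length`
+>>> sample = billsum["train"][0]
+>>> n_text_tokens = len(tokenizer("summarize: " + sample["text"])["input_ids"])
+>>> n_summary_tokens = len(tokenizer(sample["summary"])["input_ids"])
+>>> print(n_text_tokens, n_summary_tokens)  # غالبًا ما يتجاوز نص القانون حدّ 1024 رمزًا بينما يبقى الملخص أقصر بكثير
+```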
+ +```py +>>> prefix = "summarize: " + +>>> def preprocess_function(examples): +... inputs = [prefix + doc for doc in examples["text"]] +... model_inputs = tokenizer(inputs, max_length=1024, truncation=True) + +... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) + +... model_inputs["labels"] = labels["input_ids"] +... return model_inputs +``` + +لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] الخاصة بـ 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +>>> tokenized_billsum = billsum.map(preprocess_function, batched=True) +``` + +الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorForSeq2Seq`]. الأكثر كفاءة *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء عملية التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول. + + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) +``` + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") +``` + + + +## التقييم (Evaluate) + +يُعد تضمين مقياس أثناء التدريب مفيدًا غالبًا لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل مقياس [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) (راجع [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) الخاصة بـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل وحساب مقياس): + +```py +>>> import evaluate + +>>> rouge = evaluate.load("rouge") +``` + +ثم قم بإنشاء دالة تُمرر تنبؤاتك وتصنيفاتك إلى [`~evaluate.EvaluationModule.compute`] لحساب مقياس ROUGE: + +```py +>>> import numpy as np + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) +... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) +... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + +... result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + +... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions] +... result["gen_len"] = np.mean(prediction_lens) + +... return {k: round(v, 4) for k, v in result.items()} +``` + +دالة `compute_metrics` الخاصة بك جاهزة الآن، وستعود إليها عند إعداد التدريب الخاص بك. + +## التدريب (Train) + + + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`]، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل T5 باستخدام [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +في هذه المرحلة، لم يتبق سوى ثلاث خطوات: + +1. حدد مُعامِلات التدريب الخاصة بك في [`Seq2SeqTrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يُحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (تحتاج إلى تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم مقياس ROUGE وحفظ نقطة تفتيش التدريب. +2. 
مرر مُعامِلات التدريب إلى [`Seq2SeqTrainer`] جنبًا إلى جنب مع النموذج ومجموعة البيانات والمُحلِّل اللغوي وجامع البيانات ودالة `compute_metrics`. +3. استدعِ [`~Trainer.train`] لضبط نموذجك. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="my_awesome_billsum_model", +... eval_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=4, +... predict_with_generate=True, +... fp16=True, #change to bf16=True for XPU +... push_to_hub=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_billsum["train"], +... eval_dataset=tokenized_billsum["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! + + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل التعلم وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer, AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +ثم يمكنك تحميل T5 باستخدام [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_billsum["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... tokenized_billsum["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers لديها دالة خسارة ذات صلة بالمهمة افتراضيًا، لذلك لست بحاجة إلى تحديد واحدة ما لم تكن ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +آخر شيئين يجب إعدادهما قبل بدء التدريب هما حساب درجة ROUGE من التنبؤات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [استدعاءات Keras](../main_classes/keras_callbacks). + +مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) +``` + +حدد مكان دفع نموذجك ومُحلِّلك اللغوي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_billsum_model", +... tokenizer=tokenizer, +... ) +``` + +ثم اجمع استدعاءاتك معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز لبدء تدريب نموذجك! 
اتصل بـ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب واستدعاءاتك لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للتجميع، ألقِ نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) +أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb) المقابل. + + + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! + +خدد بعض النصوص الذي ترغب في تلخيصها. بالنسبة لـ T5، تحتاج إلى إضافة بادئة إلى مُدخلاتك اعتمادًا على المهمة التي تعمل عليها. بالنسبة التلخيص، يجب عليك إضافة بادئة إلى مُدخلاتك كما هو موضح أدناه: + +```py +>>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes." +``` + +أبسط طريقة لتجربة نموذجك المضبوط للاستدلال هي استخدامه في [`pipeline`]. استخدم `pipeline` للتلخيص باستخدام نموذجك، ومرر نصك إليه: + +```py +>>> from transformers import pipeline + +>>> summarizer = pipeline("summarization", model="username/my_awesome_billsum_model") +>>> summarizer(text) +[{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}] +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + +قسم النص وإرجع `input_ids` كتنسورات PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_billsum_model") +>>> inputs = tokenizer(text, return_tensors="pt").input_ids +``` + +استخدم طريقة [`~generation.GenerationMixin.generate`] لإنشاء التلخيص. لمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والمعلمات للتحكم في التوليد، راجع واجهة برمجة تطبيقات [توليد النص](../main_classes/text_generation). + +```py +>>> from transformers import AutoModelForSeq2SeqLM + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_billsum_model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) +``` + +فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.' 
+``` + + +قسم النص وإرجع `input_ids` كتنسورات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_billsum_model") +>>> inputs = tokenizer(text, return_tensors="tf").input_ids +``` + +استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء التلخيص. لمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والمعلمات للتحكم في التوليد، راجع واجهة برمجة تطبيقات [توليد النص](../main_classes/text_generation). + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_billsum_model") +>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) +``` + +فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.' +``` + + \ No newline at end of file diff --git a/docs/source/ar/tasks/token_classification.md b/docs/source/ar/tasks/token_classification.md new file mode 100644 index 000000000000..e311482aeccb --- /dev/null +++ b/docs/source/ar/tasks/token_classification.md @@ -0,0 +1,550 @@ + + +# تصنيف الرموز(Token classification) + +[[open-in-colab]] + + + +يهدف تصنيف الرموز إلى إعطاء تسمية لكل رمز على حدة في الجملة. من أكثر مهام تصنيف الرموز شيوعًا هو التعرف على الكيانات المسماة (NER). يحاول NER تحديد تسمية لكل كيان في الجملة، مثل شخص، أو مكان، أو منظمة. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [WNUT 17](https://huggingface.co/datasets/wnut_17) للكشف عن كيانات جديدة. +2. استخدام نموذجك المضبوط بدقة للاستدلال. + + + +للاطلاع جميع البنى والنقاط المتوافقة مع هذه المهمة، نوصي بالرجوع من [صفحة المهمة](https://huggingface.co/tasks/token-classification). + + + +قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate seqeval +``` + +نحن نشجعك على تسجيل الدخول إلى حساب HuggingFace الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عندما يُطلب منك، أدخل رمزك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات WNUT 17 + +ابدأ بتحميل مجموعة بيانات WNUT 17 من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> wnut = load_dataset("wnut_17") +``` + +ثم ألق نظرة على مثال: + +```py +>>> wnut["train"][0] +{'id': '0', + 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0], + 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'] +} +``` + +يمثل كل رقم في `ner_tags` كياناً. حوّل الأرقام إلى أسماء التصنيفات لمعرفة ماهية الكيانات: + +```py +>>> label_list = wnut["train"].features[f"ner_tags"].feature.names +>>> label_list +[ + "O", + "B-corporation", + "I-corporation", + "B-creative-work", + "I-creative-work", + "B-group", + "I-group", + "B-location", + "I-location", + "B-person", + "I-person", + "B-product", + "I-product", +] +``` + +يشير الحرف الذي يسبق كل `ner_tag` إلى موضع الرمز للكيان: + +- `B-` يشير إلى بداية الكيان. 
+- `I-` يشير إلى أن الرمز يقع ضمن نفس الكيان (على سبيل المثال، الرمز `State` هو جزء من كيان مثل `Empire State Building`). +- `0` يشير إلى أن الرمز لا يمثل أي كيان. + +## المعالجة المسبقة(Preprocess) + + + +الخطوة التالية هي تحميل مُجزِّئ النصوص DistilBERT للمعالجة المسبقة لحقل `tokens`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +``` + +كما رأيت في حقل `tokens` المثال أعلاه، يبدو أن المدخل قد تم تحليله بالفعل. لكن المدخل لم يُجزأ بعد ويتعيّن عليك ضبط `is_split_into_words=True` لتقسيم الكلمات إلى كلمات فرعية. على سبيل المثال: + +```py +>>> example = wnut["train"][0] +>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True) +>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"]) +>>> tokens +['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]'] +``` + +ومع ذلك، يضيف هذا بعض الرموز الخاصة `[CLS]` و`[SEP]` وتقسيم الكلمات إلى أجزاء يُنشئ عدم تطابق بين المُدخلات والتسميات. قد يتم تقسيم كلمة واحدة تقابل تسمية واحدة الآن إلى كلمتين فرعيتين. ستحتاج إلى إعادة محاذاة الرموز والتسميات عن طريق: + +1. ربط كل رمز بالكلمة الأصلية باستخدام الخاصية [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids). +2. تعيين التسمية `-100` للرموز الخاصة `[CLS]` و`[SEP]` بحيث يتم تجاهلها بواسطة دالة الخسارة PyTorch (انظر [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)). +3. تسمية الرمز الأول فقط لكلمة معينة. قم بتعيين `-100` لأجزاء الكلمة الأخرى. + +هنا كيف يمكنك إنشاء وظيفة لإعادة محاذاة الرموز والتسميات، وقص الجمل لتتجاوز الحد الأقصى لطول مُدخلات DistilBERT: + +```py +>>> def tokenize_and_align_labels(examples): +... tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) + +... labels = [] +... for i, label in enumerate(examples[f"ner_tags"]): +... word_ids = tokenized_inputs.word_ids(batch_index=i) # تعيين الرموز إلى كلماتهم المقابلة. +... previous_word_idx = None +... label_ids = [] +... for word_idx in word_ids: # تعيين الرموز الخاصة إلى -100. +... if word_idx is None: +... label_ids.append(-100) +... elif word_idx != previous_word_idx: # تسمية الرمز الأول فقط لكلمة معينة. +... label_ids.append(label[word_idx]) +... else: +... label_ids.append(-100) +... previous_word_idx = word_idx +... labels.append(label_ids) + +... tokenized_inputs["labels"] = labels +... return tokenized_inputs +``` + +لتطبيق هذه العملية على كامل مجموعة البيانات، استخدم الدالة [`~datasets.Dataset.map`] لمجموعة بيانات 🤗. يمكنك تسريع الدالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +>>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) +``` + +الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorWithPadding`].من الأفضل استخدام *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بالكامل إلى الطول الأقصى. 
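+
+المقتطف التالي مثال توضيحي اختياري (بافتراض أنك أنشأت `tokenized_wnut` كما في الأعلى، ومع الإعداد الافتراضي الذي يُعيد تنسورات PyTorch) يوضّح سلوك الحشو الديناميكي في [`DataCollatorForTokenClassification`] المُستخدم في الشيفرة أدناه: تُحشى `input_ids` برمز الحشو الخاص بالمُجزِّئ، بينما تُحشى `labels` بالقيمة `-100` حتى تتجاهلها دالة الخسارة:
+
+```py
+>>> from transformers import DataCollatorForTokenClassification
+
+>>> collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
+>>> # نأخذ مثالين بطولين مختلفين ونُبقي فقط على الأعمدة التي يحتاجها المُجمِّع
+>>> features = [
+...     {k: tokenized_wnut["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
+...     for i in range(2)
+... ]
+>>> batch = collator(features)
+>>> batch["input_ids"].shape  # الدفعة محشوّة إلى طول أطول مثال فيها فقط
+>>> batch["labels"][0]  # مواضع الحشو تحمل القيمة -100 فتتجاهلها دالة الخسارة
+```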
+ + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) +``` + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## التقييم(Evaluate) + +يُعدّ تضمين مقياس أثناء التدريب مفيدًا في تقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة مع مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل إطار [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) (انظر جولة 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) لمعرفة المزيد حول كيفية تحميل وحساب مقياس). يُخرج seqeval عدة نتائج: الدقة، والاستذكار، ومقياس F1، والدقة. + +```py +>>> import evaluate + +>>> seqeval = evaluate.load("seqeval") +``` + +احصل على تسميات الكيانات المسماة (NER) أولاً،ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك الصحيحة إلى [`~evaluate.EvaluationModule.compute`] لحساب النتائج: + +```py +>>> import numpy as np + +>>> labels = [label_list[i] for i in example[f"ner_tags"]] + +>>> def compute_metrics(p): +... predictions, labels = p +... predictions = np.argmax(predictions, axis=2) + +... true_predictions = [ +... [label_list[p] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] +... true_labels = [ +... [label_list[l] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] + +... results = seqeval.compute(predictions=true_predictions, references=true_labels) +... return { +... "precision": results["overall_precision"], +... "recall": results["overall_recall"], +... "f1": results["overall_f1"], +... "accuracy": results["overall_accuracy"], +... } +``` + +دالة `compute_metrics` جاهزة للاستخدام، وستحتاج إليها عند إعداد التدريب. + +## التدريب(Train) + +قبل تدريب النموذج، جهّز خريطة تربط بين المعرّفات المتوقعة وتسمياتها باستخدام `id2label` و `label2id`: + +```py +>>> id2label = { +... 0: "O", +... 1: "B-corporation", +... 2: "I-corporation", +... 3: "B-creative-work", +... 4: "I-creative-work", +... 5: "B-group", +... 6: "I-group", +... 7: "B-location", +... 8: "I-location", +... 9: "B-person", +... 10: "I-person", +... 11: "B-product", +... 12: "I-product", +... } +>>> label2id = { +... "O": 0, +... "B-corporation": 1, +... "I-corporation": 2, +... "B-creative-work": 3, +... "I-creative-work": 4, +... "B-group": 5, +... "I-group": 6, +... "B-location": 7, +... "I-location": 8, +... "B-person": 9, +... "I-person": 10, +... "B-product": 11, +... "I-product": 12, +... } +``` + + + + + +إذا لم تكن على دراية بتعديل نموذج باستخدام [`Trainer`], ألق نظرة على الدليل التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilBERT مع [`AutoModelForTokenClassification`] إلى جانب عدد التصنيفات المتوقعة، وخريطة التسميات: + +```py +>>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer + +>>> model = AutoModelForTokenClassification.from_pretrained( +... "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +في هذه المرحلة، هناك ثلاث خطوات فقط متبقية: + +1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك). 
في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم درجات seqeval وحفظ تسخة التدريب. +2. قم بتمرير معاملات التدريب إلى [`Trainer`] إلى جانب النموذج، ومجموعة البيانات، والمُجزِّئ اللغوي، و`data collator`، ودالة `compute_metrics`. +3.استدعِ [`~Trainer.train`] لتدريب نموذجك. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_wnut_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... eval_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_wnut["train"], +... eval_dataset=tokenized_wnut["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك على Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن على دراية بتعديل نموذج باستخدام Keras، ألق نظرة على الدليل التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! + + +للتعديل على نموذج في TensorFlow، ابدأ بإعداد دالة محسن، وجدول معدل التعلم، وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 3 +>>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=2e-5, +... num_train_steps=num_train_steps, +... weight_decay_rate=0.01, +... num_warmup_steps=0, +... ) +``` + +ثم يمكنك تحميل DistilBERT مع [`TFAutoModelForTokenClassification`] إلى جانب عدد التسميات المتوقعة، وتخطيطات التسميات: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained( +... "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +قم بتحويل مجموعات بياناتك إلى تنسيق `tf.data.Dataset` مع [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_wnut["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_wnut["validation"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +هيّئ النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن نماذج Transformers تتضمن دالة خسارة افتراضية مرتبطة بالمهمة، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +آخر أمرين يجب إعدادهما قبل بدء التدريب هو حساب درجات seqeval من التنبؤات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم ذلك باستخدام [Keras callbacks](../main_classes/keras_callbacks). + +مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +حدد مكان دفع نموذجك والمحلل اللغوي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... 
output_dir="my_awesome_wnut_model", +... tokenizer=tokenizer, +... ) +``` + +ثم جمّع callbacks الخاصة بك معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز الآن لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع بيانات التدريب والتحقق، وعدد الحقبات، وcallbacks لتعديل النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تفصيلاً حول كيفية تعديل نموذج لتصنيف الرموز، ألق نظرة على الدفتر المقابل +[دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) +أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). + + + +## الاستدلال(Inference) + +رائع، الآن بعد أن قمت بتعديل نموذج، يمكنك استخدامه للاستدلال! + +احصل على بعض النصوص التي تريد تشغيل الاستدلال عليها: + +```py +>>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco." +``` + +أبسط طريقة لتجربة نموذجك المُدرب مسبقًا للاستدلال هي استخدامه في [`pipeline`]. قم بتنفيذ `pipeline` لتصنيف الكيانات المسماة مع نموذجك، ومرر نصك إليه: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model") +>>> classifier(text) +[{'entity': 'B-location', + 'score': 0.42658573, + 'index': 2, + 'word': 'golden', + 'start': 4, + 'end': 10}, + {'entity': 'I-location', + 'score': 0.35856336, + 'index': 3, + 'word': 'state', + 'start': 11, + 'end': 16}, + {'entity': 'B-group', + 'score': 0.3064001, + 'index': 4, + 'word': 'warriors', + 'start': 17, + 'end': 25}, + {'entity': 'B-location', + 'score': 0.65523505, + 'index': 13, + 'word': 'san', + 'start': 80, + 'end': 83}, + {'entity': 'B-location', + 'score': 0.4668663, + 'index': 14, + 'word': 'francisco', + 'start': 84, + 'end': 93}] +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + +قسّم النص إلى رموز وأرجع المُوتّرات بلغة PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +مرر مدخلاتك إلى النموذج واحصل على `logits`: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> with torch.no_grad(): +... 
logits = model(**inputs).logits +``` + +استخرج الفئة ذات الاحتمالية الأعلى، واستخدم جدول `id2label` الخاصة بالنموذج لتحويلها إلى تسمية نصية: + +```py +>>> predictions = torch.argmax(logits, dim=2) +>>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + +قسّم النص إلى رموز وأرجع المُوتّرات ب TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +مرر مدخلاتك إلى النموذج واحصل على `logits`: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> logits = model(**inputs).logits +``` + +استخرج الفئة ذات الاحتمالية الأعلى، واستخدم جدول `id2label` الخاصة بالنموذج لتحويلها إلى تسمية نصية: + +```py +>>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1) +>>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + diff --git a/docs/source/ar/tasks/translation.md b/docs/source/ar/tasks/translation.md new file mode 100644 index 000000000000..6245b903c22d --- /dev/null +++ b/docs/source/ar/tasks/translation.md @@ -0,0 +1,407 @@ + + +# الترجمة(Translation) + +[[open-in-colab]] + + + +الترجمة هي عملية تحويل سلسلة نصية من لغة إلى أخرى. وهي إحدى المهام التي يمكن صياغتها كمسألة تسلسل إلى تسلسل، وهو إطار عمل قوي لإنتاج مخرجات من مدخلات، مثل الترجمة أو التلخيص. تُستخدم أنظمة الترجمة عادةً للترجمة بين نصوص لغات مختلفة، ويمكن استخدامها أيضًا لترجمة الكلام أو لمهام تجمع بين النصوص والكلام، مثل تحويل النص إلى كلام أو تحويل الكلام إلى نص. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط دقيق لنموذج [T5](https://huggingface.co/google-t5/t5-small) على المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) لترجمة النص الإنجليزي إلى الفرنسية. +2. استخدام النموذج المضبوط بدقة للاستدلال. + + + +لمشاهدة جميع البنى والنسخ المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/translation). + + + +قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate sacrebleu +``` + +نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. 
عند الطلب، أدخل الرمز المميز الخاص بك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات OPUS Books + +ابدأ بتحميل المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> books = load_dataset("opus_books", "en-fr") +``` + +قسّم مجموعة البيانات إلى مجموعة تدريب ومجموعة اختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]: + +```py +>>> books = books["train"].train_test_split(test_size=0.2) +``` + +ثم ألقِ نظرة على مثال: + +```py +>>> books["train"][0] +{'id': '90560', + 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.', + 'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}} +``` + +`translation`: ترجمة إنجليزية وفرنسية للنص. + +## المعالجة المسبقة(Preprocess) + + + +الخطوة التالية هي تحميل مُجزئ T5 لمعالجة أزواج اللغة الإنجليزية-الفرنسية: + +```py +>>> from transformers import AutoTokenizer + +>>> checkpoint = "google-t5/t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +``` + +يجب أن تقوم دالة المعالجة المسبقة التي تُريد إنشاءها بما يلي: + +1. إضافة بادئة إلى المُدخل بمُوجه حتى يعرف T5 أن هذه مهمة ترجمة. تتطلب بعض النماذج القادرة على أداء مهام متعددة توجيهًا لمهام مُحددة. +2. تعيين اللغة الهدف (الفرنسية) في معامل `text_target` لضمان معالجة المُجزئ للنص بشكل صحيح. إذا لم تُعيّن `text_target`، فسيُعالج المُجزئ النص على أنه إنجليزي. +3. اقتطاع التسلسلات بحيث لا يزيد طولها عن الحد الأقصى الذي يحدده معامل `max_length`. + +```py +>>> source_lang = "en" +>>> target_lang = "fr" +>>> prefix = "translate English to French: " + +>>> def preprocess_function(examples): +... inputs = [prefix + example[source_lang] for example in examples["translation"]] +... targets = [example[target_lang] for example in examples["translation"]] +... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) +... return model_inputs +``` + +لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] من 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +>>> tokenized_books = books.map(preprocess_function, batched=True) +``` + +الآن أنشئ دفعة من الأمثلة باستخدام [`DataCollatorForSeq2Seq`]. من الأكثر كفاءة *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول. + + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) +``` + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") +``` + + + +## التقييم (Evaluate) + +غالباً ما يكون تضمين مقياس أثناء التدريب مفيداً لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). 
لهذه المهمة، حمّل مقياس [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) (راجع [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل وحساب مقياس): + +```py +>>> import evaluate + +>>> metric = evaluate.load("sacrebleu") +``` + +ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك إلى [`~evaluate.EvaluationModule.compute`] لحساب درجة SacreBLEU: + +```py +>>> import numpy as np + +>>> def postprocess_text(preds, labels): +... preds = [pred.strip() for pred in preds] +... labels = [[label.strip()] for label in labels] + +... return preds, labels + +>>> def compute_metrics(eval_preds): +... preds, labels = eval_preds +... if isinstance(preds, tuple): +... preds = preds[0] +... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + +... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) +... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + +... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + +... result = metric.compute(predictions=decoded_preds, references=decoded_labels) +... result = {"bleu": result["score"]} + +... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] +... result["gen_len"] = np.mean(prediction_lens) +... result = {k: round(v, 4) for k, v in result.items()} +... return result +``` + +دالة `compute_metrics` الخاصة بك جاهزة الآن، وسوف تعود إليها عند إعداد التدريب. + +## التدريب (Train) + + + + + + +إذا لم تكن معتادًا على ضبط دقيق نموذج باستخدام [`Trainer`], فألقِ نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت جاهز لبدء تدريب نموذجك الآن! حمّل T5 باستخدام [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد مُعاملات للتدريب في [`Seq2SeqTrainingArguments`]. المُعامل الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ النموذج الخاص بك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم مقياس SacreBLEU وحفظ نقطة تدقيق التدريب. +2. مرر مُعاملات التدريب إلى [`Seq2SeqTrainer`] جنبًا إلى جنب مع النموذج ومجموعة البيانات والمعالج اللغوي وجامع البيانات ووظيفة `compute_metrics`. +3. نفّذ [`~Trainer.train`] لضبط نموذجك. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="my_awesome_opus_books_model", +... eval_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=2, +... predict_with_generate=True, +... fp16=True, #change to bf16=True for XPU +... push_to_hub=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_books["train"], +... eval_dataset=tokenized_books["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! 
+ + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل تعلم وبعض المعلمات الفائقة للتدريب: + +```py +>>> from transformers import AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +ثم يمكنك تحميل T5 باستخدام [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_books["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... tokenized_books["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +آخر شيئين يجب إعدادهما قبل بدء التدريب هما حساب مقياس SacreBLEU من التوقعات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [استدعاءات Keras](../main_classes/keras_callbacks). + +مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) +``` + +حدد مكان دفع نموذجك ومعالجك اللغوي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_opus_books_model", +... tokenizer=tokenizer, +... ) +``` + +ثم اجمع استدعاءاتك معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز لبدء تدريب نموذجك! اتصل بـ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب واستدعاءاتك لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تعمقًا لكيفية ضبط نموذج للترجمة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb) المقابل +أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). + + + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! + +أحضر بعض النصوص التي ترغب في ترجمتها إلى لغة أخرى. بالنسبة لـ T5، تحتاج إلى إضافة بادئة إلى مدخلاتك اعتمادًا على المهمة التي تعمل عليها. للترجمة من الإنجليزية إلى الفرنسية، يجب عليك إضافة بادئة إلى مدخلاتك كما هو موضح أدناه: + +```py +>>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria." +``` + +أبسط طريقة لتجربة نموذجك المضبوط للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء مثيل لـ `pipeline` للترجمة باستخدام نموذجك، ومرر النص الخاص بك إليه: + +```py +>>> from transformers import pipeline + +# تغيير `xx` إلى لغة الإدخال و `yy` إلى لغة المخرجات المطلوبة. 
+# أمثلة: "en" للغة الإنجليزية، "fr" للغة الفرنسية، "de" للغة الألمانية، "es" للغة الإسبانية، "zh" للغة الصينية، إلخ؛ translation_en_to_fr تترجم من الإنجليزية إلى الفرنسية +# يمكنك عرض جميع قوائم اللغات هنا - https://huggingface.co/languages +>>> translator = pipeline("translation_xx_to_yy", model="username/my_awesome_opus_books_model") +>>> translator(text) +[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}] +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + +قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model") +>>> inputs = tokenizer(text, return_tensors="pt").input_ids +``` + +استخدم الدالة [`~generation.GenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation). + +```py +>>> from transformers import AutoModelForSeq2SeqLM + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model") +>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) +``` + +فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lignées partagent des ressources avec des bactéries enfixant l'azote.' +``` + + +قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model") +>>> inputs = tokenizer(text, return_tensors="tf").input_ids +``` + +استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation). + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model") +>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) +``` + +فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.' +``` + + \ No newline at end of file diff --git a/docs/source/ar/tiktoken.md b/docs/source/ar/tiktoken.md new file mode 100644 index 000000000000..6f3755d8670c --- /dev/null +++ b/docs/source/ar/tiktoken.md @@ -0,0 +1,41 @@ +# Tiktoken والتفاعل مع Transformers + +يتم دمج دعم ملفات نموذج tiktoken بسلاسة في 🤗 transformers عند تحميل النماذج +`from_pretrained` مع ملف `tokenizer.model` tiktoken على Hub، والذي يتم تحويله تلقائيًا إلى [المحلل اللغوي السريع](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast). + +### النماذج المعروفة التي تم إصدارها مع `tiktoken.model`: + - gpt2 + - llama3 + +## مثال على الاستخدام + +من أجل تحميل ملفات `tiktoken` في `transformers`، تأكد من أن ملف `tokenizer.model` هو ملف tiktoken وسيتم تحميله تلقائيًا عند التحميل `from_pretrained`. 
إليك كيفية تحميل مجزىء لغوي ونموذج، والذي +يمكن تحميله من نفس الملف بالضبط: + +```py +from transformers import AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") +``` +## إنشاء مجزىء لغوي tiktoken + +لا يحتوي ملف `tokenizer.model` على أي معلومات حول الرموز أو الأنماط الإضافية. إذا كانت هذه الأمور مهمة، قم بتحويل المحلل اللغوي إلى `tokenizer.json`، وهو التنسيق المناسب لـ [`PreTrainedTokenizerFast`]. + +قم بتوليد ملف `tokenizer.model` باستخدام [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) ثم قم بتحويله إلى `tokenizer.json` باستخدام [`convert_tiktoken_to_fast`]. + +```py + +from transformers.integrations.tiktoken import convert_tiktoken_to_fast +from tiktoken import get_encoding + +# يمكنك تحميل ترميزك المخصص أو الترميز الذي توفره OpenAI +encoding = get_encoding("gpt2") +convert_tiktoken_to_fast(encoding, "config/save/dir") +``` + +يتم حفظ ملف `tokenizer.json` الناتج في الدليل المحدد ويمكن تحميله باستخدام [`PreTrainedTokenizerFast`]. + +```py +tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") +``` diff --git a/docs/source/de/installation.md b/docs/source/de/installation.md index 1bd34f73302b..44b6f1ed981e 100644 --- a/docs/source/de/installation.md +++ b/docs/source/de/installation.md @@ -149,7 +149,7 @@ conda install conda-forge::transformers Vorgefertigte Modelle werden heruntergeladen und lokal zwischengespeichert unter: `~/.cache/huggingface/hub`. Dies ist das Standardverzeichnis, das durch die Shell-Umgebungsvariable "TRANSFORMERS_CACHE" vorgegeben ist. Unter Windows wird das Standardverzeichnis durch `C:\Benutzer\Benutzername\.cache\huggingface\hub` angegeben. Sie können die unten aufgeführten Shell-Umgebungsvariablen - in der Reihenfolge ihrer Priorität - ändern, um ein anderes Cache-Verzeichnis anzugeben: -1. Shell-Umgebungsvariable (Standard): `HUGGINGFACE_HUB_CACHE` oder `TRANSFORMERS_CACHE`. +1. Shell-Umgebungsvariable (Standard): `HF_HUB_CACHE` oder `TRANSFORMERS_CACHE`. 2. Shell-Umgebungsvariable: `HF_HOME`. 3. Shell-Umgebungsvariable: `XDG_CACHE_HOME` + `/huggingface`. diff --git a/docs/source/de/quicktour.md b/docs/source/de/quicktour.md index 01cd7200750c..c01609207fec 100644 --- a/docs/source/de/quicktour.md +++ b/docs/source/de/quicktour.md @@ -109,7 +109,7 @@ label: NEGATIVE, with score: 0.5309 Die [`pipeline`] kann auch über einen ganzen Datensatz iterieren. Starten wir mit der Installation der [🤗 Datasets](https://huggingface.co/docs/datasets/) Bibliothek: ```bash -pip install datasets +pip install datasets ``` Erstellen wir eine [`pipeline`] mit der Aufgabe die wir lösen und dem Modell welches wir nutzen möchten. @@ -191,7 +191,7 @@ Wenn Sie kein Modell für Ihren Anwendungsfall finden können, müssen Sie ein v -Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen. +Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. 
Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen. Kehren wir zu unserem Beispiel zurück und sehen wir uns an, wie Sie die `AutoClass` verwenden können, um die Ergebnisse der [`pipeline`] zu replizieren. @@ -281,7 +281,7 @@ Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Model ``` Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: - + ```py >>> from torch import nn @@ -308,7 +308,7 @@ In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klass Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben, indem Sie die Wörterbuchschlüssel direkt an die Tensoren übergeben: - + ```py >>> tf_outputs = tf_model(tf_batch) ``` @@ -383,8 +383,8 @@ Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell ```py >>> from transformers import AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) ``` @@ -392,8 +392,8 @@ Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell ```py >>> from transformers import TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) ``` diff --git a/docs/source/en/_config.py b/docs/source/en/_config.py index 4381def017dd..f49e4e473196 100644 --- a/docs/source/en/_config.py +++ b/docs/source/en/_config.py @@ -11,4 +11,4 @@ "{processor_class}": "FakeProcessorClass", "{model_class}": "FakeModelClass", "{object_class}": "FakeObjectClass", -} \ No newline at end of file +} diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ca7ee4557fee..34aacd0796a3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -139,8 +139,6 @@ title: Export to TFLite - local: torchscript title: Export to TorchScript - - local: benchmarks - title: Benchmarks - local: notebooks title: Notebooks with examples - local: community @@ -167,10 +165,14 @@ title: AWQ - local: quantization/aqlm title: AQLM + - local: quantization/vptq + title: VPTQ - local: quantization/quanto title: Quanto - local: quantization/eetq title: EETQ + - local: quantization/higgs + title: HIGGS - local: quantization/hqq title: HQQ - local: quantization/fbgemm_fp8 @@ -322,6 +324,8 @@ sections: - local: model_doc/albert title: ALBERT + - local: model_doc/bamba + title: Bamba - local: model_doc/bart title: BART - local: model_doc/barthez @@ -362,6 +366,8 @@ title: CodeLlama - local: model_doc/cohere title: Cohere + - local: model_doc/cohere2 + title: Cohere2 - local: model_doc/convbert title: ConvBERT - local: model_doc/cpm @@ -378,6 +384,8 @@ title: DeBERTa-v2 - local: model_doc/dialogpt title: DialoGPT + - local: model_doc/diffllama + title: DiffLlama - local: 
model_doc/distilbert title: DistilBERT - local: model_doc/dpr @@ -394,10 +402,10 @@ title: ESM - local: model_doc/falcon title: Falcon + - local: model_doc/falcon3 + title: Falcon3 - local: model_doc/falcon_mamba title: FalconMamba - - local: model_doc/fastspeech2_conformer - title: FastSpeech2Conformer - local: model_doc/flan-t5 title: FLAN-T5 - local: model_doc/flan-ul2 @@ -440,6 +448,10 @@ title: Granite - local: model_doc/granitemoe title: GraniteMoe + - local: model_doc/granitevision + title: GraniteVision + - local: model_doc/helium + title: Helium - local: model_doc/herbert title: HerBERT - local: model_doc/ibert @@ -492,6 +504,8 @@ title: mLUKE - local: model_doc/mobilebert title: MobileBERT + - local: model_doc/modernbert + title: ModernBert - local: model_doc/mpnet title: MPNet - local: model_doc/mpt @@ -516,8 +530,8 @@ title: Nyströmformer - local: model_doc/olmo title: OLMo - - local: model_doc/olmo_1124 - title: OLMo November 2024 + - local: model_doc/olmo2 + title: OLMo2 - local: model_doc/olmoe title: OLMoE - local: model_doc/open-llama @@ -643,6 +657,8 @@ title: DiNAT - local: model_doc/dinov2 title: DINOV2 + - local: model_doc/dinov2_with_registers + title: DINOv2 with Registers - local: model_doc/dit title: DiT - local: model_doc/dpt @@ -657,6 +673,8 @@ title: GLPN - local: model_doc/hiera title: Hiera + - local: model_doc/ijepa + title: I-JEPA - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit @@ -691,6 +709,8 @@ title: SegFormer - local: model_doc/seggpt title: SegGpt + - local: model_doc/superglue + title: SuperGlue - local: model_doc/superpoint title: SuperPoint - local: model_doc/swiftformer @@ -703,6 +723,10 @@ title: Swin2SR - local: model_doc/table-transformer title: Table Transformer + - local: model_doc/textnet + title: TextNet + - local: model_doc/timm_wrapper + title: Timm Wrapper - local: model_doc/upernet title: UperNet - local: model_doc/van @@ -719,6 +743,8 @@ title: ViTMatte - local: model_doc/vit_msn title: ViTMSN + - local: model_doc/vitpose + title: ViTPose - local: model_doc/yolos title: YOLOS - local: model_doc/zoedepth @@ -736,8 +762,8 @@ title: dac - local: model_doc/encodec title: EnCodec - - local: model_doc/hiera - title: Hiera + - local: model_doc/fastspeech2_conformer + title: FastSpeech2Conformer - local: model_doc/hubert title: Hubert - local: model_doc/mctct @@ -746,6 +772,8 @@ title: Mimi - local: model_doc/mms title: MMS + - local: model_doc/moonshine + title: Moonshine - local: model_doc/moshi title: Moshi - local: model_doc/musicgen @@ -808,6 +836,8 @@ title: ALIGN - local: model_doc/altclip title: AltCLIP + - local: model_doc/aria + title: Aria - local: model_doc/blip title: BLIP - local: model_doc/blip-2 @@ -826,12 +856,16 @@ title: CLIPSeg - local: model_doc/clvp title: CLVP + - local: model_doc/colpali + title: ColPali - local: model_doc/data2vec title: Data2Vec - local: model_doc/deplot title: DePlot - local: model_doc/donut title: Donut + - local: model_doc/emu3 + title: Emu3 - local: model_doc/flava title: FLAVA - local: model_doc/git @@ -896,6 +930,8 @@ title: Pix2Struct - local: model_doc/pixtral title: Pixtral + - local: model_doc/qwen2_5_vl + title: Qwen2.5-VL - local: model_doc/qwen2_audio title: Qwen2Audio - local: model_doc/qwen2_vl diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md index 1e5b95e9b48c..e8234c565b26 100644 --- a/docs/source/en/add_new_pipeline.md +++ b/docs/source/en/add_new_pipeline.md @@ -184,7 +184,7 @@ class PairClassificationPipeline(Pipeline): ``` 
The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in -a file named `pair_classification.py`, we can then import it and register it like this: +a file named `pair_classification.py`, we can then import it and register it like this. ```py from pair_classification import PairClassificationPipeline @@ -199,6 +199,22 @@ PIPELINE_REGISTRY.register_pipeline( ) ``` +The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5fb0164e89d4998e5776897c16f7330d3df/src/transformers/pipelines/base.py#L1387) function registers the pipeline details (task type, pipeline class, supported backends) to a models `config.json` file. + +```json + "custom_pipelines": { + "pair-classification": { + "impl": "pair_classification.PairClassificationPipeline", + "pt": [ + "AutoModelForSequenceClassification" + ], + "tf": [ + "TFAutoModelForSequenceClassification" + ], + } + }, +``` + Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index 721e348f89fe..56c9184980f4 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -225,7 +225,7 @@ You have access to the following tools: To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. -Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence. +Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence. During each intermediate step, you can use 'print()' to save whatever important information you will then need. These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md index e80e402d7374..eb5149d2faa3 100644 --- a/docs/source/en/agents_advanced.md +++ b/docs/source/en/agents_advanced.md @@ -162,7 +162,7 @@ agent.run( improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background" Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt. 
->>> Agent is executing the code below: +=== Agent is executing the code below: image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background") final_answer(image) ``` @@ -211,7 +211,7 @@ agent.run("How many more blocks (also denoted as layers) are in BERT base encode ## Display your agent run in a cool Gradio interface -You can leverage `gradio.Chatbot`to display your agent's thoughts using `stream_to_gradio`, here is an example: +You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`, here is an example: ```py import gradio as gr diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md index 0f02f19ed295..33f48b2b043f 100644 --- a/docs/source/en/autoclass_tutorial.md +++ b/docs/source/en/autoclass_tutorial.md @@ -138,12 +138,15 @@ Load a processor with [`AutoProcessor.from_pretrained`]: -The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]: +The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]. + +> [!WARNING] +> By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. ```py >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") ``` Easily reuse the same checkpoint to load an architecture for a different task: @@ -151,7 +154,7 @@ Easily reuse the same checkpoint to load an architecture for a different task: ```py >>> from transformers import AutoModelForTokenClassification ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") ``` diff --git a/docs/source/en/benchmarks.md b/docs/source/en/benchmarks.md deleted file mode 100644 index c61a21bb532c..000000000000 --- a/docs/source/en/benchmarks.md +++ /dev/null @@ -1,387 +0,0 @@ - - -# Benchmarks - - - -Hugging Face's Benchmarking tools are deprecated and it is advised to use external Benchmarking libraries to measure the speed -and memory complexity of Transformer models. - - - -[[open-in-colab]] - -Let's take a look at how 🤗 Transformers models can be benchmarked, best practices, and already available benchmarks. - -A notebook explaining in more detail how to benchmark 🤗 Transformers models can be found [here](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb). - -## How to benchmark 🤗 Transformers models - -The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformers models. 
The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_. - - - -Here, _inference_ is defined by a single forward pass, and _training_ is defined by a single forward pass and -backward pass. - - - -The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an object of type [`PyTorchBenchmarkArguments`] and -[`TensorFlowBenchmarkArguments`], respectively, for instantiation. [`PyTorchBenchmarkArguments`] and [`TensorFlowBenchmarkArguments`] are data classes and contain all relevant configurations for their corresponding benchmark class. In the following example, it is shown how a BERT model of type _bert-base-cased_ can be benchmarked. - - - -```py ->>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments - ->>> args = PyTorchBenchmarkArguments(models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) ->>> benchmark = PyTorchBenchmark(args) -``` - - -```py ->>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments - ->>> args = TensorFlowBenchmarkArguments( -... models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... ) ->>> benchmark = TensorFlowBenchmark(args) -``` - - - -Here, three arguments are given to the benchmark argument data classes, namely `models`, `batch_sizes`, and -`sequence_lengths`. The argument `models` is required and expects a `list` of model identifiers from the -[model hub](https://huggingface.co/models) The `list` arguments `batch_sizes` and `sequence_lengths` define -the size of the `input_ids` on which the model is benchmarked. There are many more parameters that can be configured -via the benchmark argument data classes. For more detail on these one can either directly consult the files -`src/transformers/benchmark/benchmark_args_utils.py`, `src/transformers/benchmark/benchmark_args.py` (for PyTorch) -and `src/transformers/benchmark/benchmark_args_tf.py` (for Tensorflow). Alternatively, running the following shell -commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow -respectively. - - - -```bash -python examples/pytorch/benchmarking/run_benchmark.py --help -``` - -An instantiated benchmark object can then simply be run by calling `benchmark.run()`. 
- -```py ->>> results = benchmark.run() ->>> print(results) -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -google-bert/bert-base-uncased 8 8 0.006 -google-bert/bert-base-uncased 8 32 0.006 -google-bert/bert-base-uncased 8 128 0.018 -google-bert/bert-base-uncased 8 512 0.088 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -google-bert/bert-base-uncased 8 8 1227 -google-bert/bert-base-uncased 8 32 1281 -google-bert/bert-base-uncased 8 128 1307 -google-bert/bert-base-uncased 8 512 1539 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: PyTorch -- use_torchscript: False -- framework_version: 1.4.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 08:58:43.371351 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - -```bash -python examples/tensorflow/benchmarking/run_benchmark_tf.py --help -``` - -An instantiated benchmark object can then simply be run by calling `benchmark.run()`. 
- -```py ->>> results = benchmark.run() ->>> print(results) ->>> results = benchmark.run() ->>> print(results) -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -google-bert/bert-base-uncased 8 8 0.005 -google-bert/bert-base-uncased 8 32 0.008 -google-bert/bert-base-uncased 8 128 0.022 -google-bert/bert-base-uncased 8 512 0.105 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -google-bert/bert-base-uncased 8 8 1330 -google-bert/bert-base-uncased 8 32 1330 -google-bert/bert-base-uncased 8 128 1330 -google-bert/bert-base-uncased 8 512 1770 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: Tensorflow -- use_xla: False -- framework_version: 2.2.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:26:35.617317 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - - -By default, the _time_ and the _required memory_ for _inference_ are benchmarked. In the example output above the first -two sections show the result corresponding to _inference time_ and _inference memory_. In addition, all relevant -information about the computing environment, _e.g._ the GPU type, the system, the library versions, etc... are printed -out in the third section under _ENVIRONMENT INFORMATION_. This information can optionally be saved in a _.csv_ file -when adding the argument `save_to_csv=True` to [`PyTorchBenchmarkArguments`] and -[`TensorFlowBenchmarkArguments`] respectively. In this case, every section is saved in a separate -_.csv_ file. The path to each _.csv_ file can optionally be defined via the argument data classes. - -Instead of benchmarking pre-trained models via their model identifier, _e.g._ `google-bert/bert-base-uncased`, the user can -alternatively benchmark an arbitrary configuration of any available model class. In this case, a `list` of -configurations must be inserted with the benchmark args as follows. - - - -```py ->>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig - ->>> args = PyTorchBenchmarkArguments( -... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... 
) ->>> config_base = BertConfig() ->>> config_384_hid = BertConfig(hidden_size=384) ->>> config_6_lay = BertConfig(num_hidden_layers=6) - ->>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) ->>> benchmark.run() -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base 8 128 0.006 -bert-base 8 512 0.006 -bert-base 8 128 0.018 -bert-base 8 512 0.088 -bert-384-hid 8 8 0.006 -bert-384-hid 8 32 0.006 -bert-384-hid 8 128 0.011 -bert-384-hid 8 512 0.054 -bert-6-lay 8 8 0.003 -bert-6-lay 8 32 0.004 -bert-6-lay 8 128 0.009 -bert-6-lay 8 512 0.044 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base 8 8 1277 -bert-base 8 32 1281 -bert-base 8 128 1307 -bert-base 8 512 1539 -bert-384-hid 8 8 1005 -bert-384-hid 8 32 1027 -bert-384-hid 8 128 1035 -bert-384-hid 8 512 1255 -bert-6-lay 8 8 1097 -bert-6-lay 8 32 1101 -bert-6-lay 8 128 1127 -bert-6-lay 8 512 1359 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: PyTorch -- use_torchscript: False -- framework_version: 1.4.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:35:25.143267 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - -```py ->>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig - ->>> args = TensorFlowBenchmarkArguments( -... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] -... 
) ->>> config_base = BertConfig() ->>> config_384_hid = BertConfig(hidden_size=384) ->>> config_6_lay = BertConfig(num_hidden_layers=6) - ->>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) ->>> benchmark.run() -==================== INFERENCE - SPEED - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Time in s --------------------------------------------------------------------------------- -bert-base 8 8 0.005 -bert-base 8 32 0.008 -bert-base 8 128 0.022 -bert-base 8 512 0.106 -bert-384-hid 8 8 0.005 -bert-384-hid 8 32 0.007 -bert-384-hid 8 128 0.018 -bert-384-hid 8 512 0.064 -bert-6-lay 8 8 0.002 -bert-6-lay 8 32 0.003 -bert-6-lay 8 128 0.0011 -bert-6-lay 8 512 0.074 --------------------------------------------------------------------------------- - -==================== INFERENCE - MEMORY - RESULT ==================== --------------------------------------------------------------------------------- -Model Name Batch Size Seq Length Memory in MB --------------------------------------------------------------------------------- -bert-base 8 8 1330 -bert-base 8 32 1330 -bert-base 8 128 1330 -bert-base 8 512 1770 -bert-384-hid 8 8 1330 -bert-384-hid 8 32 1330 -bert-384-hid 8 128 1330 -bert-384-hid 8 512 1540 -bert-6-lay 8 8 1330 -bert-6-lay 8 32 1330 -bert-6-lay 8 128 1330 -bert-6-lay 8 512 1540 --------------------------------------------------------------------------------- - -==================== ENVIRONMENT INFORMATION ==================== - -- transformers_version: 2.11.0 -- framework: Tensorflow -- use_xla: False -- framework_version: 2.2.0 -- python_version: 3.6.10 -- system: Linux -- cpu: x86_64 -- architecture: 64bit -- date: 2020-06-29 -- time: 09:38:15.487125 -- fp16: False -- use_multiprocessing: True -- only_pretrain_model: False -- cpu_ram_mb: 32088 -- use_gpu: True -- num_gpus: 1 -- gpu: TITAN RTX -- gpu_ram_mb: 24217 -- gpu_power_watts: 280.0 -- gpu_performance_state: 2 -- use_tpu: False -``` - - - -Again, _inference time_ and _required memory_ for _inference_ are measured, but this time for customized configurations -of the `BertModel` class. This feature can especially be helpful when deciding for which configuration the model -should be trained. - - -## Benchmark best practices - -This section lists a couple of best practices one should be aware of when benchmarking a model. - -- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user - specifies on which device the code should be run by setting the `CUDA_VISIBLE_DEVICES` environment variable in the - shell, _e.g._ `export CUDA_VISIBLE_DEVICES=0` before running the code. -- The option `no_multi_processing` should only be set to `True` for testing and debugging. To ensure accurate - memory measurement it is recommended to run each memory benchmark in a separate process by making sure - `no_multi_processing` is set to `True`. -- One should always state the environment information when sharing the results of a model benchmark. Results can vary - heavily between different GPU devices, library versions, etc., as a consequence, benchmark results on their own are not very - useful for the community. 
- - -## Sharing your benchmark - -Previously all available core models (10 at the time) have been benchmarked for _inference time_, across many different -settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. All of those tests were -done across CPUs (except for TensorFlow XLA) and GPUs. - -The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) and the results are -available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). - -With the new _benchmark_ tools, it is easier than ever to share your benchmark results with the community - -- [PyTorch Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md). -- [TensorFlow Benchmarking Results](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md). diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index 1bdf05a26c8d..3581487e130f 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -23,8 +23,8 @@ of text (as is the case with a standard language model), the model instead conti of one or more **messages**, each of which includes a **role**, like "user" or "assistant", as well as message text. Much like tokenization, different models expect very different input formats for chat. This is the reason we added -**chat templates** as a feature. Chat templates are part of the tokenizer. They specify how to convert conversations, -represented as lists of messages, into a single tokenizable string in the format that the model expects. +**chat templates** as a feature. Chat templates are part of the tokenizer for text-only LLMs or processor for multimodal LLMs. They specify how to convert conversations, +represented as lists of messages, into a single tokenizable string in the format that the model expects. Let's make this concrete with a quick example using the `mistralai/Mistral-7B-Instruct-v0.1` model: @@ -39,11 +39,11 @@ Let's make this concrete with a quick example using the `mistralai/Mistral-7B-In ... ] >>> tokenizer.apply_chat_template(chat, tokenize=False) -"[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" +" [INST] Hello, how are you? [/INST] I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" ``` -Notice how the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of -user messages (but not assistant messages!), and the entire chat is condensed into a single string. +Notice how the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of +user messages (but not assistant messages!), and the entire chat is condensed into a single string. If we use `tokenize=True`, which is the default setting, that string will also be tokenized for us. Now, try the same code, but swap in the `HuggingFaceH4/zephyr-7b-beta` model instead, and you should get: @@ -59,17 +59,26 @@ I'd like to show off how chat templating works! Both Zephyr and Mistral-Instruct were fine-tuned from the same base model, `Mistral-7B-v0.1`. However, they were trained with totally different chat formats. 
Without chat templates, you would have to write manual formatting code for each -model, and it's very easy to make minor errors that hurt performance! Chat templates handle the details of formatting +model, and it's very easy to make minor errors that hurt performance! Chat templates handle the details of formatting for you, allowing you to write universal code that works for any model. + + +Chat templates are a critical component of our [chat CLI](quicktour#chat-with-text-generation-models). +You can apply the learnings of this guide there as well. + + + ## How do I use chat templates? As you can see in the example above, chat templates are easy to use. Simply build a list of messages, with `role` -and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] method. Once you do that, +and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] or [`~ProcessorMixin.apply_chat_template`] method +depending on what type of model you are using. Once you do that, you'll get output that's ready to go! When using chat templates as input for model generation, it's also a good idea -to use `add_generation_prompt=True` to add a [generation prompt](#what-are-generation-prompts). +to use `add_generation_prompt=True` to add a [generation prompt](#what-are-generation-prompts). +## Usage with text-only LLMs Here's an example of preparing input for `model.generate()`, using `Zephyr` again: ```python @@ -89,19 +98,19 @@ messages = [ tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") print(tokenizer.decode(tokenized_chat[0])) ``` -This will yield a string in the input format that Zephyr expects. +This will yield a string in the input format that Zephyr expects. ```text <|system|> -You are a friendly chatbot who always responds in the style of a pirate +You are a friendly chatbot who always responds in the style of a pirate <|user|> -How many helicopters can a human eat in one sitting? +How many helicopters can a human eat in one sitting? <|assistant|> ``` Now that our input is formatted correctly for Zephyr, we can use the model to generate a response to the user's question: ```python -outputs = model.generate(tokenized_chat, max_new_tokens=128) +outputs = model.generate(tokenized_chat, max_new_tokens=128) print(tokenizer.decode(outputs[0])) ``` @@ -109,20 +118,58 @@ This will yield: ```text <|system|> -You are a friendly chatbot who always responds in the style of a pirate +You are a friendly chatbot who always responds in the style of a pirate <|user|> -How many helicopters can a human eat in one sitting? +How many helicopters can a human eat in one sitting? <|assistant|> Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. ``` +## Usage with multimodal LLMs + +For multimodal LLMs such as [LLaVA](https://huggingface.co/llava-hf) the prompts can be formatted in a similar way. The only difference is you need to pass input images/videos as well along with the text. Each `"content"` +has to be a list containing either a text or an image/video. 
+ +Here's an example of preparing input for using `LLaVA` model: + +```python +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration + +model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" +model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id) # You may want to use bfloat16 and/or move to GPU here +processor = AutoProcessor.from_pretrained(model_id) + +messages = [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"type": "text", "text": "What are these?"}, + ], + }, +] + +processed_chat = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt") +print(processor.batch_decode(processed_chat["input_ids"][:, :30])) +``` +This yields a string in LLaVAs expected input format with many `` tokens at the end. +The `` tokens are placeholders and each one will be replaced by image embeddings when the mode is run in the forward call. The `processed_chat` can be further passed into [`~GenerationMixin.generate`] to generate text. +```text +'<|im_start|>system +You are a friendly chatbot who always responds in the style of a pirate<|im_end|><|im_start|>user ' +``` + Arr, 'twas easy after all! ## Is there an automated pipeline for chat? Yes, there is! Our text generation pipelines support chat inputs, which makes it easy to use chat models. In the past, we used to use a dedicated "ConversationalPipeline" class, but this has now been deprecated and its functionality -has been merged into the [`TextGenerationPipeline`]. Let's try the `Zephyr` example again, but this time using +has been merged into the [`TextGenerationPipeline`]. Let's try the `Zephyr` example again, but this time using a pipeline: ```python @@ -187,9 +234,9 @@ Can I ask a question?<|im_end|> ``` Note that this time, we've added the tokens that indicate the start of a bot response. This ensures that when the model -generates text it will write a bot response instead of doing something unexpected, like continuing the user's -message. Remember, chat models are still just language models - they're trained to continue text, and chat is just a -special kind of text to them! You need to guide them with appropriate control tokens, so they know what they're +generates text it will write a bot response instead of doing something unexpected, like continuing the user's +message. Remember, chat models are still just language models - they're trained to continue text, and chat is just a +special kind of text to them! You need to guide them with appropriate control tokens, so they know what they're supposed to be doing. Not all models require generation prompts. Some models, like LLaMA, don't have any @@ -201,7 +248,7 @@ effect that `add_generation_prompt` has will depend on the template being used. When passing a list of messages to `apply_chat_template` or `TextGenerationPipeline`, you can choose to format the chat so the model will continue the final message in the chat instead of starting a new one. This is done by removing any end-of-sequence tokens that indicate the end of the final message, so that the model will simply -extend the final message when it begins to generate text. This is useful for "prefilling" the model's response. +extend the final message when it begins to generate text. 
This is useful for "prefilling" the model's response. Here's an example: @@ -226,9 +273,9 @@ get an error if you try! The default behaviour of `TextGenerationPipeline` is to set `add_generation_prompt=True` so that it starts a new -message. However, if the final message in the input chat has the "assistant" role, it will assume that this message is -a prefill and switch to `continue_final_message=True` instead, because most models do not support multiple -consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_final_message` +message. However, if the final message in the input chat has the "assistant" role, it will assume that this message is +a prefill and switch to `continue_final_message=True` instead, because most models do not support multiple +consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_final_message` argument when calling the pipeline. @@ -237,8 +284,8 @@ argument when calling the pipeline. Yes! This is a good way to ensure that the chat template matches the tokens the model sees during training. We recommend that you apply the chat template as a preprocessing step for your dataset. After this, you -can simply continue like any other language model training task. When training, you should usually set -`add_generation_prompt=False`, because the added tokens to prompt an assistant response will not be helpful during +can simply continue like any other language model training task. When training, you should usually set +`add_generation_prompt=False`, because the added tokens to prompt an assistant response will not be helpful during training. Let's see an example: ```python @@ -272,8 +319,8 @@ From here, just continue training like you would with a standard language modell -By default, some tokenizers add special tokens like `` and `` to text they tokenize. Chat templates should -already include all the special tokens they need, and so additional special tokens will often be incorrect or +By default, some tokenizers add special tokens like `` and `` to text they tokenize. Chat templates should +already include all the special tokens they need, and so additional special tokens will often be incorrect or duplicated, which will hurt model performance. Therefore, if you format text with `apply_chat_template(tokenize=False)`, you should set the argument @@ -286,7 +333,7 @@ Therefore, if you format text with `apply_chat_template(tokenize=False)`, you sh The only argument that `apply_chat_template` requires is `messages`. However, you can pass any keyword argument to `apply_chat_template` and it will be accessible inside the template. This gives you a lot of freedom to use chat templates for many things. There are no restrictions on the names or the format of these arguments - you can pass -strings, lists, dicts or whatever else you want. +strings, lists, dicts or whatever else you want. That said, there are some common use-cases for these extra arguments, such as passing tools for function calling, or documents for retrieval-augmented generation. In these common cases, @@ -309,7 +356,7 @@ def current_time(): def multiply(a: float, b: float): """ A function that multiplies two numbers - + Args: a: The first number to multiply b: The second number to multiply @@ -329,8 +376,8 @@ correctly as tools. 
Specifically, you should follow these rules: - The function should have a descriptive name - Every argument must have a type hint -- The function must have a docstring in the standard Google style (in other words, an initial function description - followed by an `Args:` block that describes the arguments, unless the function does not have any arguments. +- The function must have a docstring in the standard Google style (in other words, an initial function description + followed by an `Args:` block that describes the arguments, unless the function does not have any arguments. - Do not include types in the `Args:` block. In other words, write `a: The first number to multiply`, not `a (int): The first number to multiply`. Type hints should go in the function header instead. - The function can have a return type and a `Returns:` block in the docstring. However, these are optional @@ -372,7 +419,7 @@ Next, let's define a list of tools: def get_current_temperature(location: str, unit: str) -> float: """ Get the current temperature at a location. - + Args: location: The location to get the temperature for, in the format "City, Country" unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"]) @@ -384,7 +431,7 @@ def get_current_temperature(location: str, unit: str) -> float: def get_current_wind_speed(location: str) -> float: """ Get the current wind speed in km/h at a given location. - + Args: location: The location to get the temperature for, in the format "City, Country" Returns: @@ -429,8 +476,8 @@ the temperature in France should certainly be displayed in Celsius. The output format above is specific to the `Hermes-2-Pro` model we're using in this example. Other models may emit different tool call formats, and you may need to do some manual parsing at this step. For example, `Llama-3.1` models will emit -slightly different JSON, with `parameters` instead of `arguments`. Regardless of the format the model outputs, you -should add the tool call to the conversation in the format below, with `tool_calls`, `function` and `arguments` keys. +slightly different JSON, with `parameters` instead of `arguments`. Regardless of the format the model outputs, you +should add the tool call to the conversation in the format below, with `tool_calls`, `function` and `arguments` keys. @@ -449,7 +496,7 @@ a dict, but in the OpenAI API it's a JSON string. Passing a string may cause err Now that we've added the tool call to the conversation, we can call the function and append the result to the -conversation. Since we're just using a dummy function for this example that always returns 22.0, we can just append +conversation. Since we're just using a dummy function for this example that always returns 22.0, we can just append that result directly. ```python @@ -460,7 +507,7 @@ messages.append({"role": "tool", "name": "get_current_temperature", "content": " Some model architectures, notably Mistral/Mixtral, also require a `tool_call_id` here, which should be 9 randomly-generated alphanumeric characters, and assigned to the `id` key of the tool call -dictionary. The same key should also be assigned to the `tool_call_id` key of the tool response dictionary below, so +dictionary. The same key should also be assigned to the `tool_call_id` key of the tool response dictionary below, so that tool calls can be matched to tool responses. 
So, for Mistral/Mixtral models, the code above would be: ```python @@ -492,13 +539,13 @@ And we get: The current temperature in Paris, France is 22.0 ° Celsius.<|im_end|> ``` -Although this was a simple demo with dummy tools and a single call, the same technique works with +Although this was a simple demo with dummy tools and a single call, the same technique works with multiple real tools and longer conversations. This can be a powerful way to extend the capabilities of conversational agents with real-time information, computational tools like calculators, or access to large databases. ### Understanding tool schemas -Each function you pass to the `tools` argument of `apply_chat_template` is converted into a +Each function you pass to the `tools` argument of `apply_chat_template` is converted into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step). These schemas are then passed to the model chat template. In other words, tool-use models do not see your functions directly, and they never see the actual code inside them. What they care about is the function **definitions** and the **arguments** they @@ -507,7 +554,7 @@ to read their outputs, detect if they have requested to use a tool, pass their a return the response in the chat. Generating JSON schemas to pass to the template should be automatic and invisible as long as your functions -follow the specification above, but if you encounter problems, or you simply want more control over the conversion, +follow the specification above, but if you encounter problems, or you simply want more control over the conversion, you can handle the conversion manually. Here is an example of a manual schema conversion. ```python @@ -516,7 +563,7 @@ from transformers.utils import get_json_schema def multiply(a: float, b: float): """ A function that multiplies two numbers - + Args: a: The first number to multiply b: The second number to multiply @@ -531,33 +578,33 @@ This will yield: ```json { - "type": "function", + "type": "function", "function": { - "name": "multiply", - "description": "A function that multiplies two numbers", + "name": "multiply", + "description": "A function that multiplies two numbers", "parameters": { - "type": "object", + "type": "object", "properties": { "a": { - "type": "number", + "type": "number", "description": "The first number to multiply" - }, + }, "b": { "type": "number", "description": "The second number to multiply" } - }, + }, "required": ["a", "b"] } } } ``` -If you wish, you can edit these schemas, or even write them from scratch yourself without using `get_json_schema` at -all. JSON schemas can be passed directly to the `tools` argument of +If you wish, you can edit these schemas, or even write them from scratch yourself without using `get_json_schema` at +all. JSON schemas can be passed directly to the `tools` argument of `apply_chat_template` - this gives you a lot of power to define precise schemas for more complex functions. Be careful, -though - the more complex your schemas, the more likely the model is to get confused when dealing with them! We -recommend simple function signatures where possible, keeping arguments (and especially complex, nested arguments) +though - the more complex your schemas, the more likely the model is to get confused when dealing with them! We +recommend simple function signatures where possible, keeping arguments (and especially complex, nested arguments) to a minimum. 
Here is an example of defining schemas by hand, and passing them directly to `apply_chat_template`: @@ -565,7 +612,7 @@ Here is an example of defining schemas by hand, and passing them directly to `ap ```python # A simple function that takes no arguments current_time = { - "type": "function", + "type": "function", "function": { "name": "current_time", "description": "Get the current local time as a string.", @@ -581,18 +628,18 @@ multiply = { 'type': 'function', 'function': { 'name': 'multiply', - 'description': 'A function that multiplies two numbers', + 'description': 'A function that multiplies two numbers', 'parameters': { - 'type': 'object', + 'type': 'object', 'properties': { 'a': { 'type': 'number', 'description': 'The first number to multiply' - }, + }, 'b': { 'type': 'number', 'description': 'The second number to multiply' } - }, + }, 'required': ['a', 'b'] } } @@ -607,7 +654,7 @@ model_input = tokenizer.apply_chat_template( ## Advanced: Retrieval-augmented generation "Retrieval-augmented generation" or "RAG" LLMs can search a corpus of documents for information before responding -to a query. This allows models to vastly expand their knowledge base beyond their limited context size. Our +to a query. This allows models to vastly expand their knowledge base beyond their limited context size. Our recommendation for RAG models is that their template should accept a `documents` argument. This should be a list of documents, where each "document" is a single dict with `title` and `contents` keys, both of which are strings. Because this format is much simpler @@ -632,7 +679,7 @@ conversation = [ # Define documents for retrieval-based generation documents = [ { - "title": "The Moon: Our Age-Old Foe", + "title": "The Moon: Our Age-Old Foe", "text": "Man has always dreamed of destroying the moon. In this essay, I shall..." }, { @@ -650,7 +697,7 @@ input_ids = tokenizer.apply_chat_template( add_generation_prompt=True, return_tensors="pt").to(device) -# Generate a response +# Generate a response gen_tokens = model.generate( input_ids, max_new_tokens=100, @@ -683,7 +730,7 @@ one is a little simplified from the actual one! ``` {%- for message in messages %} - {{- '<|' + message['role'] + |>\n' }} + {{- '<|' + message['role'] + '|>\n' }} {{- message['content'] + eos_token }} {%- endfor %} {%- if add_generation_prompt %} @@ -710,8 +757,8 @@ Effectively, the template does three things: an assistant response. This is a pretty simple template but Jinja gives you a lot of flexibility to do more complex things! Let's see a Jinja -template that can format inputs similarly to the way LLaMA formats them (note that the real LLaMA template includes -handling for default system messages and slightly different system message handling in general - don't use this one +template that can format inputs similarly to the way LLaMA formats them (note that the real LLaMA template includes +handling for default system messages and slightly different system message handling in general - don't use this one in your actual code!) ``` @@ -734,7 +781,7 @@ distinguishable to the model because of the tokens they're wrapped in. ### How do I create a chat template? -Simple, just write a jinja template and set `tokenizer.chat_template`. You may find it easier to start with an +Simple, just write a jinja template and set `tokenizer.chat_template`. You may find it easier to start with an existing template from another model and simply edit it for your needs! 
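As a minimal sketch, the template below is a deliberately simple ChatML-style example (not the template of any particular model), just to show the mechanics of assigning `tokenizer.chat_template`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in for whichever tokenizer you are adapting

# A deliberately minimal ChatML-style template, for illustration only
tokenizer.chat_template = (
    "{%- for message in messages %}"
    "{{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }}"
    "{%- endfor %}"
    "{%- if add_generation_prompt %}"
    "{{- '<|im_start|>assistant\n' }}"
    "{%- endif %}"
)

print(tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hi!"}], tokenize=False, add_generation_prompt=True
))
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant
```

Once the rendered output looks right, the template travels with the tokenizer whenever you save or upload it.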
For example, we could take the LLaMA template above and add "[ASST]" and "[/ASST]" to assistant messages: @@ -762,13 +809,13 @@ tokenizer.chat_template = template # Set the new template tokenizer.push_to_hub("model_name") # Upload your new template to the Hub! ``` -The method [`~PreTrainedTokenizer.apply_chat_template`] which uses your chat template is called by the [`TextGenerationPipeline`] class, so +The method [`~PreTrainedTokenizer.apply_chat_template`] which uses your chat template is called by the [`TextGenerationPipeline`] class, so once you set the correct chat template, your model will automatically become compatible with [`TextGenerationPipeline`]. If you're fine-tuning a model for chat, in addition to setting a chat template, you should probably add any new chat -control tokens as special tokens in the tokenizer. Special tokens are never split, -ensuring that your control tokens are always handled as single tokens rather than being tokenized in pieces. You +control tokens as special tokens in the tokenizer. Special tokens are never split, +ensuring that your control tokens are always handled as single tokens rather than being tokenized in pieces. You should also set the tokenizer's `eos_token` attribute to the token that marks the end of assistant generations in your template. This will ensure that text generation tools can correctly figure out when to stop generating text. @@ -796,13 +843,13 @@ trying to put it all in a single template where possible! When setting the template for a model that's already been trained for chat, you should ensure that the template exactly matches the message formatting that the model saw during training, or else you will probably experience -performance degradation. This is true even if you're training the model further - you will probably get the best +performance degradation. This is true even if you're training the model further - you will probably get the best performance if you keep the chat tokens constant. This is very analogous to tokenization - you generally get the best performance for inference or fine-tuning when you precisely match the tokenization used during training. If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand, you have a lot of freedom to choose an appropriate template! LLMs are smart enough to learn to handle lots of different -input formats. One popular choice is the `ChatML` format, and this is a good, flexible choice for many use-cases. +input formats. One popular choice is the `ChatML` format, and this is a good, flexible choice for many use-cases. It looks like this: ``` @@ -848,7 +895,7 @@ Once the attribute is set, that's it, you're done! `tokenizer.apply_chat_templat model, which means it is also automatically supported in places like `TextGenerationPipeline`! By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of -open-source models. Formatting mismatches have been haunting the field and silently harming performance for too long - +open-source models. Formatting mismatches have been haunting the field and silently harming performance for too long - it's time to put an end to them! ## Advanced: Template writing tips @@ -856,17 +903,17 @@ it's time to put an end to them! The easiest way to get started with writing Jinja templates is to take a look at some existing ones. You can use -`print(tokenizer.chat_template)` for any chat model to see what template it's using. 
In general, models that support tool use have +`print(tokenizer.chat_template)` for any chat model to see what template it's using. In general, models that support tool use have much more complex templates than other models - so when you're just getting started, they're probably a bad example -to learn from! You can also take a look at the +to learn from! You can also take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for details of general Jinja formatting and syntax. -Jinja templates in `transformers` are identical to Jinja templates elsewhere. The main thing to know is that -the conversation history will be accessible inside your template as a variable called `messages`. -You will be able to access `messages` in your template just like you can in Python, which means you can loop over +Jinja templates in `transformers` are identical to Jinja templates elsewhere. The main thing to know is that +the conversation history will be accessible inside your template as a variable called `messages`. +You will be able to access `messages` in your template just like you can in Python, which means you can loop over it with `{% for message in messages %}` or access individual messages with `{{ messages[0] }}`, for example. You can also use the following tips to write clean, efficient Jinja templates: @@ -896,7 +943,7 @@ and indentation may end up being included in the output, which is probably not w ### Special variables -Inside your template, you will have access several special variables. The most important of these is `messages`, +Inside your template, you will have access several special variables. The most important of these is `messages`, which contains the chat history as a list of message dicts. However, there are several others. Not every variable will be used in every template. The most common other variables are: @@ -930,7 +977,7 @@ There are multiple implementations of Jinja in various languages. They generally but a key difference is that when you're writing a template in Python you can use Python methods, such as `.lower()` on strings or `.items()` on dicts. This will break if someone tries to use your template on a non-Python implementation of Jinja. Non-Python implementations are particularly common in deployment environments, where JS -and Rust are very popular. +and Rust are very popular. Don't panic, though! There are a few easy changes you can make to your templates to ensure they're compatible across all implementations of Jinja: @@ -962,21 +1009,21 @@ Here is an example of a template that formats messages ChatML-style, with genera ``` The exact content of the assistant header will depend on your specific model, but it should always be **the string -that represents the start of an assistant message**, so that if the user applies your template with +that represents the start of an assistant message**, so that if the user applies your template with `add_generation_prompt=True` and then generates text, the model will write an assistant response. Also note that some -models do not need a generation prompt, because assistant messages always begin immediately after user messages. +models do not need a generation prompt, because assistant messages always begin immediately after user messages. This is particularly common for LLaMA and Mistral models, where assistant messages begin immediately after the `[/INST]` token that ends user messages. In these cases, the template can ignore the `add_generation_prompt` flag. 
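A quick way to check how your own template behaves is to render the same chat with and without a generation prompt and compare the two strings. A minimal sketch (the checkpoint below is just an example; any chat model's tokenizer can be substituted):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example checkpoint

chat = [{"role": "user", "content": "Hi there!"}]

without_prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
with_prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# For most templates, the only difference is the assistant header appended at the end,
# i.e. the string the model is expected to continue from when writing its reply.
print(with_prompt[len(without_prompt):])
```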
Generation prompts are important! If your model requires a generation prompt but it is not set in the template, then -model generations will likely be severely degraded, or the model may display unusual behaviour like continuing -the final user message! +model generations will likely be severely degraded, or the model may display unusual behaviour like continuing +the final user message! ### Writing and debugging larger templates -When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script. +When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script. However, with new models and features like tool-use and RAG, some templates can be 100 lines long or more. When -writing templates like these, it's a good idea to write them in a separate file, using a text editor. You can easily +writing templates like these, it's a good idea to write them in a separate file, using a text editor. You can easily extract a chat template to a file: ```python @@ -995,7 +1042,7 @@ identify the source of issues. ### Writing templates for tools -Although chat templates do not enforce a specific API for tools (or for anything, really), we recommend +Although chat templates do not enforce a specific API for tools (or for anything, really), we recommend template authors try to stick to a standard API where possible. The whole point of chat templates is to allow code to be transferable across models, so deviating from the standard tools API means users will have to write custom code to use tools with your model. Sometimes it's unavoidable, but often with clever templating you can @@ -1005,30 +1052,30 @@ Below, we'll list the elements of the standard API, and give tips on writing tem #### Tool definitions -Your template should expect that the variable `tools` will either be null (if no tools are passed), or is a list +Your template should expect that the variable `tools` will either be null (if no tools are passed), or is a list of JSON schema dicts. Our chat template methods allow users to pass tools as either JSON schema or Python functions, but when -functions are passed, we automatically generate JSON schema and pass that to your template. As a result, the +functions are passed, we automatically generate JSON schema and pass that to your template. As a result, the `tools` variable that your template receives will always be a list of JSON schema. Here is a sample tool JSON schema: ```json { - "type": "function", + "type": "function", "function": { - "name": "multiply", - "description": "A function that multiplies two numbers", + "name": "multiply", + "description": "A function that multiplies two numbers", "parameters": { - "type": "object", + "type": "object", "properties": { "a": { - "type": "number", + "type": "number", "description": "The first number to multiply" - }, + }, "b": { "type": "number", "description": "The second number to multiply" } - }, + }, "required": ["a", "b"] } } @@ -1052,13 +1099,13 @@ specific format - your model will probably need different formatting! The specific tokens and tool descriptions your template renders should of course be chosen to match the ones your model was trained with. There is no requirement that your **model** understands JSON schema input, only that your template can translate -JSON schema into your model's format. 
For example, [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) -was trained with tools defined using Python function headers, but the Command-R tool template accepts JSON schema, +JSON schema into your model's format. For example, [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) +was trained with tools defined using Python function headers, but the Command-R tool template accepts JSON schema, converts types internally and renders the input tools as Python headers. You can do a lot with templates! #### Tool calls -Tool calls, if present, will be a list attached to a message with the "assistant" role. Note that `tool_calls` is +Tool calls, if present, will be a list attached to a message with the "assistant" role. Note that `tool_calls` is always a list, even though most tool-calling models only support single tool calls at a time, which means the list will usually only have a single element. Here is a sample message dict containing a tool call: @@ -1116,4 +1163,4 @@ name to be included in the tool response, then rendering it can be as simple as: ``` Again, remember that the actual formatting and special tokens are model-specific - you should take a lot of care -to ensure that tokens, whitespace and everything else exactly match the format your model was trained with! \ No newline at end of file +to ensure that tokens, whitespace and everything else exactly match the format your model was trained with! diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index 7f7995c46641..ad3d4240f856 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -586,6 +586,20 @@ You can choose the communication data type by setting the `communication_data_ty } ``` +### Universal Checkpointing + +[Universal Checkpointing](https://www.deepspeed.ai/tutorials/universal-checkpointing) is an efficient and flexible feature for saving and loading model checkpoints. It enables seamless model training continuation and fine-tuning across different model architectures, parallelism techniques, and training configurations. + +Resume training with a universal checkpoint by setting [load_universal](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to `true` in the config file. + +```yaml +{ + "checkpoint": { + "load_universal": true + } +} +``` + ## Deployment DeepSpeed can be deployed by different launchers such as [torchrun](https://pytorch.org/docs/stable/elastic/run.html), the `deepspeed` launcher, or [Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch). To deploy, add `--deepspeed ds_config.json` to the [`Trainer`] command line. It’s recommended to use DeepSpeed’s [`add_config_arguments`](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any necessary command line arguments to your code. diff --git a/docs/source/en/fsdp.md b/docs/source/en/fsdp.md index 6b90ab5ad6d6..2c4f114dec85 100644 --- a/docs/source/en/fsdp.md +++ b/docs/source/en/fsdp.md @@ -58,7 +58,7 @@ Otherwise, you can choose a size-based wrapping policy where FSDP is applied to ### Checkpointing -Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`]` method. 
+Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method. ```py # directory containing checkpoints diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 380b39fe62ac..99049cceef34 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -41,6 +41,13 @@ This guide describes: * common decoding strategies and their main parameters * saving and sharing custom generation configurations with your fine-tuned model on 🤗 Hub + + +`generate()` is a critical component of our [chat CLI](quicktour#chat-with-text-generation-models). +You can apply the learnings of this guide there as well. + + + ## Default text generation configuration A decoding strategy for a model is defined in its generation configuration. When using pre-trained models for inference @@ -96,6 +103,12 @@ distribution over the entire vocabulary with various strategy-specific adjustmen the decoding strategies that support multiple sequence candidates, e.g. variations of beam search and sampling. Decoding strategies like greedy search and contrastive search return a single output sequence. +It is also possible to extend `generate()` with external libraries or handcrafted code. The `logits_processor` argument +allows you to pass custom [`LogitsProcessor`] instances, allowing you to manipulate the next token probability +distributions. Likewise, the `stopping_criteria` argument lets you set custom [`StoppingCriteria`] to stop text generation. +The [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo) library contains examples of external +`generate()`-compatible extensions. + ## Save a custom decoding strategy with your model If you would like to share your fine-tuned model with a specific generation configuration, you can: @@ -218,7 +231,7 @@ to check if the text is machine-generated (outputs `True` for machine-generated >>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config) >>> detection_out = detector(out, return_dict=True) >>> detection_out.prediction -array([True, True]) +array([ True, True]) ``` @@ -256,7 +269,7 @@ dimension you can act upon, in addition to selecting a decoding strategy. Popula >>> model = AutoModelForCausalLM.from_pretrained(checkpoint) >>> outputs = model.generate(**inputs) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n'] +['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'] ``` ### Contrastive search @@ -432,9 +445,31 @@ To enable assisted decoding, set the `assistant_model` argument with a model. >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) >>> outputs = model.generate(**inputs, assistant_model=assistant_model) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] +['Alice and Bob are sitting in a bar. 
Alice is drinking a beer and Bob is drinking a glass of wine.'] ``` + + +If you're using a `pipeline` object, all you need to do is to pass the assistant checkpoint under `assistant_model` + +```python +>>> from transformers import pipeline +>>> import torch + +>>> pipe = pipeline( +... "text-generation", +... model="meta-llama/Llama-3.1-8B", +... assistant_model="meta-llama/Llama-3.2-1B", # This extra line is all that's needed, also works with UAD +... torch_dtype=torch.bfloat16 +... ) +>>> pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False) +>>> pipe_output[0]["generated_text"] +'Once upon a time, 3D printing was a niche technology that was only' +``` + + + + When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. @@ -453,9 +488,11 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) >>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob, a couple of friends of mine, who are both in the same office as'] +['Alice and Bob are two people who are very different, but they are both very good at what they do. Alice'] ``` +We recommend installing the `scikit-learn` library to enhance the candidate generation strategy and achieve an additional speedup. + #### Universal Assisted Decoding Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers. @@ -481,7 +518,7 @@ to ensure the new tokens include the correct prompt suffix. >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) >>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] +['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a'] ``` #### Prompt Lookup @@ -510,7 +547,7 @@ If the model you're using was trained to do early exit, you can pass >>> model = AutoModelForCausalLM.from_pretrained(checkpoint) >>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] +['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a'] ``` ### DoLa Decoding @@ -534,10 +571,9 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model. >>> import torch >>> from accelerate.test_utils.testing import get_backend ->>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") ->>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16) >>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) ->>> model.to(device) +>>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") +>>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16).to(device) >>> set_seed(42) >>> text = "On what date was the Declaration of Independence officially signed?"
@@ -556,7 +592,7 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model. # DoLa decoding with contrasting specific layers (layers 28 and 30) >>> dola_custom_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers=[28,30], repetition_penalty=1.2) >>> tokenizer.batch_decode(dola_custom_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True) -['\nIt was officially signed on 2 August 1776, when 56 members of the Second Continental Congress, representing the original 13 American colonies, voted unanimously for the resolution for independence. The 2'] +['\nIn 1891, when he was 54 years old, John Jacob Astor founded his empire. He opened a one-man business and spent the next 27 years working 10-hour days. When'] ``` #### Understanding the `dola_layers` argument diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index b1ed1f0d492a..b1afd55c8952 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -88,6 +88,7 @@ For now the supported model architectures are the architectures that have been v - T5 - Mamba - Nemotron +- Gemma2 ## Example usage diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 341cb417c7b8..2233630128ae 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -62,8 +62,11 @@ Flax), PyTorch, and/or TensorFlow. | [ALBERT](model_doc/albert) | ✅ | ✅ | ✅ | | [ALIGN](model_doc/align) | ✅ | ❌ | ❌ | | [AltCLIP](model_doc/altclip) | ✅ | ❌ | ❌ | +| [Aria](model_doc/aria) | ✅ | ❌ | ❌ | +| [AriaText](model_doc/aria_text) | ✅ | ❌ | ❌ | | [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) | ✅ | ❌ | ❌ | | [Autoformer](model_doc/autoformer) | ✅ | ❌ | ❌ | +| [Bamba](model_doc/bamba) | ✅ | ❌ | ❌ | | [Bark](model_doc/bark) | ✅ | ❌ | ❌ | | [BART](model_doc/bart) | ✅ | ✅ | ✅ | | [BARThez](model_doc/barthez) | ✅ | ✅ | ✅ | @@ -97,6 +100,8 @@ Flax), PyTorch, and/or TensorFlow. | [CodeGen](model_doc/codegen) | ✅ | ❌ | ❌ | | [CodeLlama](model_doc/code_llama) | ✅ | ❌ | ✅ | | [Cohere](model_doc/cohere) | ✅ | ❌ | ❌ | +| [Cohere2](model_doc/cohere2) | ✅ | ❌ | ❌ | +| [ColPali](model_doc/colpali) | ✅ | ❌ | ❌ | | [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ | | [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ | | [ConvNeXT](model_doc/convnext) | ✅ | ✅ | ❌ | @@ -120,8 +125,10 @@ Flax), PyTorch, and/or TensorFlow. | [DETA](model_doc/deta) | ✅ | ❌ | ❌ | | [DETR](model_doc/detr) | ✅ | ❌ | ❌ | | [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ | +| [DiffLlama](model_doc/diffllama) | ✅ | ❌ | ❌ | | [DiNAT](model_doc/dinat) | ✅ | ❌ | ❌ | | [DINOv2](model_doc/dinov2) | ✅ | ❌ | ✅ | +| [DINOv2 with Registers](model_doc/dinov2_with_registers) | ✅ | ❌ | ❌ | | [DistilBERT](model_doc/distilbert) | ✅ | ✅ | ✅ | | [DiT](model_doc/dit) | ✅ | ❌ | ✅ | | [DonutSwin](model_doc/donut) | ✅ | ❌ | ❌ | @@ -130,6 +137,7 @@ Flax), PyTorch, and/or TensorFlow. | [EfficientFormer](model_doc/efficientformer) | ✅ | ✅ | ❌ | | [EfficientNet](model_doc/efficientnet) | ✅ | ❌ | ❌ | | [ELECTRA](model_doc/electra) | ✅ | ✅ | ✅ | +| [Emu3](model_doc/emu3) | ✅ | ❌ | ❌ | | [EnCodec](model_doc/encodec) | ✅ | ❌ | ❌ | | [Encoder decoder](model_doc/encoder-decoder) | ✅ | ✅ | ✅ | | [ERNIE](model_doc/ernie) | ✅ | ❌ | ❌ | @@ -137,6 +145,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [ESM](model_doc/esm) | ✅ | ✅ | ❌ | | [FairSeq Machine-Translation](model_doc/fsmt) | ✅ | ❌ | ❌ | | [Falcon](model_doc/falcon) | ✅ | ❌ | ❌ | +| [Falcon3](model_doc/falcon3) | ✅ | ❌ | ✅ | | [FalconMamba](model_doc/falcon_mamba) | ✅ | ❌ | ❌ | | [FastSpeech2Conformer](model_doc/fastspeech2_conformer) | ✅ | ❌ | ❌ | | [FLAN-T5](model_doc/flan-t5) | ✅ | ✅ | ✅ | @@ -164,13 +173,16 @@ Flax), PyTorch, and/or TensorFlow. | [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | +| [Helium](model_doc/helium) | ✅ | ❌ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | | [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | +| [I-JEPA](model_doc/ijepa) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ✅ | ❌ | | [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ | | [Idefics3](model_doc/idefics3) | ✅ | ❌ | ❌ | +| [Idefics3VisionTransformer](model_doc/idefics3_vision) | ❌ | ❌ | ❌ | | [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | | [Informer](model_doc/informer) | ✅ | ❌ | ❌ | | [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | @@ -224,6 +236,8 @@ Flax), PyTorch, and/or TensorFlow. | [MobileNetV2](model_doc/mobilenet_v2) | ✅ | ❌ | ❌ | | [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ | | [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ | +| [ModernBERT](model_doc/modernbert) | ✅ | ❌ | ❌ | +| [Moonshine](model_doc/moonshine) | ✅ | ❌ | ❌ | | [Moshi](model_doc/moshi) | ✅ | ❌ | ❌ | | [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ | | [MPT](model_doc/mpt) | ✅ | ❌ | ❌ | @@ -240,7 +254,7 @@ Flax), PyTorch, and/or TensorFlow. | [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ | | [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ | | [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ | -| [OLMo November 2024](model_doc/olmo_1124) | ✅ | ❌ | ❌ | +| [OLMo2](model_doc/olmo2) | ✅ | ❌ | ❌ | | [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ | | [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ | | [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ | @@ -271,6 +285,7 @@ Flax), PyTorch, and/or TensorFlow. | [PVTv2](model_doc/pvt_v2) | ✅ | ❌ | ❌ | | [QDQBert](model_doc/qdqbert) | ✅ | ❌ | ❌ | | [Qwen2](model_doc/qwen2) | ✅ | ❌ | ❌ | +| [Qwen2_5_VL](model_doc/qwen2_5_vl) | ✅ | ❌ | ❌ | | [Qwen2Audio](model_doc/qwen2_audio) | ✅ | ❌ | ❌ | | [Qwen2MoE](model_doc/qwen2_moe) | ✅ | ❌ | ❌ | | [Qwen2VL](model_doc/qwen2_vl) | ✅ | ❌ | ❌ | @@ -304,6 +319,7 @@ Flax), PyTorch, and/or TensorFlow. | [SqueezeBERT](model_doc/squeezebert) | ✅ | ❌ | ❌ | | [StableLm](model_doc/stablelm) | ✅ | ❌ | ❌ | | [Starcoder2](model_doc/starcoder2) | ✅ | ❌ | ❌ | +| [SuperGlue](model_doc/superglue) | ✅ | ❌ | ❌ | | [SuperPoint](model_doc/superpoint) | ✅ | ❌ | ❌ | | [SwiftFormer](model_doc/swiftformer) | ✅ | ✅ | ❌ | | [Swin Transformer](model_doc/swin) | ✅ | ✅ | ❌ | @@ -315,8 +331,10 @@ Flax), PyTorch, and/or TensorFlow. | [Table Transformer](model_doc/table-transformer) | ✅ | ❌ | ❌ | | [TAPAS](model_doc/tapas) | ✅ | ✅ | ❌ | | [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ | +| [TextNet](model_doc/textnet) | ✅ | ❌ | ❌ | | [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ | | [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ | +| [TimmWrapperModel](model_doc/timm_wrapper) | ✅ | ❌ | ❌ | | [Trajectory Transformer](model_doc/trajectory_transformer) | ✅ | ❌ | ❌ | | [Transformer-XL](model_doc/transfo-xl) | ✅ | ✅ | ❌ | | [TrOCR](model_doc/trocr) | ✅ | ❌ | ❌ | @@ -343,6 +361,8 @@ Flax), PyTorch, and/or TensorFlow. 
| [ViTMAE](model_doc/vit_mae) | ✅ | ✅ | ❌ | | [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | | [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | +| [ViTPose](model_doc/vitpose) | ✅ | ❌ | ❌ | +| [ViTPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | @@ -365,6 +385,7 @@ Flax), PyTorch, and/or TensorFlow. | [YOLOS](model_doc/yolos) | ✅ | ❌ | ❌ | | [YOSO](model_doc/yoso) | ✅ | ❌ | ❌ | | [Zamba](model_doc/zamba) | ✅ | ❌ | ❌ | +| [Zamba2](model_doc/zamba2) | ✅ | ❌ | ❌ | | [ZoeDepth](model_doc/zoedepth) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index f4ce768c3168..ae1f2101d749 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -32,27 +32,18 @@ Install 🤗 Transformers for whichever deep learning library you're working wit You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies. -Start by creating a virtual environment in your project directory: +Now you're ready to install 🤗 Transformers with the following command: ```bash -python -m venv .env +pip install transformers ``` -Activate the virtual environment. On Linux and MacOs: +For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally) and [TensorFlow](https://www.tensorflow.org/install/pip). -```bash -source .env/bin/activate -``` -Activate Virtual environment on Windows +Run the command below to check if your system detects an NVIDIA GPU. ```bash -.env/Scripts/activate -``` - -Now you're ready to install 🤗 Transformers with the following command: - -```bash -pip install transformers +nvidia-smi ``` For CPU-support only, you can conveniently install 🤗 Transformers and a deep learning library in one line. For example, install 🤗 Transformers and PyTorch with: @@ -157,7 +148,7 @@ conda install conda-forge::transformers Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/hub`. This is the default directory given by the shell environment variable `TRANSFORMERS_CACHE`. On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\hub`. You can change the shell environment variables shown below - in order of priority - to specify a different cache directory: -1. Shell environment variable (default): `HUGGINGFACE_HUB_CACHE` or `TRANSFORMERS_CACHE`. +1. Shell environment variable (default): `HF_HUB_CACHE` or `TRANSFORMERS_CACHE`. 2. Shell environment variable: `HF_HOME`. 3. Shell environment variable: `XDG_CACHE_HOME` + `/huggingface`. @@ -254,3 +245,36 @@ Once your file is downloaded and locally cached, specify it's local path to load See the [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) section for more details on downloading files stored on the Hub. + +## Troubleshooting + +See below for some of the more common installation issues and how to resolve them. + +### Unsupported Python version + +Ensure you are using Python 3.9 or later. Run the command below to check your Python version.
+ +``` +python --version +``` + +### Missing dependencies + +Install all required dependencies by running the following command. Ensure you’re in the project directory before executing the command. + +``` +pip install -r requirements.txt +``` + +### Windows-specific + +If you encounter issues on Windows, you may need to activate Developer Mode. Navigate to Windows Settings > For Developers > Developer Mode. + +Alternatively, create and activate a virtual environment as shown below. + +``` +python -m venv env +.\env\Scripts\activate +``` + + diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index eb25ddb63297..d8931342ee45 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -352,6 +352,8 @@ A [`Constraint`] can be used to force the generation to include specific tokens [[autodoc]] TextIteratorStreamer +[[autodoc]] AsyncTextIteratorStreamer + ## Caches [[autodoc]] Cache @@ -436,3 +438,9 @@ A [`Constraint`] can be used to force the generation to include specific tokens [[autodoc]] SynthIDTextWatermarkDetector - __call__ + +## Compile Utils + +[[autodoc]] CompileConfig + - __call__ + diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index 05ab9eafa723..ed6fb9035e0c 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -56,7 +56,7 @@ More concretely, key-value cache acts as a memory bank for these generative mode >>> import torch >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache - >>> model_id = "meta-llama/Llama-2-7b-chat-hf" + >>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0") >>> tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -82,7 +82,13 @@ More concretely, key-value cache acts as a memory bank for these generative mode ... cache_position = cache_position[-1:] + 1 # add one more position for the next token >>> print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]) - "[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA," + ``` + ```txt + <|user|> + Hello, what's your name. + <|assistant|> + My name is Sarah. + <| ``` @@ -132,17 +138,13 @@ Cache quantization can be detrimental in terms of latency if the context length >>> import torch >>> from transformers import AutoTokenizer, AutoModelForCausalLM ->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") ->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0") +>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") +>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16).to("cuda:0") >>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device) >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"}) >>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) -I like rock music because it's loud and energetic. It's a great way to express myself and rel - ->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20) ->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) -I like rock music because it's loud and energetic. 
I like to listen to it when I'm feeling +I like rock music because it's a great way to express myself. I like the way it makes me feel, the ``` ### Offloaded Cache @@ -180,7 +182,7 @@ Fun fact: The shortest war in history was between Britain and Zanzibar on August -Cache offloading requires a GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors. +Cache offloading requires a CUDA GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors. @@ -231,14 +233,14 @@ For more examples with Static Cache and JIT compilation, take a look at [StaticC >>> import torch >>> from transformers import AutoTokenizer, AutoModelForCausalLM ->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") ->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") +>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto") >>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) >>> # simply pass the cache implementation="static" >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static") >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] -"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of" +"Hello, my name is [Your Name] and I am a [Your Position] at [Your Company]. I am writing" ``` @@ -256,11 +258,12 @@ This will use the [`~OffloadedStaticCache`] implementation instead. >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") >>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) ->>> # simply pass the cache implementation="static" +>>> # simply pass the cache implementation="offloaded_static" >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static") >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] "Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of" ``` +Cache offloading requires a CUDA GPU. ### Sliding Window Cache @@ -274,14 +277,14 @@ Note that you can use this cache only for models that support sliding window, e. >>> import torch >>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache ->>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") ->>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0") +>>> tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B") +>>> model = AutoModelForCausalLM.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", torch_dtype=torch.float16).to("cuda:0") >>> inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device) >>> # can be used by passing in cache implementation >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window") >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] -"Yesterday I was on a rock concert and. I was so excited to see my favorite band. I was so excited that I was jumping up and down and screaming. I was so excited that I" +"Yesterday I was on a rock concert and. 
I was so excited to see my favorite band perform live. I was so happy that I could hardly contain myself. I was jumping up and down and" ``` ### Sink Cache @@ -294,8 +297,8 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac >>> import torch >>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache ->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") ->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0") +>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") +>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16).to("cuda:0") >>> inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device) >>> # get our cache, specify number of sink tokens and window size @@ -303,7 +306,7 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac >>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4) >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values) >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] -"This is a long story about unicorns, fairies and magic. It is a fantasy world where unicorns and fairies live together in harmony. The story follows a young girl named Lily" +"This is a long story about unicorns, fairies and magic. It is a story about a young girl named Lily who discovers that she has the power to control the elements. She learns that she can" ``` ### Encoder-Decoder Cache @@ -331,15 +334,15 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l >>> import torch >>> from transformers import AutoTokenizer,AutoModelForCausalLM >>> from transformers.cache_utils import ( ->>> DynamicCache, ->>> SinkCache, ->>> StaticCache, ->>> SlidingWindowCache, ->>> QuantoQuantizedCache, ->>> QuantizedCacheConfig, ->>> ) - ->>> model_id = "meta-llama/Llama-2-7b-chat-hf" +... DynamicCache, +... SinkCache, +... StaticCache, +... SlidingWindowCache, +... QuantoQuantizedCache, +... QuantizedCacheConfig, +... ) + +>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto') >>> tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -362,7 +365,7 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l ... messages.append({"role": "assistant", "content": completion}) print(messages) -[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': " Hello! My name is LLaMA, I'm a large language model trained by a team of researcher at Meta AI. 😊"}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': ' Oh, cool! That sounds like a lot of fun! 🎉 Did you enjoy the concert? What was the band like? 🤔'}] +[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': "Hello, I'm AI."}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': "I'm sorry to hear that you were on a rock concert yesterday. It sounds like a fun experience, but I'm not capable of experiencing music or concerts. However, I can provide you with some information about rock music and its history. 
Rock music emerged in the 1950s and 1960s in the United States and Britain, and it quickly gained popularity around the world. Some of the most famous rock bands of all time include The Beatles, The Rolling Stones, Led Zeppelin, and Pink Floyd. Rock music has a distinct sound and style, with elements of blues, country, and folk music. It often features guitar solos, heavy bass lines, and drums. Rock music has had a significant impact on popular culture, influencing genres such as punk rock, heavy metal, and alternative rock."}] ``` @@ -375,7 +378,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa >>> import torch >>> from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache ->>> model_id = "meta-llama/Llama-2-7b-chat-hf" +>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda") >>> tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -399,7 +402,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa ... responses.append(response) >>> print(responses) -[' You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTitle: The Ultimate Guide to Travelling: Tips, Tricks, and', ' You are a helpful assistant. What is the capital of France?\n\nYes, the capital of France is Paris.'] +[' You are a helpful assistant. Help me to write a blogpost about travelling. I am excited to share my experiences with you. I have been traveling for the past', ' You are a helpful assistant. What is the capital of France? \n\nAnswer: Paris is the capital of France.'] ``` @@ -413,8 +416,8 @@ this legacy format, you can seamlessly convert it to a `DynamicCache` and back. >>> import torch >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache ->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") ->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") +>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto") >>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) >>> # `return_dict_in_generate=True` is required to return the cache. 
`return_legacy_cache` forces the returned cache diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 0a6a7e15bea0..37406ea0bef2 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -57,13 +57,13 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.generation_config.cache_implementation = "static" model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) outputs = model.generate(**input_ids) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -89,11 +89,11 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) prompt_length = input_ids.input_ids.shape[1] model.generation_config.max_new_tokens = 16 @@ -126,6 +126,7 @@ If you want to go further down a level, the [`StaticCache`] object can also be p from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging from transformers.testing_utils import CaptureLogger import torch +from accelerate.test_utils.testing import get_backend prompts = [ "Simply put, the theory of relativity states that ", @@ -133,7 +134,7 @@ prompts = [ ] NUM_TOKENS_TO_GENERATE = 40 -torch_device = "cuda" +torch_device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="", padding_side="right") model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="sequential") @@ -155,9 +156,11 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method: 1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length. 2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache. -3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. +3. 
Use `SDPBackend.MATH` in the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. ```py +from torch.nn.attention import SDPBackend, sdpa_kernel + batch_size, seq_length = inputs["input_ids"].shape with torch.no_grad(): past_key_values = StaticCache( @@ -178,7 +181,7 @@ with torch.no_grad(): decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True) cache_position = torch.tensor([seq_length + 1], device=torch_device) for _ in range(1, NUM_TOKENS_TO_GENERATE): - with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True): + with sdpa_kernel(SDPBackend.MATH): next_token = decode_one_tokens(model, next_token.clone(), None, cache_position, past_key_values) generated_ids[:, cache_position] = next_token.int() cache_position += 1 @@ -201,11 +204,11 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) outputs = model.generate(**input_ids) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -241,13 +244,14 @@ Enable speculative decoding by loading an assistant model and passing it to the ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, assistant_model=assistant_model) tokenizer.batch_decode(outputs, skip_special_tokens=True) @@ -262,13 +266,14 @@ For speculative sampling decoding, add the `do_sample` and `temperature` paramet ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.7) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -290,13 +295,14 @@ To enable prompt lookup decoding, specify the number of tokens that should be ov ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, prompt_lookup_num_tokens=3) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -311,13 +317,14 @@ For prompt lookup decoding with sampling, add the `do_sample` and `temperature` ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, do_sample=True, temperature=0.7) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) ["The second law of thermodynamics states that energy cannot be created nor destroyed. It's not a"] @@ -448,10 +455,11 @@ Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and > [!TIP] > SDPA supports FlashAttention-2 as long as you have the latest PyTorch version installed. -Use the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to explicitly enable or disable any of the three attention algorithms. For example, set `enable_flash=True` to enable FlashAttention. +Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention. 
```py import torch +from torch.nn.attention import SDPBackend, sdpa_kernel from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( @@ -459,7 +467,7 @@ model = AutoModelForCausalLM.from_pretrained( torch_dtype=torch.bfloat16, ) -with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): +with sdpa_kernel(SDPBackend.FLASH_ATTENTION): outputs = model.generate(**inputs) ``` @@ -468,7 +476,7 @@ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable Quantization reduces the size of the LLM weights by storing them in a lower precision. This translates to lower memory usage and makes loading LLMs for inference more accessible if you're constrained by your GPUs memory. If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can incur a small latency cost (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights. > [!TIP] -> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. +> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating how much memory it costs to load [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1). diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index 097d7bf1e9ca..99ec73e3c624 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -23,6 +23,12 @@ LLMs, or Large Language Models, are the key component behind text generation. In Autoregressive generation is the inference-time procedure of iteratively calling a model with its own generated outputs, given a few initial inputs. In 🤗 Transformers, this is handled by the [`~generation.GenerationMixin.generate`] method, which is available to all models with generative capabilities. + + +If you want to jump straight to chatting with a model, [try our chat CLI](quicktour#chat-with-text-generation-models). + + + This tutorial will show you how to: * Generate text with an LLM @@ -265,8 +271,9 @@ While the autoregressive generation process is relatively straightforward, makin ### Related libraries -1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices. +1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices; 2. [`outlines`](https://github.com/outlines-dev/outlines), a library where you can constrain text generation (e.g. to generate JSON files); -3. 
[`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation. (e.g. JSON, SQL, Python) +3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation (e.g. JSON, SQL, Python); 4. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs; 5. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), a UI for text generation; +6. [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo), containing additional options to control text generation with 🤗 Transformers. See our related [blog post](https://huggingface.co/blog/logits-processor-zoo). diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md index 9d3d8ad6ba8b..3414725fc370 100644 --- a/docs/source/en/llm_tutorial_optimization.md +++ b/docs/source/en/llm_tutorial_optimization.md @@ -147,7 +147,7 @@ Let's call it now for the next experiment. ```python flush() ``` -In the recent version of the accelerate library, you can also use a utility method called `release_memory()` +From the Accelerate library, you can also use a device-agnostic utility method called [release_memory](https://github.com/huggingface/accelerate/blob/29be4788629b772a3b722076e433b5b3b5c85da3/src/accelerate/utils/memory.py#L63), which takes various hardware backends like XPU, MLU, NPU, MPS, and more into account. ```python from accelerate.utils import release_memory diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md index 320916f1ce94..cbf6ae95577f 100644 --- a/docs/source/en/main_classes/image_processor.md +++ b/docs/source/en/main_classes/image_processor.md @@ -27,6 +27,7 @@ from transformers import AutoImageProcessor processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) ``` +Note that `use_fast` will be set to `True` by default in a future release. When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. @@ -42,21 +43,17 @@ images_processed = processor(images, return_tensors="pt", device="cuda") Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time:
These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU. diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md index 3f4456969777..037660d0638c 100755 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -34,6 +34,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide. [[autodoc]] AqlmConfig +## VptqConfig + +[[autodoc]] VptqConfig + ## AwqConfig [[autodoc]] AwqConfig @@ -53,6 +57,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide. [[autodoc]] quantizers.base.HfQuantizer +## HiggsConfig + +[[autodoc]] HiggsConfig + ## HqqConfig [[autodoc]] HqqConfig diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md new file mode 100644 index 000000000000..9ff7a6687aa9 --- /dev/null +++ b/docs/source/en/model_doc/aria.md @@ -0,0 +1,106 @@ + + +# Aria + +## Overview + +The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team. + +Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token. + +The abstract from the paper is the following: + +*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.* + +This model was contributed by [m-ric](https://huggingface.co/m-ric). +The original code can be found [here](https://github.com/rhymes-ai/Aria). 
+ +## Usage tips + +Here's how to use the model for vision tasks: +```python +import requests +import torch +from PIL import Image + +from transformers import AriaProcessor, AriaForConditionalGeneration + +model_id_or_path = "rhymes-ai/Aria" + +model = AriaForConditionalGeneration.from_pretrained( + model_id_or_path, device_map="auto" +) + +processor = AriaProcessor.from_pretrained(model_id_or_path) + +image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + +messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"text": "what is the image?", "type": "text"}, + ], + } +] + +text = processor.apply_chat_template(messages, add_generation_prompt=True) +inputs = processor(text=text, images=image, return_tensors="pt") +inputs.to(model.device) + +output = model.generate( + **inputs, + max_new_tokens=15, + stop_strings=["<|im_end|>"], + tokenizer=processor.tokenizer, + do_sample=True, + temperature=0.9, +) +output_ids = output[0][inputs["input_ids"].shape[1]:] +response = processor.decode(output_ids, skip_special_tokens=True) +``` + + +## AriaImageProcessor + +[[autodoc]] AriaImageProcessor + +## AriaProcessor + +[[autodoc]] AriaProcessor + +## AriaTextConfig + +[[autodoc]] AriaTextConfig + +## AriaConfig + +[[autodoc]] AriaConfig + +## AriaTextModel + +[[autodoc]] AriaTextModel + +## AriaTextForCausalLM + +[[autodoc]] AriaTextForCausalLM + +## AriaForConditionalGeneration + +[[autodoc]] AriaForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/bamba.md b/docs/source/en/model_doc/bamba.md new file mode 100644 index 000000000000..4ea8475edb88 --- /dev/null +++ b/docs/source/en/model_doc/bamba.md @@ -0,0 +1,64 @@ + + +# Bamba + + +## Overview + +Bamba-9B is a decoder-only language model based on the [Mamba-2](https://github.com/state-spaces/mamba) architecture and is designed to handle a wide range of text generation tasks. It is trained from scratch using a two-stage training approach. In the first stage, the model is trained on 2 trillion tokens from the Dolma v1.7 dataset. In the second stage, it undergoes additional training on 200 billion tokens, leveraging a carefully curated blend of high-quality data to further refine its performance and enhance output quality. + +Checkout all Bamba-9B model checkpoints [here](https://github.com/foundation-model-stack/bamba). + +## BambaConfig + +| Model | Params | # Layers | Hidden Dim. | Attention Heads | GQA | KV Heads | Context Length | Tied Embeddings | +|-------------------|--------------|----------|-------------|-----------------|-----|----------|----------------|------------------| +| Bamba | 9B (9.78B) | 32 | 4096 | 32 | Yes | 8 | 4096 | True | + +[[autodoc]] BambaConfig + + + +## BambaForCausalLM + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("ibm-fms/Bamba-9B") +tokenizer = AutoTokenizer.from_pretrained("ibm-fms/Bamba-9B") + +message = ["Mamba is a snake with following properties "] +inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False) +response = model.generate(**inputs, max_new_tokens=64) +print(tokenizer.batch_decode(response, skip_special_tokens=True)[0]) +``` + +[[autodoc]] BambaForCausalLM + - forward + +This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim). 
diff --git a/docs/source/en/model_doc/beit.md b/docs/source/en/model_doc/beit.md index f7605ebcdf90..25b0eafb26a0 100644 --- a/docs/source/en/model_doc/beit.md +++ b/docs/source/en/model_doc/beit.md @@ -71,6 +71,43 @@ alt="drawing" width="600"/> BEiT pre-training. Taken from the original paper. +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import BeitForImageClassification +model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.5.1, OS Ubuntu 20.04) with `float16` and +`microsoft/beit-base-patch16-224` model, we saw the following improvements during training and inference: + +#### Training + +| num_training_steps | batch_size | image_size | is_cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | SDPA peak mem (MB) | Mem saving (%) | +|--------------------|------------|--------------|---------|----------------------------|---------------------------|-------------|----------------------|--------------------|----------------| +| 50 | 2 | (1048, 640) | True | 0.984 | 0.746 | 31.975 | 6738.915 | 4319.886 | 55.998 | + +#### Inference + +| Image batch size | Eager (s/iter) | Eager CI, % | Eager memory (MB) | SDPA (s/iter) | SDPA CI, % | SDPA memory (MB) | SDPA speedup | SDPA memory saved (%) | +|-------------------:|-----------------:|:--------------|--------------------:|----------------:|:-------------|-------------------:|---------------:|----------------------:| +| 1 | 0.012 | ±0.3% | 3.76657e+08 | 0.011 | ±0.5% | 3.75739e+08 | 1.05 | 0.244 | +| 4 | 0.013 | ±0.1% | 4.03147e+08 | 0.011 | ±0.2% | 3.90554e+08 | 1.178 | 3.225 | +| 16 | 0.045 | ±0.1% | 4.96697e+08 | 0.035 | ±0.1% | 4.51232e+08 | 1.304 | 10.076 | +| 32 | 0.088 | ±0.1% | 6.24417e+08 | 0.066 | ±0.1% | 5.33488e+08 | 1.325 | 17.044 | + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BEiT. diff --git a/docs/source/en/model_doc/cohere2.md b/docs/source/en/model_doc/cohere2.md new file mode 100644 index 000000000000..33e67d48fb0e --- /dev/null +++ b/docs/source/en/model_doc/cohere2.md @@ -0,0 +1,51 @@ +# Cohere + +## Overview +[C4AI Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model developed by Cohere and Cohere For AI. It has advanced capabilities optimized for various use cases, including reasoning, summarization, question answering, and code. The model is trained to perform sophisticated tasks including Retrieval Augmented Generation (RAG) and tool use. 
The model also has powerful agentic capabilities that can use and combine multiple tools over multiple steps to accomplish more difficult tasks. It obtains top performance on enterprise-relevant code use cases. C4AI Command R7B is a multilingual model trained on 23 languages. + +The model features three layers with sliding window attention (window size 4096) and ROPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence. + +The model has been trained on 23 languages: English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Arabic, Chinese, Russian, Polish, Turkish, Vietnamese, Dutch, Czech, Indonesian, Ukrainian, Romanian, Greek, Hindi, Hebrew, and Persian. + +## Usage tips +The model and tokenizer can be loaded via: + +```python +# pip install transformers +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "CohereForAI/c4ai-command-r7b-12-2024" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id) + +# Format message with the command-r chat template +messages = [{"role": "user", "content": "Hello, how are you?"}] +input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") + +gen_tokens = model.generate( + input_ids, + max_new_tokens=100, + do_sample=True, + temperature=0.3, +) + +gen_text = tokenizer.decode(gen_tokens[0]) +print(gen_text) +``` + +## Cohere2Config + +[[autodoc]] Cohere2Config + +## Cohere2Model + +[[autodoc]] Cohere2Model + - forward + + +## Cohere2ForCausalLM + +[[autodoc]] Cohere2ForCausalLM + - forward + + diff --git a/docs/source/en/model_doc/colpali.md b/docs/source/en/model_doc/colpali.md new file mode 100644 index 000000000000..3f6b0cbc6613 --- /dev/null +++ b/docs/source/en/model_doc/colpali.md @@ -0,0 +1,90 @@ + + +# ColPali + +## Overview + +The *ColPali* model was proposed in [ColPali: Efficient Document Retrieval with Vision Language Models](https://doi.org/10.48550/arXiv.2407.01449) by **Manuel Faysse***, **Hugues Sibille***, **Tony Wu***, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo (* denotes equal contribution). Work lead by ILLUIN Technology. + +In our proposed *ColPali* approach, we leverage VLMs to construct efficient multi-vector embeddings directly from document images (“screenshots”) for document retrieval. We train the model to maximize the similarity between these document embeddings and the corresponding query embeddings, using the late interaction method introduced in ColBERT. + +Using *ColPali* removes the need for potentially complex and brittle layout recognition and OCR pipelines with a single model that can take into account both the textual and visual content (layout, charts, etc.) of a document. + +## Resources + +- The *ColPali* arXiv paper can be found [here](https://doi.org/10.48550/arXiv.2407.01449). 📄 +- The official blog post detailing ColPali can be found [here](https://huggingface.co/blog/manu/colpali). 📝 +- The original model implementation code for the ColPali model and for the `colpali-engine` package can be found [here](https://github.com/illuin-tech/colpali). 🌎 +- Cookbooks for learning to use the transformers-native version of *ColPali*, fine-tuning, and similarity maps generation can be found [here](https://github.com/tonywu71/colpali-cookbooks). 
📚 + +This model was contributed by [@tonywu71](https://huggingface.co/tonywu71) and [@yonigozlan](https://huggingface.co/yonigozlan). + +## Usage + +This example demonstrates how to use *ColPali* to embed both queries and images, calculate their similarity scores, and identify the most relevant matches. For a specific query, you can retrieve the top-k most similar images by selecting the ones with the highest similarity scores. + +```python +import torch +from PIL import Image + +from transformers import ColPaliForRetrieval, ColPaliProcessor + +model_name = "vidore/colpali-v1.2-hf" + +model = ColPaliForRetrieval.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + device_map="cuda:0", # or "mps" if on Apple Silicon +).eval() + +processor = ColPaliProcessor.from_pretrained(model_name) + +# Your inputs (replace dummy images with screenshots of your documents) +images = [ + Image.new("RGB", (32, 32), color="white"), + Image.new("RGB", (16, 16), color="black"), +] +queries = [ + "What is the organizational structure for our R&D department?", + "Can you provide a breakdown of last year’s financial performance?", +] + +# Process the inputs +batch_images = processor(images=images).to(model.device) +batch_queries = processor(text=queries).to(model.device) + +# Forward pass +with torch.no_grad(): + image_embeddings = model(**batch_images).embeddings + query_embeddings = model(**batch_queries).embeddings + +# Score the queries against the images +scores = processor.score_retrieval(query_embeddings, image_embeddings) +``` + +## ColPaliConfig + +[[autodoc]] ColPaliConfig + +## ColPaliProcessor + +[[autodoc]] ColPaliProcessor + +## ColPaliForRetrieval + +[[autodoc]] ColPaliForRetrieval + - forward diff --git a/docs/source/en/model_doc/data2vec.md b/docs/source/en/model_doc/data2vec.md index 517a51ce46a3..cb1dc675caa5 100644 --- a/docs/source/en/model_doc/data2vec.md +++ b/docs/source/en/model_doc/data2vec.md @@ -48,6 +48,46 @@ The original code for vision can be found [here](https://github.com/facebookrese - For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization. - For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction. +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +The SDPA implementation is currently available for the Data2VecAudio and Data2VecVision models. + +``` +from transformers import Data2VecVisionForImageClassification +model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). 
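To make the half-precision recommendation above concrete, here is a minimal SDPA inference sketch. It is only an illustration: the `facebook/data2vec-vision-base-ft1k` checkpoint, the COCO sample image, and the CUDA device are assumptions, so substitute your own checkpoint and inputs.

```python
# A minimal end-to-end sketch of SDPA inference in half precision, assuming a CUDA GPU.
# The fine-tuned checkpoint and the sample image URL are illustrative choices.
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, Data2VecVisionForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

checkpoint = "facebook/data2vec-vision-base-ft1k"  # assumed ImageNet-1k fine-tuned checkpoint
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = Data2VecVisionForImageClassification.from_pretrained(
    checkpoint,
    attn_implementation="sdpa",
    torch_dtype=torch.float16,
).to("cuda")

# Cast pixel values to float16 so they match the half-precision weights
inputs = processor(images=image, return_tensors="pt").to("cuda", torch.float16)

with torch.no_grad():
    logits = model(**inputs).logits

print(model.config.id2label[logits.argmax(-1).item()])
```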
+
+For the Data2VecVision model, on a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.5.1, OS Ubuntu 20.04)
+with `float16` and `facebook/data2vec-vision-base` model, we saw the following improvements during training and
+inference:
+
+#### Training
+
+| num_training_steps | batch_size | image_size | is_cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | SDPA peak mem (MB) | Mem saving (%) |
+|---|---|---|---|---|---|---|---|---|---|
+| 50 | 2 | (1048, 640) | True | 0.996 | 0.754 | 32.147 | 6722.198 | 4264.653 | 57.626 |
+
+#### Inference
+
+| Image batch size | Eager (s/iter) | Eager CI, % | Eager memory (MB) | SDPA (s/iter) | SDPA CI, % | SDPA memory (MB) | SDPA speedup | SDPA memory saved |
+|---:|---:|:---|---:|---:|:---|---:|---:|---:|
+| 1 | 0.011 | ±0.3% | 3.76143e+08 | 0.01 | ±0.3% | 3.74397e+08 | 1.101 | 0.466 |
+| 4 | 0.014 | ±0.1% | 4.02756e+08 | 0.012 | ±0.2% | 3.91373e+08 | 1.219 | 2.909 |
+| 16 | 0.046 | ±0.3% | 4.96482e+08 | 0.035 | ±0.2% | 4.51017e+08 | 1.314 | 10.081 |
+| 32 | 0.088 | ±0.1% | 6.23903e+08 | 0.067 | ±0.1% | 5.32974e+08 | 1.33 | 17.061 |
+
 ## Resources
 
 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Data2Vec.
diff --git a/docs/source/en/model_doc/diffllama.md b/docs/source/en/model_doc/diffllama.md
new file mode 100644
index 000000000000..80afcfe433e9
--- /dev/null
+++ b/docs/source/en/model_doc/diffllama.md
@@ -0,0 +1,59 @@
+
+
+# DiffLlama
+
+## Overview
+
+The DiffLlama model was proposed in [Differential Transformer](https://arxiv.org/abs/2410.05258) by Kazuma Matsumoto.
+This model combines the Llama model with the Differential Transformer's attention mechanism.
+
+The abstract from the paper is the following:
+
+*Transformer tends to overallocate attention to irrelevant context. In this work, we introduce Diff Transformer, which amplifies attention to the relevant context while canceling noise. Specifically, the differential attention mechanism calculates attention scores as the difference between two separate softmax attention maps. The subtraction cancels noise, promoting the emergence of sparse attention patterns. Experimental results on language modeling show that Diff Transformer outperforms Transformer in various settings of scaling up model size and training tokens. More intriguingly, it offers notable advantages in practical applications, such as long-context modeling, key information retrieval, hallucination mitigation, in-context learning, and reduction of activation outliers. By being less distracted by irrelevant context, Diff Transformer can mitigate hallucination in question answering and text summarization. For in-context learning, Diff Transformer not only enhances accuracy but is also more robust to order permutation, which was considered as a chronic robustness issue. The results position Diff Transformer as a highly effective and promising architecture to advance large language models.*
+
+### Usage tips
+The hyperparameters of this model are the same as those of the Llama model.
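Since the usage tips only note that the hyperparameters mirror Llama, here is a minimal, hedged loading sketch. It assumes DiffLlama checkpoints load through the standard causal-LM auto classes, and the repository id below is a placeholder rather than a real checkpoint.

```python
# A minimal sketch, not an official example: it assumes DiffLlama checkpoints
# load through the standard causal-LM auto classes, exactly like Llama.
# "path/to/diffllama-checkpoint" is a placeholder, not a real Hub repository.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "path/to/diffllama-checkpoint"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

# Generation works the same way as for any causal LM in Transformers
inputs = tokenizer("Differential attention computes scores as", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```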
+ + +## DiffLlamaConfig + +[[autodoc]] DiffLlamaConfig + +## DiffLlamaModel + +[[autodoc]] DiffLlamaModel + - forward + +## DiffLlamaForCausalLM + +[[autodoc]] DiffLlamaForCausalLM + - forward + +## DiffLlamaForSequenceClassification + +[[autodoc]] DiffLlamaForSequenceClassification + - forward + +## DiffLlamaForQuestionAnswering + +[[autodoc]] DiffLlamaForQuestionAnswering + - forward + +## DiffLlamaForTokenClassification + +[[autodoc]] DiffLlamaForTokenClassification + - forward diff --git a/docs/source/en/model_doc/dinov2_with_registers.md b/docs/source/en/model_doc/dinov2_with_registers.md new file mode 100644 index 000000000000..360ebf9b8f8a --- /dev/null +++ b/docs/source/en/model_doc/dinov2_with_registers.md @@ -0,0 +1,54 @@ + + +# DINOv2 with Registers + +## Overview + +The DINOv2 with Registers model was proposed in [Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588) by Timothée Darcet, Maxime Oquab, Julien Mairal, Piotr Bojanowski. + +The [Vision Transformer](vit) (ViT) is a transformer encoder model (BERT-like) originally introduced to do supervised image classification on ImageNet. + +Next, people figured out ways to make ViT work really well on self-supervised image feature extraction (i.e. learning meaningful features, also called embeddings) on images without requiring any labels. Some example papers here include [DINOv2](dinov2) and [MAE](vit_mae). + +The authors of DINOv2 noticed that ViTs have artifacts in attention maps. It’s due to the model using some image patches as “registers”. The authors propose a fix: just add some new tokens (called "register" tokens), which you only use during pre-training (and throw away afterwards). This results in: +- no artifacts +- interpretable attention maps +- and improved performances. + +The abstract from the paper is the following: + +*Transformers have recently emerged as a powerful tool for learning visual representations. In this paper, we identify and characterize artifacts in feature maps of both supervised and self-supervised ViT networks. The artifacts correspond to high-norm tokens appearing during inference primarily in low-informative background areas of images, that are repurposed for internal computations. We propose a simple yet effective solution based on providing additional tokens to the input sequence of the Vision Transformer to fill that role. We show that this solution fixes that problem entirely for both supervised and self-supervised models, sets a new state of the art for self-supervised visual models on dense visual prediction tasks, enables object discovery methods with larger models, and most importantly leads to smoother feature maps and attention maps for downstream visual processing.* + + + + Visualization of attention maps of various models trained with vs. without registers. Taken from the original paper. + +Tips: + +- Usage of DINOv2 with Registers is identical to DINOv2 without, you'll just get better performance. + +This model was contributed by [nielsr](https://huggingface.co/nielsr). +The original code can be found [here](https://github.com/facebookresearch/dinov2). 
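Because usage is stated to be identical to DINOv2, a minimal feature-extraction sketch is given below; the `facebook/dinov2-with-registers-base` checkpoint name is an assumption, so check the Hub for the checkpoints that were actually released.

```python
# A minimal feature-extraction sketch. The checkpoint name below is an assumption;
# check the Hub for the DINOv2-with-registers checkpoints that were actually released.
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

checkpoint = "facebook/dinov2-with-registers-base"  # assumed checkpoint name
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# The hidden state of the [CLS] token (index 0) can serve as a global image descriptor
cls_embedding = outputs.last_hidden_state[:, 0]
print(cls_embedding.shape)
```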
+ + +## Dinov2WithRegistersConfig + +[[autodoc]] Dinov2WithRegistersConfig + +## Dinov2WithRegistersModel + +[[autodoc]] Dinov2WithRegistersModel + - forward + +## Dinov2WithRegistersForImageClassification + +[[autodoc]] Dinov2WithRegistersForImageClassification + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/emu3.md b/docs/source/en/model_doc/emu3.md new file mode 100644 index 000000000000..619c9a3be51f --- /dev/null +++ b/docs/source/en/model_doc/emu3.md @@ -0,0 +1,179 @@ + + +# Emu3 + +## Overview + +The Emu3 model was proposed in [Emu3: Next-Token Prediction is All You Need](https://arxiv.org/abs/2409.18869) by Xinlong Wang, Xiaosong Zhang, Zhengxiong Luo, Quan Sun, Yufeng Cui, Jinsheng Wang, Fan Zhang, Yueze Wang, Zhen Li, Qiying Yu, Yingli Zhao, Yulong Ao, Xuebin Min, Tao Li, Boya Wu, Bo Zhao, Bowen Zhang, Liangdong Wang, Guang Liu, Zheqi He, Xi Yang, Jingjing Liu, Yonghua Lin, Tiejun Huang, Zhongyuan Wang. + +Emu3 is a multimodal LLM that uses vector quantization to tokenize images into discrete tokens. Discretized image tokens are later fused with text token ids for image and text generation. The model can additionally generate images by predicting image token ids. + + +The abstract from the paper is the following: + +*While next-token prediction is considered a promising path towards artificial general intelligence, it has struggled to excel in multimodal tasks, which are still dominated by diffusion models (e.g., Stable Diffusion) and compositional approaches (e.g., CLIP combined with LLMs). In this paper, we introduce Emu3, a new suite of state-of-the-art multimodal models trained solely with next-token prediction. By tokenizing images, text, and videos into a discrete space, we train a single transformer from scratch on a mixture of multimodal sequences. Emu3 outperforms several well-established task-specific models in both generation and perception tasks, surpassing flagship models such as SDXL and LLaVA-1.6, while eliminating the need for diffusion or compositional architectures. Emu3 is also capable of generating high-fidelity video via predicting the next token in a video sequence. We simplify complex multimodal model designs by converging on a singular focus: tokens, unlocking great potential for scaling both during training and inference. Our results demonstrate that next-token prediction is a promising path towards building general multimodal intelligence beyond language. We open-source key techniques and models to support further research in this direction.* + +Tips: + +- We advise users to set `processor.tokenizer.padding_side = "left"` before batched generation as it leads to more accurate results. + +- Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts. + +- Emu3 has two different checkpoints for image-generation and text-generation, make sure to use the correct checkpoint when loading the model. To generate an image, it is advised to use `prefix_constraints` so that the generated tokens are sampled only from possible image tokens. See more below for usage examples. + +> [!TIP] +> Emu3 implementation in Transformers uses a special image token to indicate where to merge image embeddings. The special image token isn't new and uses one of the reserved tokens: `<|extra_0|>`. You have to add `` to your prompt in the place where the image should be embedded for correct generation. 
+ + +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/baaivision/Emu3). + + +## Usage example + +### Text generation inference + +Here's how to load the model and perform inference in half-precision (`torch.bfloat16`) to generate textual output from text or text and image inputs: + +```python +from transformers import Emu3Processor, Emu3ForConditionalGeneration +import torch +from PIL import Image +import requests + +processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf") +model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", torch_dtype=torch.bfloat16, device_map="cuda") + +# prepare image and text prompt +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +prompt = "What do you see in this image?" + +inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype=torch.bfloat16) + +# autoregressively complete prompt +output = model.generate(**inputs, max_new_tokens=50) +print(processor.decode(output[0], skip_special_tokens=True)) +``` + +### Image generation inference + +Emu3 can also generate images from textual input. Here is how you can do it: + +```python +processor = Emu3Processor.from_pretrained("BAAI/Emu3-Gen-hf") +model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Gen-hf", torch_dtype="bfloat16", device_map="auto", attn_implementation="flash_attention_2") + + +inputs = processor( + text=["a portrait of young girl. masterpiece, film grained, best quality.", "a dog running under the rain"], + padding=True, + return_tensors="pt", + return_for_image_generation=True, +) +inputs = inputs.to(device="cuda:0", dtype=torch.bfloat16) + +neg_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry." 
+neg_inputs = processor(text=[neg_prompt] * 2, return_tensors="pt").to(device="cuda:0") + +image_sizes = inputs.pop("image_sizes") +HEIGHT, WIDTH = image_sizes[0] +VISUAL_TOKENS = model.vocabulary_mapping.image_tokens + +def prefix_allowed_tokens_fn(batch_id, input_ids): + height, width = HEIGHT, WIDTH + visual_tokens = VISUAL_TOKENS + image_wrapper_token_id = torch.tensor([processor.tokenizer.image_wrapper_token_id], device=model.device) + eoi_token_id = torch.tensor([processor.tokenizer.eoi_token_id], device=model.device) + eos_token_id = torch.tensor([processor.tokenizer.eos_token_id], device=model.device) + pad_token_id = torch.tensor([processor.tokenizer.pad_token_id], device=model.device) + eof_token_id = torch.tensor([processor.tokenizer.eof_token_id], device=model.device) + eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0] + + position = torch.nonzero(input_ids == image_wrapper_token_id, as_tuple=True)[0][0] + offset = input_ids.shape[0] - position + if offset % (width + 1) == 0: + return (eol_token_id, ) + elif offset == (width + 1) * height + 1: + return (eof_token_id, ) + elif offset == (width + 1) * height + 2: + return (eoi_token_id, ) + elif offset == (width + 1) * height + 3: + return (eos_token_id, ) + elif offset > (width + 1) * height + 3: + return (pad_token_id, ) + else: + return visual_tokens + + +out = model.generate( + **inputs, + max_new_tokens=50_000, # make sure to have enough tokens for one image + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + return_dict_in_generate=True, + negative_prompt_ids=neg_inputs.input_ids, # indicate for Classifier-Free Guidance + negative_prompt_attention_mask=neg_inputs.attention_mask, +) + +image = model.decode_image_tokens(out.sequences[:, inputs.input_ids.shape[1]: ], height=HEIGHT, width=WIDTH) +images = processor.postprocess(list(image.float()), return_tensors="PIL.Image.Image") # internally we convert to np but it's not supported in bf16 precision +for i, image in enumerate(images['pixel_values']): + image.save(f"result{i}.png") + +``` + + +## Emu3Config + +[[autodoc]] Emu3Config + +## Emu3VQVAEConfig + +[[autodoc]] Emu3VQVAEConfig + +## Emu3TextConfig + +[[autodoc]] Emu3TextConfig + +## Emu3Processor + +[[autodoc]] Emu3Processor + +## Emu3ImageProcessor + +[[autodoc]] Emu3ImageProcessor + - preprocess + +## Emu3VQVAE + +[[autodoc]] Emu3VQVAE + - forward + +## Emu3TextModel + +[[autodoc]] Emu3TextModel + - forward + +## Emu3ForCausalLM + +[[autodoc]] Emu3ForCausalLM + - forward + +## Emu3ForConditionalGeneration + +[[autodoc]] Emu3ForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/falcon3.md b/docs/source/en/model_doc/falcon3.md new file mode 100644 index 000000000000..813533dd7f4d --- /dev/null +++ b/docs/source/en/model_doc/falcon3.md @@ -0,0 +1,29 @@ + + +# Falcon3 + +## Overview + +Falcon3 represents a natural evolution from previous releases, emphasizing expanding the models' science, math, and code capabilities. This iteration includes five base models: Falcon3-1B-Base, Falcon3-3B-Base, Falcon3-Mamba-7B-Base, Falcon3-7B-Base, and Falcon3-10B-Base. In developing these models, we incorporated several key innovations aimed at improving the models' performances while reducing training costs: + +One pre-training: We conducted a single large-scale pretraining run on the 7B model, using 2048 H100 GPU chips, leveraging 14 trillion tokens featuring web, code, STEM, and curated high-quality and multilingual data. 
+Depth up-scaling for improved reasoning: Building on recent studies on the effects of model depth, we upscaled the 7B model to a 10B parameters model by duplicating the redundant layers and continuing pre-training with 2TT of high-quality data. This yielded Falcon3-10B-Base which achieves state-of-the-art zero-shot and few-shot performance for models under 13B parameters. +Knowledge distillation for better tiny models: To provide compact and efficient alternatives, we developed Falcon3-1B-Base and Falcon3-3B-Base by leveraging pruning and knowledge distillation techniques, using less than 100GT of curated high-quality data, thereby redefining pre-training efficiency. + +## Resources +- [Blog post](https://huggingface.co/blog/falcon3) +- [Models on Huggingface](https://huggingface.co/collections/tiiuae/falcon3-67605ae03578be86e4e87026) diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md index be0b367b62ec..1268b2e7cf9c 100644 --- a/docs/source/en/model_doc/glm.md +++ b/docs/source/en/model_doc/glm.md @@ -56,7 +56,7 @@ In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. N >>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> device = "cuda" # the device to load the model onto ->>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto") +>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True) >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat") >>> prompt = "Give me a short introduction to large language model." diff --git a/docs/source/en/model_doc/granitevision.md b/docs/source/en/model_doc/granitevision.md new file mode 100644 index 000000000000..42f9df2ee31c --- /dev/null +++ b/docs/source/en/model_doc/granitevision.md @@ -0,0 +1,90 @@ + + +# Granite Vision + +## Overview + +The Granite Vision model is a variant of [LLaVA-NeXT](llava_next), leveraging a [Granite](granite) language model alongside a [SigLIP](SigLIP) visual encoder. It utilizes multiple concatenated vision hidden states as its image features, similar to [VipLlava](vipllava). It also uses a larger set of image grid pinpoints than the original LlaVa-NeXT models to support additional aspect ratios. + +Tips: +- This model is loaded into Transformers as an instance of LlaVA-Next. The usage and tips from [LLaVA-NeXT](llava_next) apply to this model as well. + +- You can apply the chat template on the tokenizer / processor in the same way as well. Example chat format: +```bash +"<|user|>\nWhat’s shown in this image?\n<|assistant|>\nThis image shows a red stop sign.<|end_of_text|><|user|>\nDescribe the image in more details.\n<|assistant|>\n" +``` + +Sample inference: +```python +from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration +from PIL import Image +import requests + +# Note: These docs were written prior to the public model release, +# and this path is subject to change. +# Please see https://huggingface.co/ibm-granite for the current model list. 
+model_path = "ibm-granite/granite-3.1-2b-instruct-vision" +processor = LlavaNextProcessor.from_pretrained(model_path) + +model = LlavaNextForConditionalGeneration.from_pretrained(model_path).to("cuda") + +# prepare image and text prompt, using the appropriate prompt template +url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" + +conversation = [ + { + "role": "user", + "content": [ + {"type": "image", "url": url}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, +] +inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt" +).to("cuda") + + +# autoregressively complete prompt +output = model.generate(**inputs, max_new_tokens=100) + +print(processor.decode(output[0], skip_special_tokens=True)) +``` + +This model was contributed by [Alexander Brooks](https://huggingface.co/abrooks9944). + +## LlavaNextConfig + +[[autodoc]] LlavaNextConfig + +## LlavaNextImageProcessor + +[[autodoc]] LlavaNextImageProcessor + - preprocess + +## LlavaNextProcessor + +[[autodoc]] LlavaNextProcessor + +## LlavaNextForConditionalGeneration + +[[autodoc]] LlavaNextForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index a6da554f8d50..1b9104eb963e 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -56,9 +56,9 @@ Here's how to use the model for zero-shot object detection: >>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(image_url, stream=True).raw) >>> # Check for cats and remote controls ->>> text = "a cat. a remote control." +>>> text_labels = [["a cat", "a remote control"]] ->>> inputs = processor(images=image, text=text, return_tensors="pt").to(device) +>>> inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device) >>> with torch.no_grad(): ... outputs = model(**inputs) @@ -69,12 +69,14 @@ Here's how to use the model for zero-shot object detection: ... text_threshold=0.3, ... target_sizes=[image.size[::-1]] ... ) ->>> print(results) -[{'boxes': tensor([[344.6959, 23.1090, 637.1833, 374.2751], - [ 12.2666, 51.9145, 316.8582, 472.4392], - [ 38.5742, 70.0015, 176.7838, 118.1806]], device='cuda:0'), - 'labels': ['a cat', 'a cat', 'a remote control'], - 'scores': tensor([0.4785, 0.4381, 0.4776], device='cuda:0')}] + +# Retrieve the first image result +>>> result = results[0] +>>> for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]): +... box = [round(x, 2) for x in box.tolist()] +... print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}") +Detected a cat with confidence 0.468 at location [344.78, 22.9, 637.3, 373.62] +Detected a cat with confidence 0.426 at location [11.74, 51.55, 316.51, 473.22] ``` ## Grounded SAM diff --git a/docs/source/en/model_doc/helium.md b/docs/source/en/model_doc/helium.md new file mode 100644 index 000000000000..df5927544df9 --- /dev/null +++ b/docs/source/en/model_doc/helium.md @@ -0,0 +1,158 @@ + + +# Helium + + +## Overview + +Helium was proposed in [Announcing Helium-1 Preview](https://kyutai.org/2025/01/13/helium.html) by the Kyutai Team. + + +Helium-1 preview is a lightweight language model with 2B parameters, targeting edge and mobile devices. 
+It supports the following languages: English, French, German, Italian, Portuguese, Spanish. + +- **Developed by:** Kyutai +- **Model type:** Large Language Model +- **Language(s) (NLP):** English, French, German, Italian, Portuguese, Spanish +- **License:** CC-BY 4.0 + + + + +## Evaluation + + + +#### Testing Data + + + +The model was evaluated on MMLU, TriviaQA, NaturalQuestions, ARC Easy & Challenge, Open Book QA, Common Sense QA, +Physical Interaction QA, Social Interaction QA, HellaSwag, WinoGrande, Multilingual Knowledge QA, FLORES 200. + +#### Metrics + + + +We report accuracy on MMLU, ARC, OBQA, CSQA, PIQA, SIQA, HellaSwag, WinoGrande. +We report exact match on TriviaQA, NQ and MKQA. +We report BLEU on FLORES. + +### English Results + +| Benchmark | Helium-1 Preview | HF SmolLM2 (1.7B) | Gemma-2 (2.6B) | Llama-3.2 (3B) | Qwen2.5 (1.5B) | +|--------------|--------|--------|--------|--------|--------| +| | | | | | | +| MMLU | 51.2 | 50.4 | 53.1 | 56.6 | 61.0 | +| NQ | 17.3 | 15.1 | 17.7 | 22.0 | 13.1 | +| TQA | 47.9 | 45.4 | 49.9 | 53.6 | 35.9 | +| ARC E | 80.9 | 81.8 | 81.1 | 84.6 | 89.7 | +| ARC C | 62.7 | 64.7 | 66.0 | 69.0 | 77.2 | +| OBQA | 63.8 | 61.4 | 64.6 | 68.4 | 73.8 | +| CSQA | 65.6 | 59.0 | 64.4 | 65.4 | 72.4 | +| PIQA | 77.4 | 77.7 | 79.8 | 78.9 | 76.0 | +| SIQA | 64.4 | 57.5 | 61.9 | 63.8 | 68.7 | +| HS | 69.7 | 73.2 | 74.7 | 76.9 | 67.5 | +| WG | 66.5 | 65.6 | 71.2 | 72.0 | 64.8 | +| | | | | | | +| Average | 60.7 | 59.3 | 62.2 | 64.7 | 63.6 | + +#### Multilingual Results + +| Language | Benchmark | Helium-1 Preview | HF SmolLM2 (1.7B) | Gemma-2 (2.6B) | Llama-3.2 (3B) | Qwen2.5 (1.5B) | +|-----|--------------|--------|--------|--------|--------|--------| +| | | | | | | | +|German| MMLU | 45.6 | 35.3 | 45.0 | 47.5 | 49.5 | +|| ARC C | 56.7 | 38.4 | 54.7 | 58.3 | 60.2 | +|| HS | 53.5 | 33.9 | 53.4 | 53.7 | 42.8 | +|| MKQA | 16.1 | 7.1 | 18.9 | 20.2 | 10.4 | +| | | | | | | | +|Spanish| MMLU | 46.5 | 38.9 | 46.2 | 49.6 | 52.8 | +|| ARC C | 58.3 | 43.2 | 58.8 | 60.0 | 68.1 | +|| HS | 58.6 | 40.8 | 60.5 | 61.1 | 51.4 | +|| MKQA | 16.0 | 7.9 | 18.5 | 20.6 | 10.6 | + + +## Technical Specifications + +### Model Architecture and Objective + +| Hyperparameter | Value | +|--------------|--------| +| Layers | 24 | +| Heads | 20 | +| Model dimension | 2560 | +| MLP dimension | 7040 | +| Context size | 4096 | +| Theta RoPE | 100,000 | + +Tips: + +- This model was contributed by [Laurent Mazare](https://huggingface.co/lmz) + + +## Usage tips + +`Helium` can be found on the [Huggingface Hub](https://huggingface.co/collections/kyutai/helium-1-preview) + +In the following, we demonstrate how to use `helium-1-preview` for the inference. + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer +>>> device = "cuda" # the device to load the model onto + +>>> model = AutoModelForCausalLM.from_pretrained("helium-1-preview", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("helium-1-preview") + +>>> prompt = "Give me a short introduction to large language model." 
+ +>>> messages = [{"role": "user", "content": prompt}] + +>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + +>>> model_inputs = tokenizer([text], return_tensors="pt").to(device) + +>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True) + +>>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] + +>>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +## HeliumConfig + +[[autodoc]] HeliumConfig + +## HeliumModel + +[[autodoc]] HeliumModel + - forward + +## HeliumForCausalLM + +[[autodoc]] HeliumForCausalLM + - forward + +## HeliumForSequenceClassification + +[[autodoc]] HeliumForSequenceClassification + - forward + +## HeliumForTokenClassification + +[[autodoc]] HeliumForTokenClassification + - forward diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 5ad56b7b5c52..b9b51082f29e 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -141,7 +141,7 @@ Do note that when training Idefics2 on multi-turn conversations between a user a ## Model optimizations: Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index dfaf40477a7b..cf7c043e9289 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -51,6 +51,13 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) [[autodoc]] Idefics3Config +## Idefics3VisionConfig + +[[autodoc]] Idefics3VisionConfig + +## Idefics3VisionTransformer + +[[autodoc]] Idefics3VisionTransformer ## Idefics3Model diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md new file mode 100644 index 000000000000..cb2afd25e20b --- /dev/null +++ b/docs/source/en/model_doc/ijepa.md @@ -0,0 +1,92 @@ + + +# I-JEPA + +## Overview + +The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/abs/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas. +I-JEPA is a self-supervised learning method that predicts the representations of one part of an image based on other parts of the same image. This approach focuses on learning semantic features without relying on pre-defined invariances from hand-crafted data transformations, which can bias specific tasks, or on filling in pixel-level details, which often leads to less meaningful representations. + +The abstract from the paper is the following: + +This paper demonstrates an approach for learning highly semantic image representations without relying on hand-crafted data-augmentations. 
We introduce the Image- based Joint-Embedding Predictive Architecture (I-JEPA), a non-generative approach for self-supervised learning from images. The idea behind I-JEPA is simple: from a single context block, predict the representations of various target blocks in the same image. A core design choice to guide I-JEPA towards producing semantic representations is the masking strategy; specifically, it is crucial to (a) sample tar- get blocks with sufficiently large scale (semantic), and to (b) use a sufficiently informative (spatially distributed) context block. Empirically, when combined with Vision Transform- ers, we find I-JEPA to be highly scalable. For instance, we train a ViT-Huge/14 on ImageNet using 16 A100 GPUs in under 72 hours to achieve strong downstream performance across a wide range of tasks, from linear classification to object counting and depth prediction. + + + + I-JEPA architecture. Taken from the original paper. + +This model was contributed by [jmtzt](https://huggingface.co/jmtzt). +The original code can be found [here](https://github.com/facebookresearch/ijepa). + +## How to use + +Here is how to use this model for image feature extraction: + +```python +import requests +import torch +from PIL import Image +from torch.nn.functional import cosine_similarity + +from transformers import AutoModel, AutoProcessor + +url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg" +url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg" +image_1 = Image.open(requests.get(url_1, stream=True).raw) +image_2 = Image.open(requests.get(url_2, stream=True).raw) + +model_id = "facebook/ijepa_vith14_1k" +processor = AutoProcessor.from_pretrained(model_id) +model = AutoModel.from_pretrained(model_id) + +@torch.no_grad() +def infer(image): + inputs = processor(image, return_tensors="pt") + outputs = model(**inputs) + return outputs.last_hidden_state.mean(dim=1) + + +embed_1 = infer(image_1) +embed_2 = infer(image_2) + +similarity = cosine_similarity(embed_1, embed_2) +print(similarity) +``` + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with I-JEPA. + + + +- [`IJepaForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). 
+- See also: [Image classification task guide](../tasks/image_classification) + +## IJepaConfig + +[[autodoc]] IJepaConfig + +## IJepaModel + +[[autodoc]] IJepaModel + - forward + +## IJepaForImageClassification + +[[autodoc]] IJepaForImageClassification + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index dec19ca5ef45..a3afc216b776 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -131,7 +131,7 @@ prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=T prompts = [prompt_1, prompt_2] # We can simply feed images in the order they have to be used in the text prompt -inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16) +inputs = processor(images=[image_stop, image_cats], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16) # Generate generate_ids = model.generate(**inputs, max_new_tokens=30) @@ -162,6 +162,16 @@ For multiple turns conversation: "USER: \n ASSISTANT:
USER: ASSISTANT: USER: ASSISTANT:"
 ```
 
+## Note regarding reproducing original implementation
+
+In order to match the logits of the [original implementation](https://github.com/haotian-liu/LLaVA/tree/main), one needs to additionally specify `do_pad=True` when instantiating `LlavaImageProcessor`:
+
+```python
+from transformers import LlavaImageProcessor
+
+image_processor = LlavaImageProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", do_pad=True)
+```
+
 ### Using Flash Attention 2
 
 Flash Attention 2 is an even faster, optimized version of the previous optimization, please refer to the [Flash Attention 2 section of performance docs](https://huggingface.co/docs/transformers/perf_infer_gpu_one).
diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md
index f8a149f12b67..cc3a61aae6c7 100644
--- a/docs/source/en/model_doc/llava_next_video.md
+++ b/docs/source/en/model_doc/llava_next_video.md
@@ -240,7 +240,7 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-N
 ### Flash-Attention 2 to speed-up generation
 
-Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
+Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
 
 First, make sure to install the latest version of Flash Attention 2:
diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md
index b6b0a2bfa1d1..41a23e3da81b 100644
--- a/docs/source/en/model_doc/llava_onevision.md
+++ b/docs/source/en/model_doc/llava_onevision.md
@@ -81,7 +81,7 @@ text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=
 # Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
 print(text_prompt)
->>> "<|im_start|>user\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>"
+'<|im_start|>user\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>'
 ```
 
 This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md
index 2be657109a8d..cfa2af367813 100644
--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@@ -91,7 +91,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t
 ## Speeding up Mistral by using Flash Attention
 
-The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
+The code snippets above showcase inference without any optimization tricks.
However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 7afcaa798eca..b5451702e44a 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -93,7 +93,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t ## Speeding up Mixtral by using Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/modernbert.md b/docs/source/en/model_doc/modernbert.md new file mode 100644 index 000000000000..e90f34a903e4 --- /dev/null +++ b/docs/source/en/model_doc/modernbert.md @@ -0,0 +1,95 @@ + + +# ModernBERT + +
+
+## Overview
+
+The ModernBERT model was proposed in [Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference](https://arxiv.org/abs/2412.13663) by Benjamin Warner, Antoine Chaffin, Benjamin Clavié, Orion Weller, Oskar Hallström, Said Taghadouini, Alexis Gallagher, Raja Biswas, Faisal Ladhak, Tom Aarsen, Nathan Cooper, Griffin Adams, Jeremy Howard and Iacopo Poli.
+
+It is a refresh of the traditional encoder architecture, as used in previous models such as [BERT](https://huggingface.co/docs/transformers/en/model_doc/bert) and [RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/roberta).
+
+It builds on BERT and implements many modern architectural improvements which have been developed since its original release, such as:
+- [Rotary Positional Embeddings](https://huggingface.co/blog/designing-positional-encoding) to support sequences of up to 8192 tokens.
+- [Unpadding](https://arxiv.org/abs/2208.08124) to ensure no compute is wasted on padding tokens, speeding up processing time for batches with mixed-length sequences.
+- [GeGLU](https://arxiv.org/abs/2002.05202) layers replacing the original MLP layers, shown to improve performance.
+- [Alternating Attention](https://arxiv.org/abs/2004.05150v2), where most attention layers employ a sliding window of 128 tokens, with Global Attention only used every 3 layers.
+- [Flash Attention](https://github.com/Dao-AILab/flash-attention) to speed up processing.
+- A model design that follows the recommendations of [The Case for Co-Designing Model Architectures with Hardware](https://arxiv.org/abs/2401.14489), ensuring maximum efficiency on common inference GPUs.
+- Modern training data scales (2 trillion tokens) and mixtures (including code and math data).
+
+The abstract from the paper is the following:
+
+*Encoder-only transformer models such as BERT offer a great performance-size tradeoff for retrieval and classification tasks with respect to larger decoder-only models. Despite being the workhorse of numerous production pipelines, there have been limited Pareto improvements to BERT since its release. In this paper, we introduce ModernBERT, bringing modern model optimizations to encoder-only models and representing a major Pareto improvement over older encoders. Trained on 2 trillion tokens with a native 8192 sequence length, ModernBERT models exhibit state-of-the-art results on a large pool of evaluations encompassing diverse classification tasks and both single and multi-vector retrieval on different domains (including code). In addition to strong downstream performance, ModernBERT is also the most speed and memory efficient encoder and is designed for inference on common GPUs.*
+
+The original code can be found [here](https://github.com/answerdotai/modernbert).
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ModernBert.
+
+
+
+- A notebook on how to [finetune for General Language Understanding Evaluation (GLUE) with Transformers](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/finetune_modernbert_on_glue.ipynb), also available as a Google Colab [notebook](https://colab.research.google.com/github/AnswerDotAI/ModernBERT/blob/main/examples/finetune_modernbert_on_glue.ipynb). 🌎
+
+
+
+- A script on how to [finetune for text similarity or information retrieval with Sentence Transformers](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/train_st.py).
🌎 +- A script on how to [finetune for information retrieval with PyLate](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/train_pylate.py). 🌎 + + + +- [Masked language modeling task guide](../tasks/masked_language_modeling) + + +## ModernBertConfig + +[[autodoc]] ModernBertConfig + + + + +## ModernBertModel + +[[autodoc]] ModernBertModel + - forward + +## ModernBertForMaskedLM + +[[autodoc]] ModernBertForMaskedLM + - forward + +## ModernBertForSequenceClassification + +[[autodoc]] ModernBertForSequenceClassification + - forward + +## ModernBertForTokenClassification + +[[autodoc]] ModernBertForTokenClassification + - forward + + + diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md new file mode 100644 index 000000000000..571e3febdb4f --- /dev/null +++ b/docs/source/en/model_doc/moonshine.md @@ -0,0 +1,56 @@ + + +# Moonshine + +## Overview + +The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands +](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden. + +The abstract from the paper is the following: + +*This paper introduces Moonshine, a family of speech recognition models optimized for live transcription and voice command processing. Moonshine is based on an encoder-decoder transformer architecture and employs Rotary Position Embedding (RoPE) instead of traditional absolute position embeddings. The model is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time. When benchmarked against OpenAI's Whisper tiny-en, Moonshine Tiny demonstrates a 5x reduction in compute requirements for transcribing a 10-second speech segment while incurring no increase in word error rates across standard evaluation datasets. These results highlight Moonshine's potential for real-time and resource-constrained applications.* + +Tips: + +- Moonshine improves upon Whisper's architecture: + 1. It uses SwiGLU activation instead of GELU in the decoder layers + 2. Most importantly, it replaces absolute position embeddings with Rotary Position Embeddings (RoPE). This allows Moonshine to handle audio inputs of any length, unlike Whisper which is restricted to fixed 30-second windows. + +This model was contributed by [Eustache Le Bihan (eustlb)](https://huggingface.co/eustlb). +The original code can be found [here](https://github.com/usefulsensors/moonshine). 
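+
+Below is a minimal transcription sketch using the high-level [`pipeline`] API. The checkpoint id is an assumption (check the Hub for the checkpoints released with the model), and any 16 kHz mono speech array can be substituted for the sample:
+
+```python
+from datasets import load_dataset
+from transformers import pipeline
+
+# Assumed checkpoint id -- replace with an officially released Moonshine checkpoint.
+asr = pipeline("automatic-speech-recognition", model="UsefulSensors/moonshine-tiny")
+
+# A short 16 kHz English sample from a small test dataset on the Hub.
+sample = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")[0]["audio"]["array"]
+
+print(asr(sample)["text"])
+```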
+ +## Resources + +- [Automatic speech recognition task guide](../tasks/asr) + +## MoonshineConfig + +[[autodoc]] MoonshineConfig + +## MoonshineModel + +[[autodoc]] MoonshineModel + - forward + - _mask_input_features + +## MoonshineForConditionalGeneration + +[[autodoc]] MoonshineForConditionalGeneration + - forward + - generate + diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md index 64216f570e3e..2e2c5655de45 100644 --- a/docs/source/en/model_doc/moshi.md +++ b/docs/source/en/model_doc/moshi.md @@ -110,9 +110,14 @@ To follow the example of the following image, `"Hello, I'm Moshi"` could be tran >>> from datasets import load_dataset, Audio >>> import torch, math >>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer ->>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/moshiko-pytorch-bf16") +>>> tokenizer = AutoTokenizer.from_pretrained("kyutai/moshiko-pytorch-bf16") +>>> device = "cuda" +>>> dtype = torch.bfloat16 + >>> # prepare user input audio >>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate)) >>> audio_sample = librispeech_dummy[-1]["audio"]["array"] diff --git a/docs/source/en/model_doc/musicgen_melody.md b/docs/source/en/model_doc/musicgen_melody.md index 4d92d861f0bb..7b67713c42b7 100644 --- a/docs/source/en/model_doc/musicgen_melody.md +++ b/docs/source/en/model_doc/musicgen_melody.md @@ -266,7 +266,6 @@ Tips: ## MusicgenMelodyFeatureExtractor [[autodoc]] MusicgenMelodyFeatureExtractor - - _extract_stem_indices ## MusicgenMelodyConfig diff --git a/docs/source/en/model_doc/olmo_1124.md b/docs/source/en/model_doc/olmo2.md similarity index 84% rename from docs/source/en/model_doc/olmo_1124.md rename to docs/source/en/model_doc/olmo2.md index f36ec438e57a..8ca3326660b3 100644 --- a/docs/source/en/model_doc/olmo_1124.md +++ b/docs/source/en/model_doc/olmo2.md @@ -14,11 +14,11 @@ rendered properly in your Markdown viewer. --> -# OLMo November 2024 +# OLMo2 ## Overview -The OLMo November 2024 model is a successor of the OLMo model, which was proposed in +The OLMo2 model is the successor of the OLMo model, which was proposed in [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838). The architectural changes from the original OLMo model to this model are: @@ -31,16 +31,16 @@ This model was contributed by [shanearora](https://huggingface.co/shanearora). The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo). 
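+
+OLMo2 loads through the standard causal language modeling auto classes. The snippet below is a minimal generation sketch; the checkpoint id is an assumption, so substitute whichever OLMo2 checkpoint you want to use from the Hub:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Assumed checkpoint id -- replace with an available OLMo2 checkpoint.
+checkpoint = "allenai/OLMo-2-1124-7B"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")
+
+inputs = tokenizer("Language modeling is ", return_tensors="pt").to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=32, do_sample=False)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```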
-## Olmo1124Config +## Olmo2Config -[[autodoc]] Olmo1124Config +[[autodoc]] Olmo2Config -## Olmo1124Model +## Olmo2Model -[[autodoc]] Olmo1124Model +[[autodoc]] Olmo2Model - forward -## Olmo1124ForCausalLM +## Olmo2ForCausalLM -[[autodoc]] Olmo1124ForCausalLM +[[autodoc]] Olmo2ForCausalLM - forward diff --git a/docs/source/en/model_doc/omdet-turbo.md b/docs/source/en/model_doc/omdet-turbo.md index 1e9e05a898d2..91419919b6e0 100644 --- a/docs/source/en/model_doc/omdet-turbo.md +++ b/docs/source/en/model_doc/omdet-turbo.md @@ -44,37 +44,40 @@ One unique property of OmDet-Turbo compared to other zero-shot object detection Here's how to load the model and prepare the inputs to perform zero-shot object detection on a single image: ```python -import requests -from PIL import Image - -from transformers import AutoProcessor, OmDetTurboForObjectDetection - -processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf") -model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf") - -url = "http://images.cocodataset.org/val2017/000000039769.jpg" -image = Image.open(requests.get(url, stream=True).raw) -classes = ["cat", "remote"] -inputs = processor(image, text=classes, return_tensors="pt") - -outputs = model(**inputs) - -# convert outputs (bounding boxes and class logits) -results = processor.post_process_grounded_object_detection( - outputs, - classes=classes, - target_sizes=[image.size[::-1]], - score_threshold=0.3, - nms_threshold=0.3, -)[0] -for score, class_name, box in zip( - results["scores"], results["classes"], results["boxes"] -): - box = [round(i, 1) for i in box.tolist()] - print( - f"Detected {class_name} with confidence " - f"{round(score.item(), 2)} at location {box}" - ) +>>> import torch +>>> import requests +>>> from PIL import Image + +>>> from transformers import AutoProcessor, OmDetTurboForObjectDetection + +>>> processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf") +>>> model = OmDetTurboForObjectDetection.from_pretrained("omlab/omdet-turbo-swin-tiny-hf") + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) +>>> text_labels = ["cat", "remote"] +>>> inputs = processor(image, text=text_labels, return_tensors="pt") + +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> # convert outputs (bounding boxes and class logits) +>>> results = processor.post_process_grounded_object_detection( +... outputs, +... target_sizes=[(image.height, image.width)], +... text_labels=text_labels, +... threshold=0.3, +... nms_threshold=0.3, +... ) +>>> result = results[0] +>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] +>>> for box, score, text_label in zip(boxes, scores, text_labels): +... box = [round(i, 2) for i in box.tolist()] +... 
print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") +Detected remote with confidence 0.768 at location [39.89, 70.35, 176.74, 118.04] +Detected cat with confidence 0.72 at location [11.6, 54.19, 314.8, 473.95] +Detected remote with confidence 0.563 at location [333.38, 75.77, 370.7, 187.03] +Detected cat with confidence 0.552 at location [345.15, 23.95, 639.75, 371.67] ``` ### Multi image inference @@ -93,22 +96,22 @@ OmDet-Turbo can perform batched multi-image inference, with support for differen >>> url1 = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image1 = Image.open(BytesIO(requests.get(url1).content)).convert("RGB") ->>> classes1 = ["cat", "remote"] ->>> task1 = "Detect {}.".format(", ".join(classes1)) +>>> text_labels1 = ["cat", "remote"] +>>> task1 = "Detect {}.".format(", ".join(text_labels1)) >>> url2 = "http://images.cocodataset.org/train2017/000000257813.jpg" >>> image2 = Image.open(BytesIO(requests.get(url2).content)).convert("RGB") ->>> classes2 = ["boat"] +>>> text_labels2 = ["boat"] >>> task2 = "Detect everything that looks like a boat." >>> url3 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" >>> image3 = Image.open(BytesIO(requests.get(url3).content)).convert("RGB") ->>> classes3 = ["statue", "trees"] +>>> text_labels3 = ["statue", "trees"] >>> task3 = "Focus on the foreground, detect statue and trees." >>> inputs = processor( ... images=[image1, image2, image3], -... text=[classes1, classes2, classes3], +... text=[text_labels1, text_labels2, text_labels3], ... task=[task1, task2, task3], ... return_tensors="pt", ... ) @@ -119,19 +122,19 @@ OmDet-Turbo can perform batched multi-image inference, with support for differen >>> # convert outputs (bounding boxes and class logits) >>> results = processor.post_process_grounded_object_detection( ... outputs, -... classes=[classes1, classes2, classes3], -... target_sizes=[image1.size[::-1], image2.size[::-1], image3.size[::-1]], -... score_threshold=0.2, +... text_labels=[text_labels1, text_labels2, text_labels3], +... target_sizes=[(image.height, image.width) for image in [image1, image2, image3]], +... threshold=0.2, ... nms_threshold=0.3, ... ) >>> for i, result in enumerate(results): -... for score, class_name, box in zip( -... result["scores"], result["classes"], result["boxes"] +... for score, text_label, box in zip( +... result["scores"], result["text_labels"], result["boxes"] ... ): ... box = [round(i, 1) for i in box.tolist()] ... print( -... f"Detected {class_name} with confidence " +... f"Detected {text_label} with confidence " ... f"{round(score.item(), 2)} at location {box} in image {i}" ... 
) Detected remote with confidence 0.77 at location [39.9, 70.4, 176.7, 118.0] in image 0 diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md index 1b4e92bc4eb1..696a1b03776a 100644 --- a/docs/source/en/model_doc/owlv2.md +++ b/docs/source/en/model_doc/owlv2.md @@ -50,20 +50,22 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) ->>> texts = [["a photo of a cat", "a photo of a dog"]] ->>> inputs = processor(text=texts, images=image, return_tensors="pt") +>>> text_labels = [["a photo of a cat", "a photo of a dog"]] +>>> inputs = processor(text=text_labels, images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] ->>> target_sizes = torch.Tensor([image.size[::-1]]) ->>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax) ->>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) ->>> i = 0 # Retrieve predictions for the first image for the corresponding text queries ->>> text = texts[i] ->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] ->>> for box, score, label in zip(boxes, scores, labels): +>>> target_sizes = torch.tensor([(image.height, image.width)]) +>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) +>>> results = processor.post_process_grounded_object_detection( +... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels +... ) +>>> # Retrieve predictions for the first image for the corresponding text queries +>>> result = results[0] +>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] +>>> for box, score, text_label in zip(boxes, scores, text_labels): ... box = [round(i, 2) for i in box.tolist()] -... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") +... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35] Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13] ``` @@ -103,6 +105,9 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce ## Owlv2Processor [[autodoc]] Owlv2Processor + - __call__ + - post_process_grounded_object_detection + - post_process_image_guided_detection ## Owlv2Model diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md index c40d3a9e7a17..519648bbd8dc 100644 --- a/docs/source/en/model_doc/owlvit.md +++ b/docs/source/en/model_doc/owlvit.md @@ -49,20 +49,22 @@ OWL-ViT is a zero-shot text-conditioned object detection model. 
OWL-ViT uses [CL >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) ->>> texts = [["a photo of a cat", "a photo of a dog"]] ->>> inputs = processor(text=texts, images=image, return_tensors="pt") +>>> text_labels = [["a photo of a cat", "a photo of a dog"]] +>>> inputs = processor(text=text_labels, images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2] ->>> target_sizes = torch.Tensor([image.size[::-1]]) +>>> target_sizes = torch.tensor([(image.height, image.width)]) >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) ->>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) ->>> i = 0 # Retrieve predictions for the first image for the corresponding text queries ->>> text = texts[i] ->>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] ->>> for box, score, label in zip(boxes, scores, labels): +>>> results = processor.post_process_grounded_object_detection( +... outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels +... ) +>>> # Retrieve predictions for the first image for the corresponding text queries +>>> result = results[0] +>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"] +>>> for box, score, text_label in zip(boxes, scores, text_labels): ... box = [round(i, 2) for i in box.tolist()] -... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") +... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] ``` @@ -91,16 +93,12 @@ A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object de - post_process_object_detection - post_process_image_guided_detection -## OwlViTFeatureExtractor - -[[autodoc]] OwlViTFeatureExtractor - - __call__ - - post_process - - post_process_image_guided_detection - ## OwlViTProcessor [[autodoc]] OwlViTProcessor + - __call__ + - post_process_grounded_object_detection + - post_process_image_guided_detection ## OwlViTModel diff --git a/docs/source/en/model_doc/phi3.md b/docs/source/en/model_doc/phi3.md index 76d94008137e..fe68a6ae76b2 100644 --- a/docs/source/en/model_doc/phi3.md +++ b/docs/source/en/model_doc/phi3.md @@ -57,10 +57,7 @@ Phi-3 has been integrated in the development version (4.40.0.dev) of `transforme >>> outputs = model.generate(inputs, max_new_tokens=32) >>> text = tokenizer.batch_decode(outputs)[0] >>> print(text) -<|user|> -Can you provide ways to eat combinations of bananas and dragonfruits?<|end|> -<|assistant|> -Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some ideas for eating combinations of bananas and +<|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious ways. 
Here are some creative ideas for incorporating both fruits ``` ## Phi3Config diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index ab604e4521fc..62bdc004c517 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -88,6 +88,11 @@ output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up [[autodoc]] PixtralImageProcessor - preprocess +## PixtralImageProcessorFast + +[[autodoc]] PixtralImageProcessorFast + - preprocess + ## PixtralProcessor [[autodoc]] PixtralProcessor diff --git a/docs/source/en/model_doc/qwen2_5_vl.md b/docs/source/en/model_doc/qwen2_5_vl.md new file mode 100644 index 000000000000..9bfcb6ae0f62 --- /dev/null +++ b/docs/source/en/model_doc/qwen2_5_vl.md @@ -0,0 +1,300 @@ + + +# Qwen2.5-VL + +## Overview + +The [Qwen2.5-VL](https://qwenlm.github.io/blog/qwen2_5-vl/) model is an update to [Qwen2-VL](https://arxiv.org/abs/2409.12191) from Qwen team, Alibaba Group. + +The abstract from this update is the following: + +*Qwen2.5-VL marks a major step forward from Qwen2-VL, built upon the latest Qwen2.5 LLM. We've accelerated training and testing through the strategic implementation of window attention within the ViT. The ViT architecture itself has been refined with SwiGLU and RMSNorm, aligning it more closely with the LLM's structure. A key innovation is the expansion of native dynamic resolution to encompass the temporal dimension, in addition to spatial aspects. Furthermore, we've upgraded MRoPE, incorporating absolute time alignment on the time axis to allow the model to effectively capture temporal dynamics, regardless of frame rate, leading to superior video understanding.* + +## Usage example + +### Single Media inference + +The model can accept both images and videos as input. Here's an example code for inference. + +```python + +from PIL import Image +import requests +import torch +from torchvision import io +from typing import Dict +from transformers.image_utils import load_images, load_video +from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor + +# Load the model in half-precision on the available device(s) +model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", device_map="auto") +processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") + +# Image +url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" +image = Image.open(requests.get(url, stream=True).raw) + +conversation = [ + { + "role":"user", + "content":[ + { + "type":"image", + }, + { + "type":"text", + "text":"Describe this image." 
+ } + ] + } +] + + +# Preprocess the inputs +text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) +# Excepted output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n' + +inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt") +inputs = inputs.to('cuda') + +# Inference: Generation of the output +output_ids = model.generate(**inputs, max_new_tokens=128) +generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)] +output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) +print(output_text) + +# Video +video = load_video(video="/path/to/video.mp4") +conversation = [ + { + "role": "user", + "content": [ + {"type": "video"}, + {"type": "text", "text": "What happened in the video?"}, + ], + } +] + +# Preprocess the inputs +text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) +# Excepted output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>What happened in the video?<|im_end|>\n<|im_start|>assistant\n' + +# Qwen2.5VL modifies the time positional encoding (MRoPE) according to the video's frame rate (FPS). +# Therefore, the video's FPS information needs to be provided as input. +inputs = processor(text=[text_prompt], videos=[video], fps=[1.0], padding=True, return_tensors="pt") +inputs = inputs.to('cuda') + +# Inference: Generation of the output +output_ids = model.generate(**inputs, max_new_tokens=128) +generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)] +output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) +print(output_text) +``` + +### Batch Mixed Media Inference + +The model can batch inputs composed of mixed samples of various types such as images, videos, and text. Here is an example. + +```python +images = load_images([ + "/path/to/image1.jpg", + "/path/to/image2.jpg", + "/path/to/image3.jpg", + "/path/to/image4.jpg", + "/path/to/image5.jpg", +]) +video = load_video(video="/path/to/video.mp4") + +# Conversation for the first image +conversation1 = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "Describe this image."} + ] + } +] + +# Conversation with two images +conversation2 = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "image"}, + {"type": "text", "text": "What is written in the pictures?"} + ] + } +] + +# Conversation with pure text +conversation3 = [ + { + "role": "user", + "content": "who are you?" 
+ } +] + + +# Conversation with mixed midia +conversation4 = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "image"}, + {"type": "video"}, + {"type": "text", "text": "What are the common elements in these medias?"}, + ], + } +] + +conversations = [conversation1, conversation2, conversation3, conversation4] +# Preparation for batch inference +texts = [processor.apply_chat_template(msg, add_generation_prompt=True) for msg in conversations] +inputs = processor( + text=texts, + images=images, + videos=[video], + padding=True, + return_tensors="pt", +) +inputs = inputs.to('cuda') + +# Batch Inference +output_ids = model.generate(**inputs, max_new_tokens=128) +generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)] +output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) +print(output_text) +``` + +### Usage Tips + +#### Image Resolution trade-off + +The model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs. + +```python +min_pixels = 224*224 +max_pixels = 2048*2048 +processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels) +``` + +In case of limited GPU RAM, one can reduce the resolution as follows: + +```python +min_pixels = 256*28*28 +max_pixels = 1024*28*28 +processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels) +``` +This ensures each image gets encoded using a number between 256-1024 tokens. The 28 comes from the fact that the model uses a patch size of 14 and a temporal patch size of 2 (14 x 2 = 28). + +#### Multiple Image Inputs + +By default, images and video content are directly included in the conversation. When handling multiple images, it's helpful to add labels to the images and videos for better reference. Users can control this behavior with the following settings: + +```python +conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "Hello, how are you?"} + ] + }, + { + "role": "assistant", + "content": "I'm doing well, thank you for asking. How can I assist you today?" + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Can you describe these images and video?"}, + {"type": "image"}, + {"type": "image"}, + {"type": "video"}, + {"type": "text", "text": "These are from my vacation."} + ] + }, + { + "role": "assistant", + "content": "I'd be happy to describe the images and video for you. Could you please provide more context about your vacation?" + }, + { + "role": "user", + "content": "It was a trip to the mountains. Can you see the details in the images and video?" + } +] + +# default: +prompt_without_id = processor.apply_chat_template(conversation, add_generation_prompt=True) +# Excepted output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Hello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing well, thank you for asking. 
How can I assist you today?<|im_end|>\n<|im_start|>user\nCan you describe these images and video?<|vision_start|><|image_pad|><|vision_end|><|vision_start|><|image_pad|><|vision_end|><|vision_start|><|video_pad|><|vision_end|>These are from my vacation.<|im_end|>\n<|im_start|>assistant\nI'd be happy to describe the images and video for you. Could you please provide more context about your vacation?<|im_end|>\n<|im_start|>user\nIt was a trip to the mountains. Can you see the details in the images and video?<|im_end|>\n<|im_start|>assistant\n' + + +# add ids +prompt_with_id = processor.apply_chat_template(conversation, add_generation_prompt=True, add_vision_id=True) +# Excepted output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nPicture 1: <|vision_start|><|image_pad|><|vision_end|>Hello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing well, thank you for asking. How can I assist you today?<|im_end|>\n<|im_start|>user\nCan you describe these images and video?Picture 2: <|vision_start|><|image_pad|><|vision_end|>Picture 3: <|vision_start|><|image_pad|><|vision_end|>Video 1: <|vision_start|><|video_pad|><|vision_end|>These are from my vacation.<|im_end|>\n<|im_start|>assistant\nI'd be happy to describe the images and video for you. Could you please provide more context about your vacation?<|im_end|>\n<|im_start|>user\nIt was a trip to the mountains. Can you see the details in the images and video?<|im_end|>\n<|im_start|>assistant\n' + +``` + +#### Flash-Attention 2 to speed up generation + +First, make sure to install the latest version of Flash Attention 2: + +```bash +pip install -U flash-attn --no-build-isolation +``` + +Also, you should have hardware that is compatible with FlashAttention 2. Read more about it in the official documentation of the [flash attention repository](https://github.com/Dao-AILab/flash-attention). FlashAttention-2 can only be used when a model is loaded in `torch.float16` or `torch.bfloat16`. 
+ +To load and run a model using FlashAttention-2, add `attn_implementation="flash_attention_2"` when loading the model: + +```python +from transformers import Qwen2_5_VLForConditionalGeneration + +model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + "Qwen/Qwen2.5-VL-7B-Instruct", + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", +) +``` + + + +## Qwen2_5_VLConfig + +[[autodoc]] Qwen2_5_VLConfig + +## Qwen2_5_VLImageProcessor + +[[autodoc]] Qwen2_5_VLImageProcessor + - preprocess + +## Qwen2_5_VLProcessor + +[[autodoc]] Qwen2_5_VLProcessor + +## Qwen2_5_VLModel + +[[autodoc]] Qwen2_5_VLModel + - forward + +## Qwen2_5_VLForConditionalGeneration + +[[autodoc]] Qwen2_5_VLForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index f399a7e7320c..2ef947ce430d 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -34,6 +34,37 @@ The abstract from the paper is the following: `Qwen2-Audio-7B` and `Qwen2-Audio-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) +### Inference + +```python +from io import BytesIO +from urllib.request import urlopen +import librosa +from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration + +model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_code=True, device_map="auto") +processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_code=True) + +prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:" +url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3" +audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate) +inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device) + +generate_ids = model.generate(**inputs, max_length=256) +generate_ids = generate_ids[:, inputs.input_ids.size(1):] + +response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + +# We can also omit the audio_bos and audio_eos tokens +prompt = "<|AUDIO|>Generate the caption in English:" +inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device) + +generate_ids = model.generate(**inputs, max_length=256) +generate_ids = generate_ids[:, inputs.input_ids.size(1):] + +response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +``` + In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for the inference, supporting both voice chat and audio analysis modes. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. 
### Voice Chat Inference diff --git a/docs/source/en/model_doc/qwen2_vl.md b/docs/source/en/model_doc/qwen2_vl.md index 7c864b860bd8..c39728ef71ec 100644 --- a/docs/source/en/model_doc/qwen2_vl.md +++ b/docs/source/en/model_doc/qwen2_vl.md @@ -315,6 +315,11 @@ model = Qwen2VLForConditionalGeneration.from_pretrained( [[autodoc]] Qwen2VLImageProcessor - preprocess +## Qwen2VLImageProcessorFast + +[[autodoc]] Qwen2VLImageProcessorFast + - preprocess + ## Qwen2VLProcessor [[autodoc]] Qwen2VLProcessor diff --git a/docs/source/en/model_doc/seamless_m4t.md b/docs/source/en/model_doc/seamless_m4t.md index e820e6c92563..486e58691f6d 100644 --- a/docs/source/en/model_doc/seamless_m4t.md +++ b/docs/source/en/model_doc/seamless_m4t.md @@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio: ```python >>> # let's load an audio sample from an Arabic speech corpus >>> from datasets import load_dataset ->>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True) +>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True) >>> audio_sample = next(iter(dataset))["audio"] >>> # now, process it diff --git a/docs/source/en/model_doc/seamless_m4t_v2.md b/docs/source/en/model_doc/seamless_m4t_v2.md index aea34acc180b..c6a2ec4b51c2 100644 --- a/docs/source/en/model_doc/seamless_m4t_v2.md +++ b/docs/source/en/model_doc/seamless_m4t_v2.md @@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio: ```python >>> # let's load an audio sample from an Arabic speech corpus >>> from datasets import load_dataset ->>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True) +>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True) >>> audio_sample = next(iter(dataset))["audio"] >>> # now, process it diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index 88e38cbb590e..56e168ab4734 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -86,7 +86,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that: >>> candidate_labels = ["2 cats", "2 dogs"] # follows the pipeline prompt template to get same results >>> texts = [f'This is a photo of {label}.' for label in candidate_labels] ->>> # important: we pass `padding=max_length` since the model was trained with this +# important: we pass `padding=max_length` since the model was trained with this >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt") >>> with torch.no_grad(): @@ -95,14 +95,14 @@ If you want to do the pre- and postprocessing yourself, here's how to do that: >>> logits_per_image = outputs.logits_per_image >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities >>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") -31.9% that image 0 is 'a photo of 2 cats' +19.8% that image 0 is '2 cats' ``` ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP. -- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification_md) +- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification) - Demo notebooks for SigLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SigLIP). 
🌎 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. @@ -142,8 +142,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below: # follows the pipeline prompt template to get same results >>> texts = [f'This is a photo of {label}.' for label in candidate_labels] # important: we pass `padding=max_length` since the model was trained with this ->>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt") ->>> inputs.to(device) +>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device) >>> with torch.no_grad(): ... with torch.autocast(device): @@ -152,7 +151,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below: >>> logits_per_image = outputs.logits_per_image >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities >>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") -51.3% that image 0 is 'This is a photo of 2 cats.' +19.8% that image 0 is '2 cats' ``` diff --git a/docs/source/en/model_doc/superglue.md b/docs/source/en/model_doc/superglue.md new file mode 100644 index 000000000000..08a4575dddc2 --- /dev/null +++ b/docs/source/en/model_doc/superglue.md @@ -0,0 +1,138 @@ + + +# SuperGlue + +## Overview + +The SuperGlue model was proposed in [SuperGlue: Learning Feature Matching with Graph Neural Networks](https://arxiv.org/abs/1911.11763) by Paul-Edouard Sarlin, Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. + +This model consists of matching two sets of interest points detected in an image. Paired with the +[SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and +estimate the pose between them. This model is useful for tasks such as image matching, homography estimation, etc. + +The abstract from the paper is the following: + +*This paper introduces SuperGlue, a neural network that matches two sets of local features by jointly finding correspondences +and rejecting non-matchable points. Assignments are estimated by solving a differentiable optimal transport problem, whose costs +are predicted by a graph neural network. We introduce a flexible context aggregation mechanism based on attention, enabling +SuperGlue to reason about the underlying 3D scene and feature assignments jointly. Compared to traditional, hand-designed heuristics, +our technique learns priors over geometric transformations and regularities of the 3D world through end-to-end training from image +pairs. SuperGlue outperforms other learned approaches and achieves state-of-the-art results on the task of pose estimation in +challenging real-world indoor and outdoor environments. The proposed method performs matching in real-time on a modern GPU and +can be readily integrated into modern SfM or SLAM systems. The code and trained weights are publicly available at this [URL](https://github.com/magicleap/SuperGluePretrainedNetwork).* + +## How to use + +Here is a quick example of using the model. Since this model is an image matching model, it requires pairs of images to be matched. +The raw outputs contain the list of keypoints detected by the keypoint detector as well as the list of matches with their corresponding +matching scores. 
+```python +from transformers import AutoImageProcessor, AutoModel +import torch +from PIL import Image +import requests + +url_image1 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_98169888_3347710852.jpg" +image1 = Image.open(requests.get(url_image1, stream=True).raw) +url_image2 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_26757027_6717084061.jpg" +image_2 = Image.open(requests.get(url_image2, stream=True).raw) + +images = [image1, image2] + +processor = AutoImageProcessor.from_pretrained("magic-leap-community/superglue_outdoor") +model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor") + +inputs = processor(images, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs) +``` + +You can use the `post_process_keypoint_matching` method from the `SuperGlueImageProcessor` to get the keypoints and matches in a more readable format: + +```python +image_sizes = [[(image.height, image.width) for image in images]] +outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2) +for i, output in enumerate(outputs): + print("For the image pair", i) + for keypoint0, keypoint1, matching_score in zip( + output["keypoints0"], output["keypoints1"], output["matching_scores"] + ): + print( + f"Keypoint at coordinate {keypoint0.numpy()} in the first image matches with keypoint at coordinate {keypoint1.numpy()} in the second image with a score of {matching_score}." + ) + +``` + +From the outputs, you can visualize the matches between the two images using the following code: +```python +import matplotlib.pyplot as plt +import numpy as np + +# Create side by side image +merged_image = np.zeros((max(image1.height, image2.height), image1.width + image2.width, 3)) +merged_image[: image1.height, : image1.width] = np.array(image1) / 255.0 +merged_image[: image2.height, image1.width :] = np.array(image2) / 255.0 +plt.imshow(merged_image) +plt.axis("off") + +# Retrieve the keypoints and matches +output = outputs[0] +keypoints0 = output["keypoints0"] +keypoints1 = output["keypoints1"] +matching_scores = output["matching_scores"] +keypoints0_x, keypoints0_y = keypoints0[:, 0].numpy(), keypoints0[:, 1].numpy() +keypoints1_x, keypoints1_y = keypoints1[:, 0].numpy(), keypoints1[:, 1].numpy() + +# Plot the matches +for keypoint0_x, keypoint0_y, keypoint1_x, keypoint1_y, matching_score in zip( + keypoints0_x, keypoints0_y, keypoints1_x, keypoints1_y, matching_scores +): + plt.plot( + [keypoint0_x, keypoint1_x + image1.width], + [keypoint0_y, keypoint1_y], + color=plt.get_cmap("RdYlGn")(matching_score.item()), + alpha=0.9, + linewidth=0.5, + ) + plt.scatter(keypoint0_x, keypoint0_y, c="black", s=2) + plt.scatter(keypoint1_x + image1.width, keypoint1_y, c="black", s=2) + +# Save the plot +plt.savefig("matched_image.png", dpi=300, bbox_inches='tight') +plt.close() +``` + +![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/01ZYaLB1NL5XdA8u7yCo4.png) + +This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille). +The original code can be found [here](https://github.com/magicleap/SuperGluePretrainedNetwork). 
+ +## SuperGlueConfig + +[[autodoc]] SuperGlueConfig + +## SuperGlueImageProcessor + +[[autodoc]] SuperGlueImageProcessor + +- preprocess + +## SuperGlueForKeypointMatching + +[[autodoc]] SuperGlueForKeypointMatching + +- forward +- post_process_keypoint_matching \ No newline at end of file diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md new file mode 100644 index 000000000000..d6b431e648f2 --- /dev/null +++ b/docs/source/en/model_doc/textnet.md @@ -0,0 +1,55 @@ + + +# TextNet + +## Overview + +The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu. TextNet is a vision backbone useful for text detection tasks. It is the result of neural architecture search (NAS) on backbones with reward function as text detection task (to provide powerful features for text detection). + + + + TextNet backbone as part of FAST. Taken from the original paper. + +This model was contributed by [Raghavan](https://huggingface.co/Raghavan), [jadechoghari](https://huggingface.co/jadechoghari) and [nielsr](https://huggingface.co/nielsr). + +## Usage tips + +TextNet is mainly used as a backbone network for the architecture search of text detection. Each stage of the backbone network is comprised of a stride-2 convolution and searchable blocks. +Specifically, we present a layer-level candidate set, defined as {conv3×3, conv1×3, conv3×1, identity}. As the 1×3 and 3×1 convolutions have asymmetric kernels and oriented structure priors, they may help to capture the features of extreme aspect-ratio and rotated text lines. + +TextNet is the backbone for Fast, but can also be used as an efficient text/image classification, we add a `TextNetForImageClassification` as is it would allow people to train an image classifier on top of the pre-trained textnet weights + +## TextNetConfig + +[[autodoc]] TextNetConfig + +## TextNetImageProcessor + +[[autodoc]] TextNetImageProcessor + - preprocess + +## TextNetModel + +[[autodoc]] TextNetModel + - forward + +## TextNetForImageClassification + +[[autodoc]] TextNetForImageClassification + - forward + diff --git a/docs/source/en/model_doc/timm_wrapper.md b/docs/source/en/model_doc/timm_wrapper.md new file mode 100644 index 000000000000..467f2addf963 --- /dev/null +++ b/docs/source/en/model_doc/timm_wrapper.md @@ -0,0 +1,78 @@ + + +# TimmWrapper + +## Overview + +Helper class to enable loading timm models to be used with the transformers library and its autoclasses. + +```python +>>> import torch +>>> from PIL import Image +>>> from urllib.request import urlopen +>>> from transformers import AutoModelForImageClassification, AutoImageProcessor + +>>> # Load image +>>> image = Image.open(urlopen( +... 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png' +... )) + +>>> # Load model and image processor +>>> checkpoint = "timm/resnet50.a1_in1k" +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) +>>> model = AutoModelForImageClassification.from_pretrained(checkpoint).eval() + +>>> # Preprocess image +>>> inputs = image_processor(image) + +>>> # Forward pass +>>> with torch.no_grad(): +... 
logits = model(**inputs).logits + +>>> # Get top 5 predictions +>>> top5_probabilities, top5_class_indices = torch.topk(logits.softmax(dim=1) * 100, k=5) +``` + +## Resources: + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with TimmWrapper. + + + +- [Collection of Example Notebook](https://github.com/ariG23498/timm-wrapper-examples) 🌎 + +> [!TIP] +> For a more detailed overview please read the [official blog post](https://huggingface.co/blog/timm-transformers) on the timm integration. + +## TimmWrapperConfig + +[[autodoc]] TimmWrapperConfig + +## TimmWrapperImageProcessor + +[[autodoc]] TimmWrapperImageProcessor + - preprocess + +## TimmWrapperModel + +[[autodoc]] TimmWrapperModel + - forward + +## TimmWrapperForImageClassification + +[[autodoc]] TimmWrapperForImageClassification + - forward diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 105307196eff..a3ba1258ecfa 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -174,7 +174,7 @@ model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-L ### Flash-Attention 2 to speed-up generation -Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2: diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index 328310f3e26b..cb625e371161 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -58,7 +58,7 @@ conversation = [ "content": [ {"type": "image"}, {"type": "text", "text": "What’s shown in this image?"}, - , + ], }, { "role": "assistant", diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md new file mode 100644 index 000000000000..4fbead04ea80 --- /dev/null +++ b/docs/source/en/model_doc/vitpose.md @@ -0,0 +1,288 @@ + + +# ViTPose + +## Overview + +The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. ViTPose employs a standard, non-hierarchical [Vision Transformer](vit) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. The model was further improved in [ViTPose++: Vision Transformer for Generic Body Pose Estimation](https://arxiv.org/abs/2212.04246) where the authors employ +a mixture-of-experts (MoE) module in the ViT backbone along with pre-training on more data, which further enhances the performance. + +The abstract from the paper is the following: + +*Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. 
In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.* + + + + ViTPose architecture. Taken from the original paper. + +This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi). +The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose). + +## Usage Tips + +ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints for each of them. + +```py +import torch +import requests +import numpy as np + +from PIL import Image + +from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation + +device = "cuda" if torch.cuda.is_available() else "cpu" + +url = "http://images.cocodataset.org/val2017/000000000139.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +# ------------------------------------------------------------------------ +# Stage 1. Detect humans on the image +# ------------------------------------------------------------------------ + +# You can choose any detector of your choice +person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") +person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device) + +inputs = person_image_processor(images=image, return_tensors="pt").to(device) + +with torch.no_grad(): + outputs = person_model(**inputs) + +results = person_image_processor.post_process_object_detection( + outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3 +) +result = results[0] # take first image results + +# Human label refers 0 index in COCO dataset +person_boxes = result["boxes"][result["labels"] == 0] +person_boxes = person_boxes.cpu().numpy() + +# Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format +person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0] +person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1] + +# ------------------------------------------------------------------------ +# Stage 2. 
Detect keypoints for each person found + # ------------------------------------------------------------------------ + +image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple") +model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device) + +inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device) + +with torch.no_grad(): + outputs = model(**inputs) + +pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes]) +image_pose_result = pose_results[0] # results for first image +``` + +### ViTPose++ models + +The best [checkpoints](https://huggingface.co/collections/usyd-community/vitpose-677fcfd0a0b2b5c8f79c4335) are those of the [ViTPose++ paper](https://arxiv.org/abs/2212.04246). ViTPose++ models employ a so-called [Mixture-of-Experts (MoE)](https://huggingface.co/blog/moe) architecture for the ViT backbone, resulting in better performance. + +The ViTPose+ checkpoints use 6 experts, hence 6 different dataset indices can be passed. +An overview of the various dataset indices is provided below: + +- 0: [COCO validation 2017](https://cocodataset.org/#overview) dataset, using an object detector that gets 56 AP on the "person" class +- 1: [AiC](https://github.com/fabbrimatteo/AiC-Dataset) dataset +- 2: [MPII](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/software-and-datasets/mpii-human-pose-dataset) dataset +- 3: [AP-10K](https://github.com/AlexTheBad/AP-10K) dataset +- 4: [APT-36K](https://github.com/pandorgan/APT-36K) dataset +- 5: [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody) dataset + +Pass the `dataset_index` argument in the forward of the model to indicate which experts to use for each example in the batch. Example usage is shown below: + +```python +image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-base") +model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-base", device_map=device) + +inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device) + +dataset_index = torch.tensor([0], device=device) # must be a tensor of shape (batch_size,) + +with torch.no_grad(): + outputs = model(**inputs, dataset_index=dataset_index) +```
+ +### Visualization + +To visualize the various keypoints, one can either leverage the `supervision` [library](https://github.com/roboflow/supervision) (requires `pip install supervision`): + +```python +import supervision as sv + +xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy() +scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy() + +key_points = sv.KeyPoints( + xy=xy, confidence=scores +) + +edge_annotator = sv.EdgeAnnotator( + color=sv.Color.GREEN, + thickness=1 +) +vertex_annotator = sv.VertexAnnotator( + color=sv.Color.RED, + radius=2 +) +annotated_frame = edge_annotator.annotate( + scene=image.copy(), + key_points=key_points +) +annotated_frame = vertex_annotator.annotate( + scene=annotated_frame, + key_points=key_points +) +``` + +Alternatively, one can also visualize the keypoints using [OpenCV](https://opencv.org/) (requires `pip install opencv-python`): + +```python +import math +import cv2 + +def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight): + if pose_keypoint_color is not None: + assert len(pose_keypoint_color) == len(keypoints) + for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)): + x_coord, y_coord = int(kpt[0]), int(kpt[1]) + if kpt_score > keypoint_score_threshold: + color = tuple(int(c) for c in pose_keypoint_color[kid]) + if show_keypoint_weight: + cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) + else: + cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) + +def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): + height, width, _ = image.shape + if keypoint_edges is not None and link_colors is not None: + assert len(link_colors) == len(keypoint_edges) + for sk_id, sk in enumerate(keypoint_edges): + x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]]) + x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]]) + if ( + x1 > 0 + and x1 < width + and y1 > 0 + and y1 < height + and x2 > 0 + and x2 < width + and y2 > 0 + and y2 < height + and score1 > keypoint_score_threshold + and score2 > keypoint_score_threshold + ): + color = tuple(int(c) for c in link_colors[sk_id]) + if show_keypoint_weight: + X = (x1, x2) + Y = (y1, y2) + mean_x = np.mean(X) + mean_y = np.mean(Y) + length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5 + angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) + polygon = cv2.ellipse2Poly( + (int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1 + ) + cv2.fillConvexPoly(image, polygon, color) + transparency = max(0, min(1, 0.5 * (keypoints[sk[0],
2] + keypoints[sk[1], 2]))) + cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) + else: + cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness) + + +# Note: keypoint_edges and color palette are dataset-specific +keypoint_edges = model.config.edges + +palette = np.array( + [ + [255, 128, 0], + [255, 153, 51], + [255, 178, 102], + [230, 230, 0], + [255, 153, 255], + [153, 204, 255], + [255, 102, 255], + [255, 51, 255], + [102, 178, 255], + [51, 153, 255], + [255, 153, 153], + [255, 102, 102], + [255, 51, 51], + [153, 255, 153], + [102, 255, 102], + [51, 255, 51], + [0, 255, 0], + [0, 0, 255], + [255, 0, 0], + [255, 255, 255], + ] +) + +link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]] +keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]] + +numpy_image = np.array(image) + +for pose_result in image_pose_result: + scores = np.array(pose_result["scores"]) + keypoints = np.array(pose_result["keypoints"]) + + # draw each point on image + draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False) + + # draw links + draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False) + +pose_image = Image.fromarray(numpy_image) +pose_image +``` +drawing + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTPose. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +- A demo of ViTPose on images and video can be found [here](https://huggingface.co/spaces/hysts/ViTPose-transformers). +- A notebook illustrating inference and visualization can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTPose/Inference_with_ViTPose_for_human_pose_estimation.ipynb). + +## VitPoseImageProcessor + +[[autodoc]] VitPoseImageProcessor + - preprocess + - post_process_pose_estimation + +## VitPoseConfig + +[[autodoc]] VitPoseConfig + +## VitPoseForPoseEstimation + +[[autodoc]] VitPoseForPoseEstimation + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/zamba2.md b/docs/source/en/model_doc/zamba2.md new file mode 100644 index 000000000000..b331e10eaf84 --- /dev/null +++ b/docs/source/en/model_doc/zamba2.md @@ -0,0 +1,91 @@ + +# Zamba2 + +Zamba2 is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights. + +This model was contributed by [pglo](https://huggingface.co/pglo). + + +## Model details + +Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B are hybrid models combining state-space models (Specifically [Mamba](https://github.com/state-spaces/mamba)) and transformer, and were trained using next-token prediction. Zamba2 uses shared transformer layers after every 6 mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B were pre-trained on 2T and 3T tokens, respectively. 
+ + + +## Quick start + + +### Prerequisites + +Zamba2 requires you to use `transformers` version 4.48.0 or higher: +```bash +pip install transformers>=4.48.0 +``` + +## Inference + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch + +tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B") +model = AutoModelForCausalLM.from_pretrained("Zyphra/Zamba2-7B", device_map="cuda", torch_dtype=torch.bfloat16) + +input_text = "What factors contributed to the fall of the Roman Empire?" +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +outputs = model.generate(**input_ids, max_new_tokens=100) +print(tokenizer.decode(outputs[0])) +``` + + +## Model card + +The model cards can be found at: +* [Zamba2-1.2B](https://huggingface.co/Zyphra/Zamba2-1.2B) +* [Zamba2-2.7B](https://huggingface.co/Zyphra/Zamba2-2.7B) +* [Zamba2-7B](https://huggingface.co/Zyphra/Zamba2-7B) + + +## Issues +For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/Zyphra/Zamba2-7B/discussions). + + +## License + +The model weights are open-sourced via an Apache 2.0 license. + + +## Zamba2Config + +[[autodoc]] Zamba2Config + + +## Zamba2Model + +[[autodoc]] Zamba2Model + - forward + + +## Zamba2ForCausalLM + +[[autodoc]] Zamba2ForCausalLM + - forward + + +## Zamba2ForSequenceClassification + +[[autodoc]] transformers.Zamba2ForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/zoedepth.md b/docs/source/en/model_doc/zoedepth.md index 74e25f3c3f6e..ecd068511e96 100644 --- a/docs/source/en/model_doc/zoedepth.md +++ b/docs/source/en/model_doc/zoedepth.md @@ -70,7 +70,7 @@ Alternatively, one can also perform inference using the classes: >>> inputs = image_processor(images=image, return_tensors="pt") >>> with torch.no_grad(): -... outputs = model(pixel_values) +... outputs = model(**inputs) >>> # interpolate to original size and visualize the prediction >>> ## ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md index 1516233ec4d6..dca1282bcf99 100644 --- a/docs/source/en/modular_transformers.md +++ b/docs/source/en/modular_transformers.md @@ -22,6 +22,9 @@ etc. Model contribution PRs rarely add less than 3-5k lines of code, with much o This raises the bar for contributions, and with Modular Transformers, we're aiming to lower the bar to a much more acceptable point. +If you plan to add a model to `transformers`, make sure you read [How to add a model to 🤗 Transformers?](https://huggingface.co/docs/transformers/add_new_model). +For any kind of contribution, see [CONTRIBUTING.md](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md). + ## What is it? Modular Transformers introduces the concept of a "modular" file to a model folder. This modular file accepts code @@ -43,6 +46,12 @@ be moved to the new Modular Transformers format in the coming months. ### Details +To generate a single file from the modular file, run the following command: + +```bash +python utils/modular_model_converter.py --files-to-parse src/transformers/models//modular_.py +``` + The "linter", which unravels the inheritance and creates all single-files from the modular file, will flatten the inheritance while trying to be invisible to Python users. At this time, the linter flattens a **single** level of inheritance. @@ -50,8 +59,8 @@ inheritance.
For example: - If a configuration class inherits from another and adds/deletes an argument, the generated file will either directly reference it (in case of addition) or completely remove it (in case of deletion). -- If a class inherits from another, for example: class GemmaModel(LlamaModel):, dependencies are automatically - inferred. All submodules will be automatically inferred from the superclass. +- If a class inherits from another, for example: `class GemmaModel(LlamaModel):`, dependencies are automatically + inferred. All submodules will be automatically added from the superclass. - If you define new functions in the `modular` and use them inside classes, the linter will automatically infer the You should be able to write everything (the tokenizer, the image processor, the model, the config) in this `modular` @@ -59,7 +68,11 @@ file, and the corresponding files will be created for you. ### Enforcement -[TODO] We are introducing a new test, that makes sure the generated content matches what is present in the `modular_xxxx.py` +Run the command below to ensure the generated content matches `modular_.py` + +```bash +python utils/check_modular_conversion.py --files src/transformers/models//modular_.py +``` ### Examples @@ -107,46 +120,362 @@ class RobertaForMaskedLM(BertForMaskedLM): self.model = RobertaModel(config) ``` -Note that if you do not use the dependency that you defined, you will have the following error: +## What it is not -```bash -ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used - when you define `BertModel`, as it is one of it's direct dependencies. Make sure - you use it in the `__init__` function. +It is not a replacement for the modeling code (yet?), and if your model is not based on anything else that ever existed, then you can add a `modeling` file as usual. Similarly, if you cannot easily inherit your `configuration` (or `tokenization` or `processing`) file from another model's similar file, you can add that filetype directly (even though defining it in the modular file would work, it would clutter it). + + +## Real world example breakdown + +As explained, modular allows you to use regular Python inheritance from any other model's code in the library, in order to define your own. For this reason, it will work better/be easier if you first browse the library a bit to find models close to yours, in order to inherit from them. For example, are you using a sliding window in the `Attention` class? Then start by checking models that are well known to use it, e.g. `Mistral`, or `Qwen2`! Are you using interleaved `RotaryEmbedding` modules? Check out `Cohere`, `Cohere2` and `Glm` models! Otherwise a very strong starting point is to check out `Llama`. And if you are doing a bit of all of that at once, then you can mix and match! 
+ +Here are some common properties that your model might be using, and corresponding modeling files to check as an example: +- Mixture of expert: `SwitchTransformers` or `Mixtral` +- Interleaved (and/or partial) rotary embedding: `Glm`, `Phi` +- State space models: + - Hybrid with attention: `Jamba` , `Bamba`, `Zamba` + - Mamba2: `Mamba2` +- Recurrent hidden states: `Gemma2` +- Different sliding window attention/full attention patterns per layer: `Gemma2`, `Cohere2` +- Clipping of QKV: `Olmo` +- Normalization of QK: `Olmo2`, `Cohere` +- Fused QKV (not recommended): `Phi3` + +At Hugging Face, we feel that learning by example is usually (one of) the best way, so we will now go over a typical modular file, and the different features our linter provides (and its limitations)! 🤗 Let's use a real world example with Olmo2 model, which I feel provides a very good illustration of the modular mechanisms. The original file can be found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modular_olmo2.py). For simplicity, we will go over it class by class, and repeat the modular's definition of ech class. For reference, the modeling and configuration of Olmo (v1) on which we will inherit a lot can be found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo/modeling_olmo.py) and [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo/configuration_olmo.py) respectively. The final modeling of Olmo2 (generated by running our linter on the modular we will describe below) can be found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py) + +Let's break it down! + + +### Config class + +Here is the `Config` definition in modular: + +```py +from ..olmo.configuration_olmo import OlmoConfig + +class Olmo2Config(OlmoConfig): + r""" + This is the configuration class to store the configuration of a [`Olmo2Model`]. + """ + + def __init__( + self, + vocab_size=50304, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + use_cache=True, + pad_token_id=1, + bos_token_id=None, + eos_token_id=50279, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + rms_norm_eps=1e-5, + **kwargs, + ): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + hidden_act=hidden_act, + max_position_embeddings=max_position_embeddings, + initializer_range=initializer_range, + use_cache=use_cache, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + **kwargs, + ) + + self.rms_norm_eps = rms_norm_eps + del self.clip_qkv ``` -Additionally, you may find a list of examples here: +Here, we correctly identified that the `Config` in Olmo2 is similar to Olmo's, up to a few details: +1. The default value of most arguments has changed +2. we have a new argument, `rms_norm_eps` +3. the argument `clip_qkv` is not used anymore -## What it is not +To solve points 1. 
and 2., simply overwriting the `__init__` function with the new default arguments and adding the new one is enough, as you would expect when you want to overwrite a method in Python! Of course, you also need to assign the new attribute `rms_norm_eps` to `self` in the `__init__`'s body. +For point 3., we use the special syntax `del self.clip_qkv`, which, as you can expect, removes the assignment of this attribute in the unravelled code (after the conversion with the linter). + +Now, there is a subtlety here: as you can see, we used `super().__init__(...)`. Usually, in Python, it is simply used to call the parent's `__init__`. In modular terms, however, it has a _slightly_ different meaning. When we find a call such as `super().my_function(...)` in the modular file, the linter will take the body of the `my_function` function in the parent, and unravel it where the call to `super().my_function(...)` occurred. Then, the `del self.clip_qkv` statement will remove the reference to `self.clip_qkv` from the unravelled body. Thus `del self.xxx` can only work in combination with `super().my_function(...)`, and should always be placed after it (but you can add whatever you want _before_ calling `super()`, and it will be placed, as you can expect, before the parent's body). + +### Norm class + +Here is the `Norm` class: + +```py +from ..llama.modeling_llama import LlamaRMSNorm + +class Olmo2RMSNorm(LlamaRMSNorm): + pass +``` + +Not much to say here: it is pretty explicit, isn't it? We do not modify anything from the `LlamaRMSNorm` definition. Thus the linter will unravel exactly the content of the parent (`LlamaRMSNorm`). The only change is that every reference to "llama" in the docstrings, type hints, and comments (basically everywhere) will be changed to a reference to "olmo2" for consistency! + +### Attention class + +Here is the `Attention` class: + +```py +from ..llama.modeling_llama import eager_attention_forward +from ..olmo.modeling_olmo import OlmoAttention, apply_rotary_pos_emb + + +# Olmo2 attention is identical to OLMo attention except: +# - Norm is applied to attention queries and keys. +# - No qkv clipping.
+class Olmo2Attention(OlmoAttention): + def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx=layer_idx) + self.q_norm = Olmo2RMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps) + self.k_norm = Olmo2RMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_value: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_norm(self.q_proj(hidden_states)) + key_states = self.k_norm(self.k_proj(hidden_states)) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(hidden_shape).transpose(1, 2) + key_states = key_states.view(hidden_shape).transpose(1, 2) + value_states = value_states.view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights +``` + +Now, what's happening here? In the `__init__`, we call `super().__init__(...)`, thus copying the parent's definition, then add 2 new layers of the `Olmo2RMSNorm` we just added previously. Indeed, those were not present in the original `Olmo` (v1) model. So, now, we also have to overwrite the `forward` method to use these 2 new layers right? Indeed, if you check carefully, the definition of `forward` is identical to `Olmo`'s, but we added a pass with the norm layers just before projecting with `q_proj` and `k_proj`. However, to help us, we directly imported the functions `eager_attention_forward` from llama, and `apply_rotary_pos_emb` from olmo. The linter will then automatically add these imported functions in the final `modeling_olmo2.py` file, by copying their definitions from the source (imported) files. And it will even add the `rotate_half` and `repeat_kv` functions (which are used inside `apply_rotary_pos_embed` and `eager_attention_forward` respectively) by figuring out the dependency automatically. Neat, right? 
+Note that we had to redefine this class, because we did not find any model defining the `Attention` layer with the added `RMSNorm` layer anywhere else in the library! Otherwise, we would have simply inherited from this model instead as we did for the `RMSNorm`! + +### The DecoderLayer class + +Here is the `DecoderLayer` class: + +```py +from ..olmo.modeling_olmo import OlmoDecoderLayer + +# The OLMo2 layers are identical to those of the OLMo model except: +# - RMSNorm is used instead of standard layer norm. +# - Norm is applied after attention/feedforward rather than before. +class Olmo2DecoderLayer(OlmoDecoderLayer): + def __init__(self, config: Olmo2Config, layer_idx: int): + super().__init__(config, layer_idx=layer_idx) + self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.self_attn = Olmo2Attention(config=config, layer_idx=layer_idx) + del self.input_layernorm + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + if output_attentions: + outputs += (self_attn_weights,) + + return outputs +``` + +At this point, you should start to pick up what is happening for this class. We switched the type of norm in the `__init__` by overwriting `self.post_attention_layernorm` after the call to `super().__init__(...)`, thus going from a `LayerNorm` in the parent class, to our `RMSNorm` in this class. Then we simply deleted the `self.input_layernorm` attribute, and replaced it by `self.post_feedforward_layernorm`, because the name was not making sense anymore as we apply it after in `Olmo2` instead of before in `Olmo`. For this reason, we also need to overwrite the `forward` method, to reflect the logic change. -It is not a replacement for the modeling code (yet?), and if your model is not based on anything else that ever existed, then you can add a `modeling` file as usual. +Note however that if we had only switched `self.post_attention_layernorm` and `self.input_layernorm` from `LayerNorm`s to `RMSNorm`s (without the name and logic change of `elf.input_layernorm`), we would not have had to redefine the `forward` method! 
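To make that last point concrete, here is a minimal sketch of that hypothetical variant (this is not the actual Olmo2 definition, just an illustration): only the _types_ of the two norms change, while the attribute names and call sites stay the same, so the parent's `forward` could be inherited untouched.

```py
from ..olmo.modeling_olmo import OlmoDecoderLayer

# Hypothetical illustration only: the attribute names match the parent's, so the
# parent's `forward` (which already calls `self.input_layernorm` and
# `self.post_attention_layernorm`) would be unravelled as-is by the linter.
class Olmo2DecoderLayer(OlmoDecoderLayer):
    def __init__(self, config: Olmo2Config, layer_idx: int):
        super().__init__(config, layer_idx=layer_idx)
        self.input_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
```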
+### The Model class + +```py +from ..olmo.modeling_olmo import OlmoModel + +# The OLMo2 model is identical to the OLMo model, except RMSNorm is used instead of +# standard layer norm for the output norm. +class Olmo2Model(OlmoModel): + def __init__(self, config: Olmo2Config): + super().__init__(config) + self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.layers = nn.ModuleList( + [Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) +``` + +Here, this is exactly what I was pointing out before: we simply change the _type_ of the `self.norm` attribute (going from `LayerNorm` in `Olmo` to `RMSNorm` in `Olmo2`). Since this change does not affect the logic of the `forward` method (the name of the layer and where it is used is identical to the parent's), we do not even need to overwrite it! It will be unravelled automatically! Note that we redefined `self.layers` for the sake of being explicit, but this is not even strictly required here as the definition is similar to what is found in `Olmo` (v1). + +### Finally... The ForCausalLM class + +Finally, here is the definition of the `ForCausalLM`: + +```py +from ..olmo.modeling_olmo import OlmoForCausalLM + +class Olmo2ForCausalLM(OlmoForCausalLM): + pass +``` + +As with the `RMSNorm`, it is exactly similar to the parent's in logic, so we do not have anything to do; the linter will figure it all out by itself. Almost disappointing, no? + + + +### But... What about the MLP, RotaryEmbedding and PreTrainedModel classes? + +Indeed, if you inspect the file [modeling_olmo2.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py) which is created by running the linter on `modular_olmo2.py`, you will notice that it also creates `Olmo2MLP`, `Olmo2RotaryEmbedding`, and `Olmo2PreTrainedModel` classes, that we did not define explicitly in `modular_olmo2.py`. + +Well, this is one of the main features of our modular linter. Similarly to how some functions were added automatically with the `Attention` class (without directly importing them), classes that are a dependency of one of the inherited classes but are not explicitly defined in the modular file will be added automatically as part of the dependency tracing. For example, in `OlmoDecoderLayer`, there is an attribute defined as `self.mlp = OlmoMLP(config)`. Because we never explicitly redefined a class named `Olmo2MLP` in `modular_olmo2.py`, the linter automatically created a class `Olmo2MLP`, similar to `OlmoMLP`. This is exactly the same as if we had done: + +```py +from ..olmo.modeling_olmo import OlmoMLP + +class Olmo2MLP(OlmoMLP): + pass +``` + +but we did not even bother, because we _know_ this class is supposed to be exactly similar, and we never needed it anywhere else in the `modular_olmo2.py` file. In contrast, the class `Olmo2RMSNorm` was needed to (re)define the norms both in the `Attention` and `DecoderLayer` classes. The same logic is true for the `Olmo2PreTrainedModel` and `Olmo2RotaryEmbedding` classes. + +Note however that if not redefined, classes will be copied from the file in which an inherited module uses them first. So if you wanted e.g.
`Olmo2MLP` to inherit from, say, `MistralMLP` instead of `OlmoMLP` (here it was `OlmoMLP` because it was first implicitly used in `Olmo2DecoderLayer`, which inherited from `OlmoDecoderLayer`), you would need to be explicit and do: + +```py +# switch to mistral definition +from ..mistral.modeling_mistral import MistralMLP + +class Olmo2MLP(MistralMLP): + pass +``` ## Advanced usage -### Removing attributes and functions -To remove attributes that are not used in your modular model, and that you don't want to see in the unravelled modeling: +Now that you should have a good grasp of how modular works, let's see some more advanced use cases and features you can use. -```python -class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel): - def __init__(self, config): | def __init__(self, config): - super().__init__(self, eos_token) | super().__init__(config) - del self.embed_tokens | self.padding_idx = config.pad_token_id - | self.vocab_size = config.vocab_size - | - | self.layers = nn.ModuleList( - | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - | ) - | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - | self.rotary_emb = LlamaRotaryEmbedding(config=config) - | self.gradient_checkpointing = False - | - | # Initialize weights and apply final processing - | self.post_init() -``` -If you check the original `LlamaModel`, it has a `embed_tokens` which was removed here (as you would expect!) - -Removing a function is pretty similar, you just need to write it with a `raise ValueError("")` to mimick the behaviour you actually want when you remove a parent function in python. +### Removing attributes which are not just assignments + +As we have seen before, after using `super().__init__()`, we can use `del self.attribute` to remove a specific attribute which was defined in the parent. What if this attribute was used elsewhere though? Meaning it was not just "defined to be stored" as in the config for example. For example, consider the following case: + +```py +class DummyModel(nn.Module): + + def __init__(self, config: DummyConfig): + super().__init__() + self.attribute = config.attribute + if self.attribute: + # do more stuff with `self.attribute` here + ... +``` + +Then inheriting from this `DummyModel` and doing + +```py +class MyNewDummyModel(DummyModel): + + def __init__(self, config: MyNewDummyConfig): + super().__init__(config) + del self.attribute +``` + +is not supported, because it will only suppress the assignment, i.e. the line `self.attribute = config.attribute` will disappear, but the `if` statement will stay and reference the attribute. We tried to make it work by suppressing every mentions of the attribute, however it it not a sound solution in the general case (it can lead to very surprising effects and remove other important parts) and is therefore not possible. + +But what if I still want to inherit from `DummyModel`? How to properly do it? How to use `super().__init__()` without copy/pasting the parent then? This brings us to the next point: + +### Avoiding super() special meaning + +Say you still want to inherit from `DummyModel` (because it is convenient for some other methods) but you do want to remove the `self.attribute`. How to properly override the `__init__` method, while calling `super()` but without unravelling the parent's code? Well, then be explicit about which class `super()`'s you are calling! 
If we want to call the `nn.Module`'s `super()` for example, we can do the following (unravelled code on the right): + +```py +class MyNewDummyModel(DummyModel, nn.Module): | class MyNewDummyModel(nn.Module): + | + def __init__(self, config: MyNewDummyConfig): | def __init__(self, config: MyNewDummyConfig): + nn.Module.__init__(self) | super().__init__() + self.foo = config.foo | self.foo = config.foo + ... | ... +``` + +### Deleting unused methods + +Removing a class method is pretty similar to removing an attribute: you just need to overwrite it with a `raise AttributeError("")` to mimic the behaviour you actually want when you remove a parent function in Python. For example, the following will remove the methods in the unravelled code: ```python class GemmaTokenizer(LlamaTokenizer): @@ -161,37 +490,172 @@ class GemmaTokenizer(LlamaTokenizer): ### Define new functions -If you define a new function in the `modular` file to be used inside a class, say +Of course, if you define a new function in the `modular` file, and use it inside an inherited class, say ```python def my_new_function(*args, **kwargs): # Do something here pass -class GemmaModel(LlamaModel): +class DummyModel(LlamaModel): def forward(*args, **kwargs): # Call the function example = my_new_function(*args, **kwargs) # continue here ``` -the `my_new_function` function (and, recursively, any other new functions called in its body) will be automatically copy-pasted -in the file where it is used. +the `my_new_function` function (and, recursively, any other functions called in its body) will be automatically added to the unravelled code even if it is not present in the parent's file (here Llama). -### Calling `super()` -We recently shipped a few features that allow you to go from: -```python -class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module): - def __init__(self, eos_token=""): | def __init__(self): - eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token) - PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token) +### Decorators + +By default, if you inherit from a class and override a method which has one (or more) decorators in the parent, the decorators will be added as well in the unravelled code, _but only if you do not add any yourself_. Otherwise, it will of course use whatever decorators you redefined. + +That is, imagine the following parent class: + +```py +class DummyModel(nn.Module): + ... + + @decorator(...) + def forward(...): + # do stuff here ``` -This is useful want you **don't** want to unravel the call to `super()`, and you want to differentiate which super init call you are doing! -### Special naming -We now also support special cases like -```python -class GemmaVisionModel(CLIPModel): +Then, if you simply override the method, it will produce (modular on the left, unravelled code on the right): + +```py +class NewModel(DummyModel): | class NewModel(nn.Module): + ... | ... + | + def forward(...): | @decorator(...) + ... | def forward(...): + | ... +``` + +That is, it keeps the parent's decorators by default. However, if you do: + +```py +class NewModel(DummyModel): | class NewModel(nn.Module): + ... | ... + | + @my_new_decorator(...) | @my_new_decorator(...) + def forward(...): | def forward(...): + ... | ... +``` + +Then it keeps your own new decorator. + +### The super_kwargs special case + +In the above case about decorators, what if the `forward` method is really long, and I just want to switch the decorators?
Do I really have to redefine it all and copy/paste the body just for the decorator? Fortunately, no. If you followed until this point, you know that you can use `super().forward(...)`, and it will unravel the parent's body automatically. But what if there are plenty of arguments in the function's signature, and we are very lazy? For that use-case, we introduced the special syntax `**super_kwargs` in the overridden method signature. It basically means: "unravel all the parent's signature arguments here". For example, a common signature in the `ForCausalLM` model is the following (copied from llama's modeling): + +```py +class LlamaForCausalLM(nn.Module): + ... + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + **kwargs: Unpack[KwargsForCausalLM], + ) -> Union[Tuple, CausalLMOutputWithPast]: + ... +``` + +As you can see, this is a rather long and complicated signature. But if you do the following (as usual, modular on the left, unravelled code by the linter on the right): + +```py +class NewModelForCausalLM(LlamaForCausalLM): | class NewModelForCausalLM(nn.Module): + ... | ... + | + @my_new_decorator | @my_new_decorator + def forward(self, **super_kwargs): | def forward( + super().forward(**super_kwargs) | self, + | input_ids: torch.LongTensor = None, + | attention_mask: Optional[torch.Tensor] = None, + | position_ids: Optional[torch.LongTensor] = None, + | past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = |None, + | inputs_embeds: Optional[torch.FloatTensor] = None, + | labels: Optional[torch.LongTensor] = None, + | use_cache: Optional[bool] = None, + | output_attentions: Optional[bool] = None, + | output_hidden_states: Optional[bool] = None, + | return_dict: Optional[bool] = None, + | cache_position: Optional[torch.LongTensor] = None, + | num_logits_to_keep: int = 0, + | **kwargs: Unpack[KwargsForCausalLM], + | ) -> Union[Tuple, CausalLMOutputWithPast]: + | ... +``` + +and the `**super_kwargs` syntax unravelled all the arguments, while the `super().forward()` syntax unravelled the whole body! As you can see, this is a great combo when you just want to switch the decorators, as it is very easy to use, and makes it explicit that the only change you want to apply is the decorator. + +However, we want to make it clear that the `**super_kwargs` syntax is not a replacement for being explicit when you redefine your methods: if you actually overwrite the method (i.e. you do not call `super().method()`), then we want you to explicitly write the signature as you would usually. This is only a short-cut when switching decorators, and a few other niche cases. + +### The DOCSTRING variables + +Usually, if whatever object is defined both in the modular file and the modeling file from which we inherit, then the definition of the modular takes precedence.
However, this is not the case for assignments containing the pattern `DOCSTRING`. Indeed, we usually have variables defined as `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. These are just very big blocks of, well, docstrings... But they are (almost) always exactly the same up to the model name! And modular automatically rewrites the names everywhere! For this reason, assignments containing the pattern will _always_ use the definition found in the source file instead of the modular file. This is extremely handy if we need the variable reference somewhere (e.g. to redefine a decorator) but we do not want to clutter the modular file with 100 lines of docstrings which are always the same. This allows us to do the following (taken from [modular_starcoder2.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/starcoder2/modular_starcoder2.py#L146)) + +```py +STARCODER2_INPUTS_DOCSTRING = None # will be automatically redefined + +class Starcoder2Model(MistralModel): + ... + + @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) + def forward(...): + ... +``` + +and here, the linter will correctly take the same definition of the docstring as in `Mistral`, without having to clutter the modular file! + +## Limitations + +Now, let's go over some of the limitations of modular. + +### Special naming (essentially for multimodal models) + +Because our linter automatically renames everything when inheriting from a class (defining `class NewModelMLP(LlamaMLP)` will rename every mention of `Llama` to `NewModel`, and recursively for all dependencies grabbed), it has somewhat strict rules when it comes to naming. For consistency reasons, we require that you always use the same class name prefix when inheriting different classes from the same file. For example, doing: + +```py +class MyModelIncredibleMLP(LlamaMLP): + ... + +class MyModelDecoderLayer(LlamaDecoderLayer): + ... +``` + +is not recommended, first because it breaks standards in the library and we do not like it, and second because the linter will not know how to rename potential high-order dependencies (should we use `MyModelIncredible`, or `MyModel`?). + +If there are no dependencies to grab implicitly however (see [this section](#dependencies) to understand implicit dependencies), local renaming (for a single class) will not be an issue and the linter will not complain. But make sure to explicitly redefine every other mention of the class with the new name pattern! For example, in the example above, all mentions of `LlamaMLP` in other modules inherited should be explicitly replaced by mentions of `MyModelIncredibleMLP`, otherwise the linter may add a new and unwanted `MyModelMLP` class! + +In any case, if an ambiguous case is detected, the linter will raise a warning such as + +``` +We detected multiple prefix names when inheriting from transformers.models.llama.modeling_llama: ('Emu3Text', 'Emu3'). We will only use the most used 'Emu3' prefix when grabbing args and dependencies. Make sure to subclass the intermediate classes with the prefix you want (if different from 'Emu3') or use a single prefix in all the modular (best). +``` + +explaining what is happening, and which prefix is used by default for grabbing dependencies.
As explained, if you see automatic dependencies appear with a prefix but you want another one, then explicitly rename these classes locally with a simple `pass` class, such as + +```py +class Emu3TextMLP(LlamaMLP): pass +``` + +Such warnings and renaming complications usually only arise when defining multimodal models, when you want to define e.g. the text part of your model from an existing model, but want to add the part `Text` to the class names to make it clear what they refer to in the multimodal setup. + +### Automatic docstrings issue (mostly for Configs) + +When inheriting a Config class and adding or deleting some attributes, it may be tempting to only redefine the new attributes in the docstring and hope that modular will do the rest. And similarly, when deleting an argument, it may be tempting to do nothing and hope that modular will remove it from the docstring. However, due to current limitations of our linter, this is not yet supported. Thus, if you are in this case, you need to put the whole docstring (as it should appear in the end, with the correct arguments and default values) directly in the modular file under the class definition. \ No newline at end of file diff --git a/docs/source/en/perf_infer_cpu.md b/docs/source/en/perf_infer_cpu.md index c0e017c02087..7f8b525b3df6 100644 --- a/docs/source/en/perf_infer_cpu.md +++ b/docs/source/en/perf_infer_cpu.md @@ -41,8 -41,7 @@ Enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] metho ```py from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder") -model.to_bettertransformer() +model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder", torch_dtype="auto") ``` ## TorchScript @@ -54,7 +53,7 @@ For a gentle introduction to TorchScript, see the [Introduction to PyTorch Torch With the [`Trainer`] class, you can enable JIT mode for CPU inference by setting the `--jit_mode_eval` flag: ```bash -python run_qa.py \ +python examples/pytorch/question-answering/run_qa.py \ --model_name_or_path csarron/bert-base-uncased-squad-v1 \ --dataset_name squad \ --do_eval \ @@ -86,7 +85,7 @@ pip install intel_extension_for_pytorch Set the `--use_ipex` and `--jit_mode_eval` flags in the [`Trainer`] class to enable JIT mode with the graph optimizations: ```bash -python run_qa.py \ +python examples/pytorch/question-answering/run_qa.py \ --model_name_or_path csarron/bert-base-uncased-squad-v1 \ --dataset_name squad \ --do_eval \ diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md index 997509441152..ea9421747c13 100644 --- a/docs/source/en/perf_infer_gpu_multi.md +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -64,5 +64,5 @@ You can benefit from considerable speedups for inference, especially for inputs For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows:
- [expected-speedup benchmark image (previous)] + [expected-speedup benchmark image (updated)]
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 84109746f959..8087008f8772 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -37,14 +37,19 @@ FlashAttention-2 is experimental and may change considerably in future versions. 2. partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them FlashAttention-2 is currently supported for the following architectures: +* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration) * [Bark](https://huggingface.co/docs/transformers/model_doc/bark#transformers.BarkModel) +* [Bamba](https://huggingface.co/docs/transformers/model_doc/bamba#transformers.BambaModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon) * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) +* [Cohere2](https://huggingface.co/docs/transformers/model_doc/cohere2#transformers.Cohere2Model) * [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) +* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) +* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) * [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2) @@ -64,6 +69,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next) * [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi) * [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava) * [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava) @@ -71,13 +77,14 @@ FlashAttention-2 is currently supported for the following architectures: * [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) +* [ModernBert](https://huggingface.co/docs/transformers/model_doc/modernbert#transformers.ModernBert) * [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel) * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron) * 
[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) -* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model) +* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) @@ -90,6 +97,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Qwen2Audio](https://huggingface.co/docs/transformers/model_doc/qwen2_audio#transformers.Qwen2AudioEncoder) * [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel) * [Qwen2VL](https://huggingface.co/docs/transformers/model_doc/qwen2_vl#transformers.Qwen2VLModel) +* [Qwen2.5VL](https://huggingface.co/docs/transformers/model_doc/qwen2_5_vl#transformers.Qwen2_5_VLModel) * [RAG](https://huggingface.co/docs/transformers/model_doc/rag#transformers.RagModel) * [SpeechEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/speech_encoder_decoder#transformers.SpeechEncoderDecoderModel) * [VisionEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/vision_encoder_decoder#transformers.VisionEncoderDecoderModel) @@ -102,6 +110,8 @@ FlashAttention-2 is currently supported for the following architectures: * [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip) * [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel) * [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel) +* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/heliumtransformers.HeliumModel) +* [Zamba2](https://huggingface.co/docs/transformers/model_doc/zamba2) You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request. 
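For reference, enabling FlashAttention-2 for any of the architectures listed above follows the same pattern: pass `attn_implementation="flash_attention_2"` to [`~PreTrainedModel.from_pretrained`] and load the model in half precision. A minimal sketch (the checkpoint below is only an illustrative example):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mistralai/Mistral-7B-v0.1"  # illustrative checkpoint; any supported architecture works the same way

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,               # FlashAttention-2 only supports fp16/bf16
    attn_implementation="flash_attention_2",  # requires the `flash-attn` package to be installed
    device_map="auto",
)
```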
@@ -216,8 +226,11 @@ PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.o For now, Transformers supports SDPA inference and training for the following architectures: * [Albert](https://huggingface.co/docs/transformers/model_doc/albert#transformers.AlbertModel) +* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration) * [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel) +* [Bamba](https://huggingface.co/docs/transformers/model_doc/bamba#transformers.BambaModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) +* [Beit](https://huggingface.co/docs/transformers/model_doc/beit#transformers.BeitModel) * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) * [BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt#transformers.BioGptModel) * [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel) @@ -225,16 +238,22 @@ For now, Transformers supports SDPA inference and training for the following arc * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel) * [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) +* [Cohere2](https://huggingface.co/docs/transformers/model_doc/cohere2#transformers.Cohere2Model) * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) +* [data2vec_vision](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecVisionModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) +* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) +* [Dinov2_with_registers](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) * [EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder_decoder#transformers.EncoderDecoderModel) +* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) +* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel) * [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2) * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) * [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel) @@ -242,7 +261,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) * 
[Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model) * [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model) -* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel) +* [I-JEPA](https://huggingface.co/docs/transformers/model_doc/ijepa#transformers.IJepaModel) * [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel) * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel) * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel) @@ -252,16 +271,18 @@ For now, Transformers supports SDPA inference and training for the following arc * [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) * [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100#transformers.M2M100Model) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) +* [ModernBert](https://huggingface.co/docs/transformers/model_doc/modernbert#transformers.ModernBert) * [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel) * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) -* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model) +* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) @@ -269,8 +290,8 @@ For now, Transformers supports SDPA inference and training for the following arc * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model) * [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel) * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) -* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel) * [mBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) * 
[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel) @@ -278,6 +299,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model) * [Qwen2Audio](https://huggingface.co/docs/transformers/model_doc/qwen2_audio#transformers.Qwen2AudioEncoder) * [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel) +* [Qwen2.5VL](https://huggingface.co/docs/transformers/model_doc/qwen2_5_vl#transformers.Qwen2_5_VLModel) * [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel) * [Sew](https://huggingface.co/docs/transformers/main/en/model_doc/sew#transformers.SEWModel) * [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip) @@ -306,6 +328,8 @@ For now, Transformers supports SDPA inference and training for the following arc * [XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaModel) * [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl#transformers.XLMRobertaXLModel) * [YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos#transformers.YolosModel) +* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/helium#transformers.HeliumModel) +* [Zamba2](https://huggingface.co/docs/transformers/model_doc/zamba2) @@ -320,10 +344,11 @@ In that case, you should see a warning message and we will fall back to the (slo -By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager: +By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.nn.attention.sdpa_kernel`](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) as a context manager: ```diff import torch ++ from torch.nn.attention import SDPBackend, sdpa_kernel from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") @@ -332,7 +357,7 @@ model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=to input_text = "Hello my dog is cute and" inputs = tokenizer(input_text, return_tensors="pt").to("cuda") -+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): ++ with sdpa_kernel(SDPBackend.FLASH_ATTENTION): outputs = model.generate(**inputs) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) @@ -405,7 +430,7 @@ To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `d from transformers import AutoModelForCausalLM model_name = "bigscience/bloom-2b5" -model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True) ``` To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU.
For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU: @@ -414,7 +439,7 @@ To load a model in 4-bit for inference with multiple GPUs, you can control how m max_memory_mapping = {0: "600MB", 1: "1GB"} model_name = "bigscience/bloom-3b" model_4bit = AutoModelForCausalLM.from_pretrained( - model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping + model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping ) ``` @@ -432,7 +457,7 @@ To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `d from transformers import AutoModelForCausalLM, BitsAndBytesConfig model_name = "bigscience/bloom-2b5" -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True)) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)) ``` If you're loading a model in 8-bit for text generation, you should use the [`~transformers.GenerationMixin.generate`] method instead of the [`Pipeline`] function which is not optimized for 8-bit models and will be slower. Some sampling strategies, like nucleus sampling, are also not supported by the [`Pipeline`] for 8-bit models. You should also place all inputs on the same device as the model: @@ -442,7 +467,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig model_name = "bigscience/bloom-2b5" tokenizer = AutoTokenizer.from_pretrained(model_name) -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True)) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)) prompt = "Hello, my llama is cute" inputs = tokenizer(prompt, return_tensors="pt").to("cuda") @@ -450,13 +475,13 @@ generated_ids = model.generate(**inputs) outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) ``` -To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU: +To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. 
For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU: ```py max_memory_mapping = {0: "1GB", 1: "2GB"} model_name = "bigscience/bloom-3b" model_8bit = AutoModelForCausalLM.from_pretrained( - model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping + model_name, torch_dtype="auto", device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping ) ``` @@ -506,6 +531,7 @@ It is often possible to combine several of the optimization techniques described ```py import torch +from torch.nn.attention import SDPBackend, sdpa_kernel from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # load model in 4-bit @@ -515,7 +541,7 @@ quantization_config = BitsAndBytesConfig( ) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") -model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config) # enable BetterTransformer model = model.to_bettertransformer() @@ -524,7 +550,7 @@ input_text = "Hello my dog is cute and" inputs = tokenizer(input_text, return_tensors="pt").to("cuda") # enable FlashAttention -with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): +with sdpa_kernel(SDPBackend.FLASH_ATTENTION): outputs = model.generate(**inputs) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md index acc424930b1c..2155a403b2b7 100644 --- a/docs/source/en/perf_torch_compile.md +++ b/docs/source/en/perf_torch_compile.md @@ -27,7 +27,7 @@ To compile any computer vision model of your choice, call `torch.compile()` on t ```diff from transformers import AutoModelForImageClassification -model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to("cuda") +model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to(DEVICE) + model = torch.compile(model) ``` @@ -47,15 +47,17 @@ from PIL import Image import requests import numpy as np from transformers import AutoImageProcessor, AutoModelForImageClassification +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) url = 'http://images.cocodataset.org/val2017/000000039769.jpg' image = Image.open(requests.get(url, stream=True).raw) processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to("cuda") +model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to(device) model = torch.compile(model) -processed_input = processor(image, return_tensors='pt').to(device="cuda") +processed_input = processor(image, return_tensors='pt').to(device) with torch.no_grad(): _ = model(**processed_input) @@ -66,13 +68,15 @@ with torch.no_grad(): ```python from transformers import AutoImageProcessor, AutoModelForObjectDetection +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") -model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to("cuda") +model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device) model = torch.compile(model) texts = ["a photo of a cat", "a photo of a dog"] -inputs = processor(text=texts, images=image, return_tensors="pt").to("cuda") +inputs = processor(text=texts, images=image, return_tensors="pt").to(device) with torch.no_grad(): _ = model(**inputs) @@ -82,11 +86,13 @@ with torch.no_grad(): ```python from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") -model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to("cuda") +model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to(device) model = torch.compile(model) -seg_inputs = processor(images=image, return_tensors="pt").to("cuda") +seg_inputs = processor(images=image, return_tensors="pt").to(device) with torch.no_grad(): _ = model(**seg_inputs) diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md index 7ef98932d537..ab2f735ecbdd 100644 --- a/docs/source/en/perf_train_cpu.md +++ b/docs/source/en/perf_train_cpu.md @@ -51,7 +51,7 @@ To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex` Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - Training with IPEX using BF16 auto mixed precision on CPU: -
- python run_qa.py \
+ python examples/pytorch/question-answering/run_qa.py \
 --model_name_or_path google-bert/bert-base-uncased \
 --dataset_name squad \
 --do_train \
diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md
index ed782caca3b1..d6a029c471de 100644
--- a/docs/source/en/perf_train_cpu_many.md
+++ b/docs/source/en/perf_train_cpu_many.md
@@ -75,7 +75,7 @@ The following command enables training with 2 processes on one Xeon node, with o
  export CCL_WORKER_COUNT=1
  export MASTER_ADDR=127.0.0.1
  mpirun -n 2 -genv OMP_NUM_THREADS=23 \
- python3 run_qa.py \
+ python3 examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
@@ -104,7 +104,7 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an
  export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
  mpirun -f hostfile -n 4 -ppn 2 \
  -genv OMP_NUM_THREADS=23 \
- python3 run_qa.py \
+ python3 examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md
index 858da99e7bc3..c810a18470a0 100644
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@@ -553,7 +553,7 @@ It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.
 Examples:
 * Sample
 
-Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes be 5 x 2 x 512.
+Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512 (see the short sketch below).
 
 * Operator
 
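Editor's note: to make the shape bookkeeping in the hunk above concrete, here is a tiny illustrative sketch in plain PyTorch (not part of the original page):

```py
import torch

# 10 samples of sequence length 512, split along the sample dimension for 2 devices.
batch = torch.randn(10, 512)
shards = torch.chunk(batch, 2, dim=0)   # two shards of shape (5, 512)
per_device = torch.stack(shards)        # shape (2, 5, 512): the 5 x 2 x 512 split described above
print(shards[0].shape, per_device.shape)
```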
diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md
index ac7ef8504e72..525f0d567bcb 100644
--- a/docs/source/en/perplexity.md
+++ b/docs/source/en/perplexity.md
@@ -73,8 +73,9 @@ Let's demonstrate this process with GPT-2.
 
 ```python
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+from accelerate.test_utils.testing import get_backend
 
-device = "cuda"
+device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
 model_id = "openai-community/gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
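Editor's note: continuing the snippet in the hunk above (the prompt string is illustrative), perplexity is just the exponential of the average negative log-likelihood the model returns as its loss:

```py
import torch

encodings = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
input_ids = encodings.input_ids.to(device)

with torch.no_grad():
    # With labels=input_ids the model returns the mean negative log-likelihood.
    loss = model(input_ids, labels=input_ids).loss
print(f"perplexity: {torch.exp(loss).item():.2f}")
```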
diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md
index 3363c68ea417..357bc7f636ec 100644
--- a/docs/source/en/pipeline_tutorial.md
+++ b/docs/source/en/pipeline_tutorial.md
@@ -59,10 +59,10 @@ Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2)
 benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with  
 Wav2Vec2.
 
-Let's give it a try here to see how it performs:
+Let's give it a try here to see how it performs. Set `torch_dtype="auto"` to automatically load the most memory-efficient data type the weights are stored in.
 
 ```py
->>> transcriber = pipeline(model="openai/whisper-large-v2")
+>>> transcriber = pipeline(model="openai/whisper-large-v2", torch_dtype="auto")
 >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
 {'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
 ```
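Editor's note: as a quick, illustrative check of what `torch_dtype="auto"` resolved to, the underlying model is exposed on the pipeline object:

```py
>>> from transformers import pipeline

>>> transcriber = pipeline(model="openai/whisper-large-v2", torch_dtype="auto")
>>> transcriber.model.dtype  # the dtype recorded in the checkpoint's config, e.g. torch.float16
```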
diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md
index e9447555e824..6c6b92d0a6e5 100644
--- a/docs/source/en/quantization/bitsandbytes.md
+++ b/docs/source/en/quantization/bitsandbytes.md
@@ -64,7 +64,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained(
 )
 ```
 
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file.
 
 ```py
 import torch
@@ -75,7 +75,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 model_8bit = AutoModelForCausalLM.from_pretrained(
     "facebook/opt-350m", 
     quantization_config=quantization_config, 
-    torch_dtype=torch.float32
+    torch_dtype="auto"
 )
 model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
 ```
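Editor's note: as an illustrative sanity check, the dtype that `torch_dtype="auto"` resolves to is the one recorded in the checkpoint's configuration:

```py
from transformers import AutoConfig

# torch_dtype="auto" falls back to the `torch_dtype` entry of the model's config.json.
print(AutoConfig.from_pretrained("facebook/opt-350m").torch_dtype)  # e.g. torch.float16
```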
@@ -112,7 +112,7 @@ model_4bit = AutoModelForCausalLM.from_pretrained(
 )
 ```
 
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file.
 
 ```py
 import torch
@@ -123,7 +123,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)
 model_4bit = AutoModelForCausalLM.from_pretrained(
     "facebook/opt-350m",
     quantization_config=quantization_config, 
-    torch_dtype=torch.float32
+    torch_dtype="auto"
 )
 model_4bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
 ```
@@ -190,6 +190,7 @@ Now load your model with the custom `device_map` and `quantization_config`:
 ```py
 model_8bit = AutoModelForCausalLM.from_pretrained(
     "bigscience/bloom-1b7",
+    torch_dtype="auto",
     device_map=device_map,
     quantization_config=quantization_config,
 )
@@ -212,6 +213,7 @@ quantization_config = BitsAndBytesConfig(
 
 model_8bit = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype="auto",
     device_map=device_map,
     quantization_config=quantization_config,
 )
@@ -232,6 +234,7 @@ quantization_config = BitsAndBytesConfig(
 
 model_8bit = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype="auto",
     device_map="auto",
     quantization_config=quantization_config,
 )
@@ -275,7 +278,7 @@ nf4_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4",
 )
 
-model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
+model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", quantization_config=nf4_config)
 ```
 
 For inference, the `bnb_4bit_quant_type` does not have a huge impact on performance. However, to remain consistent with the model weights, you should use the `bnb_4bit_compute_dtype` and `torch_dtype` values.
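Editor's note: a minimal sketch of that recommendation (the checkpoint name is illustrative), keeping `bnb_4bit_compute_dtype` and `torch_dtype` aligned:

```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation on the 4-bit weights
)

model_nf4 = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",            # illustrative checkpoint
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,     # matches bnb_4bit_compute_dtype, as recommended above
)
```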
@@ -292,7 +295,7 @@ double_quant_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 
-model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", quantization_config=double_quant_config)
+model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", torch_dtype="auto", quantization_config=double_quant_config)
 ```
 
 ## Dequantizing `bitsandbytes` models
diff --git a/docs/source/en/quantization/fbgemm_fp8.md b/docs/source/en/quantization/fbgemm_fp8.md
index ff9e18f823c9..61cf8a059bf2 100644
--- a/docs/source/en/quantization/fbgemm_fp8.md
+++ b/docs/source/en/quantization/fbgemm_fp8.md
@@ -33,13 +33,14 @@ pip install --upgrade accelerate fbgemm-gpu torch
 
 If you are having issues with fbgemm-gpu and torch library, you might need to install the nightly release. You can follow the instruction [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch)
 
+By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file and automatically use the most memory-optimal data type.
 
 ```py
 from transformers import FbgemmFp8Config, AutoModelForCausalLM, AutoTokenizer
 
 model_name = "meta-llama/Meta-Llama-3-8B"
 quantization_config = FbgemmFp8Config()
-quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md
index 5713ef4132a9..1534a977f343 100644
--- a/docs/source/en/quantization/gptq.md
+++ b/docs/source/en/quantization/gptq.md
@@ -22,15 +22,42 @@ Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google
 
 
 
-The [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save your memory-usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory, and you can also expect a speedup in inference because using a lower bitwidth takes less time to communicate.
+Both the [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) libraries implement the GPTQ algorithm, a post-training quantization technique in which each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. The weights are quantized to int4, stored as int32 (8 packed int4 values), and dequantized (restored) to fp16 on the fly during inference. This saves almost 4x of memory because the int4 weights are dequantized inside a fused kernel rather than materialized in fp16 in GPU memory. You can also expect a substantial inference speedup because the lower bitwidth requires less memory bandwidth.
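Editor's note: to make the "almost 4x" figure concrete, a back-of-the-envelope sketch (illustrative arithmetic only, ignoring embeddings and quantization metadata):

```py
# Approximate weight storage for a 7B-parameter model.
params = 7e9
fp16_gb = params * 16 / 8 / 1e9  # ~14.0 GB in fp16
int4_gb = params * 4 / 8 / 1e9   # ~3.5 GB packed as int4
print(f"fp16: {fp16_gb:.1f} GB, int4: {int4_gb:.1f} GB, ratio: {fp16_gb / int4_gb:.0f}x")
```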
 
-Before you begin, make sure the following libraries are installed:
+[GPTQModel](https://github.com/ModelCloud/GPTQModel) started as a maintained fork of AutoGPTQ but has since diverged from it, with the following major differences.
+
+* Model support: GPTQModel continues to support all of the latest LLM models.
+* Multimodal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models. 
+* Platform support: Linux, macOS (Apple Silicon), and Windows 11.
+* Hardware support: NVIDIA CUDA, AMD ROCm, Apple Silicon M1/MPS /CPU, Intel/AMD CPU, and Intel Datacenter Max/Arc GPUs.
+* Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization.
+* IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max/Arc GPUs) support.
+* Updated Marlin kernel from Neural Magic optimized for A100 (Ampere).
+* Updated kernels with auto-padding for legacy model support and models with non-uniform in/out-features. 
+* Faster quantization, lower memory usage, and more accurate default quantization via GPTQModel quantization APIs.
+* User and developer friendly APIs. 
+
+
+[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) will likely be deprecated in the future due to the lack of continued support for new models and features. 
+
+Before you begin, make sure the following libraries are installed and updated to the latest release:
 
 ```bash
-pip install auto-gptq
 pip install --upgrade accelerate optimum transformers
 ```
 
+Then install either GPTQModel or AutoGPTQ.
+
+```bash
+pip install gptqmodel --no-build-isolation
+```
+
+or
+
+```bash
+pip install auto-gptq --no-build-isolation
+```
+
 To quantize a model (currently only supported for text models), you need to create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset.
 
 ```py
@@ -92,9 +119,22 @@ from transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
 ```
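Editor's note: for reference, a minimal sketch of the quantization step described earlier on this page (the checkpoint and the `"c4"` calibration dataset are illustrative choices):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

# Calibration and quantization happen while loading; a GPU is recommended.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=gptq_config
)
```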
 
+## Marlin
+
+[Marlin](https://github.com/IST-DASLab/marlin) is a 4-bit only CUDA GPTQ kernel, highly optimized for the NVIDIA A100 GPU (Ampere) architecture. Loading, dequantization, and execution of post-dequantized weights are highly parallelized, offering a substantial inference improvement versus the original CUDA GPTQ kernel. Marlin is only available for quantized inference and does not support model quantization.
+
+Marlin inference can be activated with the `backend` parameter in [`GPTQConfig`].
+
+```py
+
+from transformers import AutoModelForCausalLM, GPTQConfig
+
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=GPTQConfig(bits=4, backend="marlin"))
+```
+
 ## ExLlama
 
-[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter:
+[ExLlama](https://github.com/turboderp/exllama) is a CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter:
 
 ```py
 import torch
@@ -110,11 +150,11 @@ Only 4-bit models are supported, and we recommend deactivating the ExLlama kerne
 
 
 
-The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.
+The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ or GPTQModel, then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.
 
 ```py
 import torch
 from transformers import AutoModelForCausalLM, GPTQConfig
 gptq_config = GPTQConfig(bits=4, use_exllama=False)
 model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config)
-```
\ No newline at end of file
+```
diff --git a/docs/source/en/quantization/higgs.md b/docs/source/en/quantization/higgs.md
new file mode 100644
index 000000000000..d2aa9c9dc497
--- /dev/null
+++ b/docs/source/en/quantization/higgs.md
@@ -0,0 +1,66 @@
+
+
+# HIGGS
+
+HIGGS is a 0-shot quantization algorithm that combines Hadamard preprocessing with MSE-Optimal quantization grids to achieve lower quantization error and SOTA performance. You can find more information in the paper [arxiv.org/abs/2411.17525](https://arxiv.org/abs/2411.17525).
+
+Runtime support for HIGGS is implemented through [FLUTE](https://arxiv.org/abs/2407.10960), and its [library](https://github.com/HanGuo97/flute).
+
+## Quantization Example
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, HiggsConfig
+
+model = AutoModelForCausalLM.from_pretrained(
+    "google/gemma-2-9b-it",
+    quantization_config=HiggsConfig(bits=4),
+    device_map="auto",
+)
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
+
+tokenizer.decode(model.generate(
+    **tokenizer("Hi,", return_tensors="pt").to(model.device),
+    temperature=0.5,
+    top_p=0.80,
+)[0])
+```
+
+## Pre-quantized models
+
+Some pre-quantized models can be found in the [official collection](https://huggingface.co/collections/ISTA-DASLab/higgs-675308e432fd56b7f6dab94e) on Hugging Face Hub.
+
+## Current Limitations
+
+**Architectures**
+
+Currently, FLUTE, and HIGGS by extension, **only support Llama 3.1 and 3.0 models with 8B, 70B and 405B parameters, as well as Gemma-2 9B and 27B**. We're working on supporting a wider range of models, as well as arbitrary models, by modifying the FLUTE compilation procedure.
+
+**torch.compile**
+
+HIGGS is fully compatible with `torch.compile`. Compiling `model.forward`, as described [here](../perf_torch_compile.md), gives the following speedups on an RTX 4090 for `Llama-3.1-8B-Instruct` (forward passes/sec):
+
+| Batch Size | BF16 (With `torch.compile`) | HIGGS 4bit (No `torch.compile`) | HIGGS 4bit (With `torch.compile`) |
+|------------|-----------------------------|----------------------------------|-----------------------------------|
+| 1          | 59                          | 41                               | 124                               |
+| 4          | 57                          | 42                               | 123                               |
+| 16         | 56                          | 41                               | 120                               |
+
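Editor's note: a rough sketch of how such numbers can be reproduced, assuming the HIGGS-quantized `model` from the example above:

```py
import torch

# Compile only the forward pass; the first call triggers (slow) compilation,
# subsequent calls run the optimized kernels.
model.forward = torch.compile(model.forward)
```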
+
+**Quantized training**
+
+Currently, HIGGS doesn't support quantized training (and backward passes in general). We're working on adding support for it.
\ No newline at end of file
diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md
index 0fb72d26058e..dfe680832b19 100644
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -45,30 +45,50 @@ In short, supporting a wide range of quantization methods allows you to pick the
 
 Use the table below to help you decide which quantization method to use.
 
-| Quantization method                 | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library                             |
-|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
-| [AQLM](./aqlm)                                | 🔴                       |  🟢   |     🟢     | 🔴              | 🔴                     | 🔴         | 🟢                      | 1 / 2          | 🟢                                   | 🟢            | 🟢                      | https://github.com/Vahe1994/AQLM            |
-| [AWQ](./awq) | 🔴                       | 🟢   | 🟢        | 🟢              | 🔴                     | 🟢         | ?                       | 4              | 🟢                                   | 🟢            | 🟢                      | https://github.com/casper-hansen/AutoAWQ    |
-| [bitsandbytes](./bitsandbytes)     | 🟢            | 🟡 *   |     🟢     | 🟡 *            | 🔴 **    | 🟡 *       | 🔴    (soon!)          | 4 / 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
-| [compressed-tensors](./compressed_tensors)                        | 🔴                       | 🟢   |     🟢     | 🟢              | 🔴                     | 🔴         | 🔴                       | 1 - 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
-| [EETQ](./eetq)                                | 🟢                       | 🔴   | 🟢        | 🔴              | 🔴         | 🔴                     | ?                       | 8              | 🟢                                   | 🟢            | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
-| GGUF / GGML (llama.cpp)             | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🔴         | 🔴                       | 1 - 8          | 🔴                                   | [See GGUF section](../gguf)                | [See GGUF section](../gguf)                      | https://github.com/ggerganov/llama.cpp      |
-| [GPTQ](./gptq)                                | 🔴                       | 🔴   | 🟢        | 🟢              | 🔴                     | 🔴         | 🔴                       | 2 - 3 - 4 - 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
-| [HQQ](./hqq)                                 | 🟢                       | 🟢    | 🟢        | 🔴              | 🔴                     | 🔴         | 🟢                       | 1 - 8          | 🟢                                   | 🔴            | 🟢                      | https://github.com/mobiusml/hqq/            |
-| [optimum-quanto](./quanto)                              | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🔴         | 🟢                       | 2 / 4 / 8      | 🔴                                   | 🔴            | 🟢                      | https://github.com/huggingface/optimum-quanto       |
-| [FBGEMM_FP8](./fbgemm_fp8.md)                              | 🟢                       | 🔴    | 🟢        | 🔴              | 🔴                      | 🔴         | 🔴                        | 8      | 🔴                                   | 🟢            | 🟢                      | https://github.com/pytorch/FBGEMM       |
-| [torchao](./torchao.md)                              | 🟢                       |     | 🟢        | 🔴              | partial support (int4 weight only)       | 🔴         |                       | 4 / 8      |                                   | 🟢🔴           | 🟢                      | https://github.com/pytorch/ao       |
+| Quantization Method                           | On the fly quantization | CPU             | CUDA GPU | ROCm GPU  | Metal (Apple Silicon)              | Intel GPU       | Torch compile() | Bits          | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support  | Link to library                             |
+|-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------|
+| [AQLM](./aqlm.md)                             | 🔴                   | 🟢              |     🟢     | 🔴        | 🔴                                 | 🔴              | 🟢              | 1/2         | 🟢               | 🟢                          | 🟢                      | https://github.com/Vahe1994/AQLM            |
+| [AWQ](./awq.md)                               | 🔴                   | 🟢              | 🟢        | 🟢        | 🔴                                 | 🟢              | ?               | 4             | 🟢               | 🟢                          | 🟢                      | https://github.com/casper-hansen/AutoAWQ    |
+| [bitsandbytes](./bitsandbytes.md)             | 🟢                   | 🟡 1 |     🟢     | 🟡 1 | 🔴 2                    | 🟡 1 | 🔴 1 | 4/8         | 🟢               | 🟢                          | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
+| [compressed-tensors](./compressed_tensors.md) | 🔴                   | 🟢              |     🟢     | 🟢        | 🔴                                 | 🔴              | 🔴              | 1/8         | 🟢               | 🟢                          | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
+| [EETQ](./eetq.md)                             | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | ?               | 8             | 🟢               | 🟢                          | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
+| [GGUF / GGML (llama.cpp)](../gguf.md)         | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🔴              | 1/8         | 🔴               | [See Notes](../gguf.md)     | [See Notes](../gguf.md) | https://github.com/ggerganov/llama.cpp      |
+| [GPTQModel](./gptq.md)                        | 🔴                   | 🟢 3 | 🟢        | 🟢        | 🟢                                 | 🟢 4 | 🔴              | 2/3/4/8 | 🟢               | 🟢                          | 🟢                      | https://github.com/ModelCloud/GPTQModel        |
+| [AutoGPTQ](./gptq.md)                         | 🔴                   | 🔴              | 🟢        | 🟢        | 🔴                                 | 🔴              | 🔴              | 2/3/4/8 | 🟢               | 🟢                          | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
+| [HIGGS](./higgs.md)                           | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 2/4         | 🔴               | 🟢                          | 🟢                      | https://github.com/HanGuo97/flute           |       
+| [HQQ](./hqq.md)                               | 🟢                   | 🟢              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 1/8         | 🟢               | 🔴                          | 🟢                      | https://github.com/mobiusml/hqq/            |
+| [optimum-quanto](./quanto.md)                 | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🟢              | 2/4/8     | 🔴               | 🔴                          | 🟢                      | https://github.com/huggingface/optimum-quanto       |
+| [FBGEMM_FP8](./fbgemm_fp8.md)                 | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🔴              | 8             | 🔴               | 🟢                          | 🟢                      | https://github.com/pytorch/FBGEMM       |
+| [torchao](./torchao.md)                       | 🟢                   |                 | 🟢        | 🔴        | 🟡 5 | 🔴              |                 | 4/8         |                  | 🟢🔴                        | 🟢                      | https://github.com/pytorch/ao       |
+| [VPTQ](./vptq.md)                             | 🔴                   | 🔴              |     🟢     | 🟡        | 🔴                                 | 🔴              | 🟢              | 1/8         | 🔴               | 🟢                          | 🟢                      | https://github.com/microsoft/VPTQ            |
 
 
+  
+**1:** bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.
 
-\* bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).
+
+
+
+
+**2:** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships.
+
+
+
+
 
-We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.
+**3:** GPTQModel[CPU] supports 4-bit via IPEX on Intel/AMD and full bit range via Torch on Intel/AMD/Apple Silicon.
 
 
 
 
 
-\** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships.
+**4:** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max/Arc GPUs.
 
 
+
+
+
+**5:** torchao only supports int4 weight on Metal (Apple Silicon).
+
+
+
diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md
index 37df4ed589e8..7feadefd83d2 100644
--- a/docs/source/en/quantization/quanto.md
+++ b/docs/source/en/quantization/quanto.md
@@ -42,7 +42,9 @@ pip install optimum-quanto accelerate transformers
 
 Now you can quantize a model by passing [`QuantoConfig`] object in the [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it contains `torch.nn.Linear` layers. 
 
-The integration with transformers only supports weights quantization. For the more complex use case such as activation quantization, calibration and quantization aware training, you should use [optimum-quanto](https://github.com/huggingface/optimum-quanto) library instead. 
+The integration with transformers only supports weights quantization. For more complex use cases such as activation quantization, calibration, and quantization-aware training, you should use the [optimum-quanto](https://github.com/huggingface/optimum-quanto) library instead.
+
+By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file and automatically use the most memory-optimal data type.
 
 ```py
 from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
@@ -50,7 +52,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
 model_id = "facebook/opt-125m"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 quantization_config = QuantoConfig(weights="int8")
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda:0", quantization_config=quantization_config)
 ```
 
 Note that serialization is not supported yet with transformers but it is coming soon! If you want to save the model, you can use quanto library instead.
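Editor's note: a short usage sketch continuing the snippet above (the prompt is illustrative); generation works as usual on the quantized model:

```py
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(quantized_model.device)
outputs = quantized_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```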
diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md
index cd1d0188c33e..46fb0f8cbb9a 100644
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@@ -16,9 +16,11 @@ rendered properly in your Markdown viewer.
 Before you begin, make sure the following libraries are installed with their latest version:
 
 ```bash
-pip install --upgrade torch torchao
+# Updating 🤗 Transformers to the latest version, as the example script below uses the new auto compilation
+pip install --upgrade torch torchao transformers
 ```
 
+By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file and automatically use the most memory-optimal data type.
 
 ```py
 import torch
@@ -28,18 +30,14 @@ model_name = "meta-llama/Meta-Llama-3-8B"
 # We support int4_weight_only, int8_weight_only and int8_dynamic_activation_int8_weight
 # More examples and documentations for arguments can be found in https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques
 quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 
-# compile the quantized model to get speedup
-import torchao
-torchao.quantization.utils.recommended_inductor_config_setter()
-quantized_model = torch.compile(quantized_model, mode="max-autotune")
-
-output = quantized_model.generate(**input_ids, max_new_tokens=10)
+# auto-compile the quantized model with `cache_implementation="static"` to get speedup
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
 print(tokenizer.decode(output[0], skip_special_tokens=True))
 
 # benchmark the performance
@@ -58,11 +56,11 @@ def benchmark_fn(f, *args, **kwargs):
     return f"{(t0.blocked_autorange().mean):.3f}"
 
 MAX_NEW_TOKENS = 1000
-print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS))
+print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
 
 bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
-bf16_model = torch.compile(bf16_model, mode="max-autotune")
-print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS))
+output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile
+print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
 
 ```
 
diff --git a/docs/source/en/quantization/vptq.md b/docs/source/en/quantization/vptq.md
new file mode 100644
index 000000000000..b86e82f0a350
--- /dev/null
+++ b/docs/source/en/quantization/vptq.md
@@ -0,0 +1,111 @@
+
+
+# VPTQ 
+
+> [!TIP]
+> Try VPTQ on [Hugging Face](https://huggingface.co/spaces/microsoft/VPTQ)!
+> Try VPTQ on [Google Colab](https://colab.research.google.com/github/microsoft/VPTQ/blob/main/notebooks/vptq_example.ipynb)!
+> Know more about VPTQ on [ArXiv](https://arxiv.org/pdf/2409.17066)!
+
+Vector Post-Training Quantization ([VPTQ](https://github.com/microsoft/VPTQ)) is a novel post-training quantization method that leverages vector quantization to achieve high accuracy on LLMs at an extremely low bit-width (<2-bit). VPTQ can compress a 70B, or even a 405B, model to 1-2 bits without retraining while maintaining high accuracy.
+
+- Better accuracy at 1-2 bits (405B @ <2-bit, 70B @ 2-bit)
+- Lightweight quantization algorithm: quantizing the 405B Llama-3.1 takes only ~17 hours
+- Agile quantized inference: low decode overhead, high throughput, and low time-to-first-token (TTFT)
+
+Inference support for VPTQ is released in the `vptq` library. Make sure to install it to run the models:
+```bash
+pip install vptq
+```
+
+The library provides efficient kernels for NVIDIA/AMD GPU inference.
+
+To run VPTQ models, simply load a model that has been quantized with VPTQ:
+
+## Inference example
+**Run Llama 3.1 70b on RTX4090 (24G @ ~2bits) in real time**
+![Llama3 1-70b-prompt](https://github.com/user-attachments/assets/d8729aca-4e1d-4fe1-ac71-c14da4bdd97f)
+
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft",
+    torch_dtype="auto", 
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained("VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft")
+input_ids = tokenizer("hello, it's me", return_tensors="pt").to("cuda")
+out = quantized_model.generate(**input_ids, max_new_tokens=32, do_sample=False)
+print(tokenizer.decode(out[0], skip_special_tokens=True))
+```
+
+## Quantize your own model
+The VPTQ algorithm is available as an early release at [VPTQ](https://github.com/microsoft/VPTQ/tree/algorithm);
+check out the [tutorial](https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md) to quantize your own model.
+
+## Early Results from Tech Report
+VPTQ achieves better accuracy and higher throughput with lower quantization overhead across models of different sizes. The following experimental results are for reference only; VPTQ can achieve better outcomes under reasonable parameters, especially in terms of model accuracy and inference speed.
+
+
+| Model       | bitwidth | W2↓  | C4↓  | AvgQA↑ | tok/s↑ | mem(GB) | cost/h↓ |
+| ----------- | -------- | ---- | ---- | ------ | ------ | ------- | ------- |
+| LLaMA-2 7B  | 2.02     | 6.13 | 8.07 | 58.2   | 39.9   | 2.28    | 2       |
+|             | 2.26     | 5.95 | 7.87 | 59.4   | 35.7   | 2.48    | 3.1     |
+| LLaMA-2 13B | 2.02     | 5.32 | 7.15 | 62.4   | 26.9   | 4.03    | 3.2     |
+|             | 2.18     | 5.28 | 7.04 | 63.1   | 18.5   | 4.31    | 3.6     |
+| LLaMA-2 70B | 2.07     | 3.93 | 5.72 | 68.6   | 9.7    | 19.54   | 19      |
+|             | 2.11     | 3.92 | 5.71 | 68.7   | 9.7    | 20.01   | 19      |
+
+
+
+## More Models in [VPTQ-community](https://huggingface.co/VPTQ-community) 
+
+⚠️ The repository only provides the model quantization algorithm. 
+
+⚠️ The open-source community VPTQ-community provides models based on the technical report and quantization algorithm.
+
+
+
+**Quick Estimation of Model Bitwidth (Excluding Codebook Overhead)**:
+
+- **Model Naming Convention**: The model's name includes the **vector length** $v$, **codebook (lookup table) size**, and **residual codebook size**. For example, "Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft" is a quantized version of "Meta-Llama-3.1-70B-Instruct", where:
+  - **Vector Length**: 8
+  - **Number of Centroids**: 65536 (2^16)
+  - **Number of Residual Centroids**: 256 (2^8)
+- **Equivalent Bitwidth Calculation**:
+  - **Index**: log2(65536) = 16 bits per index; divided by the vector length 8, this is 2 bits per weight
+  - **Residual Index**: log2(256) = 8 bits per index; divided by the vector length 8, this is 1 bit per weight
+  - **Total Bitwidth**: 2 + 1 = 3 bits per weight
+- **Model Size Estimation**: 70B * 3 bits / 8 bits per Byte = 26.25 GB
+
+- **Note**: This estimate does not include the size of the codebook (lookup table), other parameter overheads, and the padding overhead for storing indices. For the detailed calculation method, please refer to **Tech Report Appendix C.2**.
+
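+As a quick sanity check, the back-of-the-envelope estimate above can be reproduced in a few lines of Python. This is only a sketch of the calculation (codebook and padding overheads are ignored, as noted above), and the variable names below simply mirror the naming convention rather than any `vptq` API:
+
+```python
+import math
+
+# Values parsed from "Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft"
+vector_length = 8        # v8
+num_centroids = 65536    # k65536
+num_res_centroids = 256  # residual codebook size
+num_params = 70e9        # 70B parameters
+
+# Bits per weight = (index bits + residual index bits) / vector length
+bits_per_weight = (math.log2(num_centroids) + math.log2(num_res_centroids)) / vector_length
+
+# Estimated model size in GB (1 GB = 1e9 bytes), excluding codebook overhead
+size_gb = num_params * bits_per_weight / 8 / 1e9
+print(f"{bits_per_weight:.2f} bits/weight, ~{size_gb:.2f} GB")  # 3.00 bits/weight, ~26.25 GB
+```
+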
+
+|            Model Series            |  Collections                              | (Estimated) Bit per weight     |
+| :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|       Llama 3.1 Nemotron 70B Instruct HF        |  [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-nemotron-70b-instruct-hf-without-finetune-671730b96f16208d0b3fe942)  | [4 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-16384-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-256-woft) |
+|       Llama 3.1 8B Instruct        |  [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-8b-instruct-without-finetune-66f2b70b1d002ceedef02d2e)  | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-65536-woft) [3.5 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-4096-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-256-woft) [2.3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft)                                                                                                                                                                                                                                                                                                                                                                                                              |
+|       Llama 3.1 70B Instruct       | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-70b-instruct-without-finetune-66f2bf454d3dd78dfee2ff11)  | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft) [2.25 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-4-woft)  [2 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-0-woft) [1.93 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-32768-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k32768-0-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k16384-0-woft) |
+|      Llama 3.1 405B Instruct       | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-405b-instruct-without-finetune-66f4413f9ba55e1a9e52cfb0) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-256-woft) [2 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-65536-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k32768-32768-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-1024-woft) [1.5 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k4096-0-woft) [1.5 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-256-woft) [1.43 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-128-woft) [1.375 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-64-woft) |
+| Mistral Large Instruct 2407 (123B) | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-mistral-large-instruct-2407-without-finetune-6711ebfb7faf85eed9cceb16) | [4 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-16384-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-4096-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-256-woft) |
+|        Qwen 2.5 7B Instruct        |  [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-7b-instruct-without-finetune-66f3e9866d3167cc05ce954a)   | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-0-woft)  [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v16-k65536-65536-woft)                                                                                                                                                                                                                                                                                                                                              |
+|       Qwen 2.5 14B Instruct        |  [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-14b-instruct-without-finetune-66f827f83c7ffa7931b8376c)  | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-0-woft)  [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v16-k65536-65536-woft)                                                                                                                                                                                                                                                                                                                                         |
+|       Qwen 2.5 32B Instruct        |  [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-32b-instruct-without-finetune-66fe77173bf7d64139f0f613)  | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k256-256-woft)                                                                                                                                                                                                                                                                                                                                          |
+|       Qwen 2.5 72B Instruct        |  [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-72b-instruct-without-finetune-66f3bf1b3757dfa1ecb481c0)  | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-256-woft) [2.38 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k1024-512-woft) [2.25 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k512-512-woft) [2.25 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-0-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-65536-woft) [1.94 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-32768-woft)                                                  |
+|  Reproduced from the tech report   |     [HF 🤗](https://huggingface.co/collections/VPTQ-community/reproduced-vptq-tech-report-baseline-66fbf1dffe741cc9e93ecf04)     | Results from the open source community for reference only, please use them responsibly.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+| Hessian and Inverse Hessian Matrix |      [HF 🤗](https://huggingface.co/collections/VPTQ-community/hessian-and-invhessian-checkpoints-66fd249a104850d17b23fd8b)      | Collected from RedPajama-Data-1T-Sample, following [Quip#](https://github.com/Cornell-RelaxML/quip-sharp/blob/main/quantize_llama/hessian_offline_llama.py)                                                                                                                                                                                                               
\ No newline at end of file
diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md
index 404b6eac7fe4..bd472aba36ac 100755
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@@ -245,13 +245,15 @@ Check out the [preprocess](./preprocessing) tutorial for more details about toke
 
 
 
-🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]:
+🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`].
+
+By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file, which automatically selects the most memory-optimal data type.
 
 ```py
 >>> from transformers import AutoModelForSequenceClassification
 
 >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype="auto")
 ```
 
 
@@ -416,12 +418,12 @@ All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn
 
 Depending on your task, you'll typically pass the following parameters to [`Trainer`]:
 
-1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module):
+1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module). Set `torch_dtype="auto"` to automatically load the most memory-efficient data type the weights are stored in.
 
    ```py
    >>> from transformers import AutoModelForSequenceClassification
 
-   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
+   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto")
    ```
 
 2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments:
@@ -551,6 +553,32 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
    >>> model.fit(tf_dataset)  # doctest: +SKIP
    ```
 
+
+## Chat with text generation models
+
+If you're working with a model that generates text as output, you can also engage in a multi-turn conversation with
+it through the `transformers-cli chat` command. This is the fastest way to interact with a model, e.g. for a
+qualitative assessment (aka vibe check).
+
+This CLI is implemented on top of our `AutoClass` abstraction, leveraging our [text generation](llm_tutorial.md) and
+[chat](chat_templating.md) tooling, and thus will be compatible with any 🤗 Transformers model. If you have the library
+[installed](installation.md), you can launch the chat session on your terminal with
+
+```bash
+transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
+```
+
+For a full list of options to launch the chat, type
+
+```bash
+transformers-cli chat -h
+```
+
+After the chat is launched, you will enter an interactive session with the model. There are special commands for this
+session as well, such as `clear` to reset the conversation. Type `help` at any moment to display all special chat
+commands, and `exit` to terminate the session.
+
+
 ## What's next?
 
 Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides!
diff --git a/docs/source/en/task_summary.md b/docs/source/en/task_summary.md
index a5e2192f8759..e06081a93d04 100644
--- a/docs/source/en/task_summary.md
+++ b/docs/source/en/task_summary.md
@@ -305,10 +305,7 @@ There are two types of language modeling:
     ...     for pred in preds
     ... ]
     >>> preds
-    [{'score': 0.2236,
-      'token': 1761,
-      'token_str': ' platform',
-      'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}]
+    [{'score': 0.224, 'token': 3944, 'token_str': ' tool', 'sequence': 'Hugging Face is a community-based open-source tool for machine learning.'}]
     ```
 
 ## Multimodal
diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md
index f3e068444ca5..e8884d327b56 100644
--- a/docs/source/en/tasks/asr.md
+++ b/docs/source/en/tasks/asr.md
@@ -20,12 +20,12 @@ rendered properly in your Markdown viewer.
 
 
 
-Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users everyday, and there are many other useful user-facing applications like live captioning and note-taking during meetings.
+Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users every day, and there are many other useful user-facing applications like live captioning and note-taking during meetings.
 
 This guide will show you how to:
 
-1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text.
-2. Use your finetuned model for inference.
+1. Fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text.
+2. Use your fine-tuned model for inference.
 
 
 
@@ -49,7 +49,7 @@ We encourage you to login to your Hugging Face account so you can upload and sha
 
 ## Load MInDS-14 dataset
 
-Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset.
+Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. This will give you a chance to experiment and make sure everything works before spending more time training on the full dataset.
 
 ```py
 >>> from datasets import load_dataset, Audio
@@ -79,13 +79,13 @@ DatasetDict({
 })
 ```
 
-While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `transcription` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method:
+While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, this guide focuses on the `audio` and `transcription`. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method:
 
 ```py
 >>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])
 ```
 
-Take a look at the example again:
+Review the example again:
 
 ```py
 >>> minds["train"][0]
@@ -112,7 +112,7 @@ The next step is to load a Wav2Vec2 processor to process the audio signal:
 >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
 ```
 
-The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model:
+The MInDS-14 dataset has a sampling rate of 8000Hz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000Hz to use the pretrained Wav2Vec2 model:
 
 ```py
 >>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
@@ -125,7 +125,7 @@ The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this informati
  'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"}
 ```
 
-As you can see in the `transcription` above, the text contains a mix of upper and lowercase characters. The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary:
+As you can see in the `transcription` above, the text contains a mix of uppercase and lowercase characters. The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary:
 
 ```py
 >>> def uppercase(example):
@@ -196,7 +196,7 @@ Now instantiate your `DataCollatorForCTCWithPadding`:
 
 ## Evaluate
 
-Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
+Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (refer to the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about loading and computing metrics):
 
 ```py
 >>> import evaluate
@@ -236,7 +236,7 @@ If you aren't familiar with finetuning a model with the [`Trainer`], take a look
 
 
 
-You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation:
+You are now ready to start training your model! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation:
 
 ```py
 >>> from transformers import AutoModelForCTC, TrainingArguments, Trainer
@@ -252,7 +252,7 @@ At this point, only three steps remain:
 
 1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the WER and save the training checkpoint.
 2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
-3. Call [`~Trainer.train`] to finetune your model.
+3. Call [`~Trainer.train`] to fine-tune your model.
 
 ```py
 >>> training_args = TrainingArguments(
@@ -289,7 +289,7 @@ At this point, only three steps remain:
 >>> trainer.train()
 ```
 
-Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model:
+Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so it's accessible to everyone:
 
 ```py
 >>> trainer.push_to_hub()
@@ -299,13 +299,13 @@ Once training is completed, share your model to the Hub with the [`~transformers
 
 
 
-For a more in-depth example of how to finetune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR.
+For a more in-depth example of how to fine-tune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR.
 
 
 
 ## Inference
 
-Great, now that you've finetuned a model, you can use it for inference!
+Great, now that you've fine-tuned a model, you can use it for inference!
 
 Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to!
 
@@ -318,7 +318,7 @@ Load an audio file you'd like to run inference on. Remember to resample the samp
 >>> audio_file = dataset[0]["audio"]["path"]
 ```
 
-The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it:
+The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it:
 
 ```py
 >>> from transformers import pipeline
diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md
index 59d6a175da82..973f95e1e955 100644
--- a/docs/source/en/tasks/audio_classification.md
+++ b/docs/source/en/tasks/audio_classification.md
@@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.
 
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.
 
 -->
@@ -20,12 +20,12 @@ rendered properly in your Markdown viewer.
 
 
 
-Audio classification - just like with text - assigns a class label output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds.
+Audio classification - just like with text - assigns a class label as output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds.
 
 This guide will show you how to:
 
-1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent.
-2. Use your finetuned model for inference.
+1. Fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent.
+2. Use your fine-tuned model for inference.
 
 
 
@@ -57,7 +57,7 @@ Start by loading the MInDS-14 dataset from the 🤗 Datasets library:
 >>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train")
 ```
 
-Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset.
+Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This will give you a chance to experiment and make sure everything works before spending more time on the full dataset.
 
 ```py
 >>> minds = minds.train_test_split(test_size=0.2)
@@ -79,13 +79,13 @@ DatasetDict({
 })
 ```
 
-While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method:
+While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you will focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method:
 
 ```py
 >>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])
 ```
 
-Take a look at an example now:
+Here's an example:
 
 ```py
 >>> minds["train"][0]
@@ -128,7 +128,7 @@ The next step is to load a Wav2Vec2 feature extractor to process the audio signa
 >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
 ```
 
-The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in it's [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model:
+The MInDS-14 dataset has a sampling rate of 8kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16kHz to use the pretrained Wav2Vec2 model:
 
 ```py
 >>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
@@ -155,7 +155,7 @@ Now create a preprocessing function that:
 ...     return inputs
 ```
 
-To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects:
+To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove unnecessary columns and rename `intent_class` to `label`, as required by the model:
 
 ```py
 >>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
@@ -208,9 +208,9 @@ You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelFor
 
 At this point, only three steps remain:
 
-1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint.
+1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir`, which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint.
 2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
-3. Call [`~Trainer.train`] to finetune your model.
+3. Call [`~Trainer.train`] to fine-tune your model.
 
 
 ```py
@@ -252,15 +252,15 @@ Once training is completed, share your model to the Hub with the [`~transformers
 
 
 
-For a more in-depth example of how to finetune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb).
+For a more in-depth example of how to fine-tune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb).
 
 
 
 ## Inference
 
-Great, now that you've finetuned a model, you can use it for inference!
+Great, now that you've fine-tuned a model, you can use it for inference!
 
-Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to!
+Load an audio file for inference. Remember to resample the sampling rate of the audio file to match the model's sampling rate, if necessary.
 
 ```py
 >>> from datasets import load_dataset, Audio
@@ -271,7 +271,7 @@ Load an audio file you'd like to run inference on. Remember to resample the samp
 >>> audio_file = dataset[0]["audio"]["path"]
 ```
 
-The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it:
+The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it:
 
 ```py
 >>> from transformers import pipeline
diff --git a/docs/source/en/tasks/image_feature_extraction.md b/docs/source/en/tasks/image_feature_extraction.md
index 80b701588b26..e55a9e2379d5 100644
--- a/docs/source/en/tasks/image_feature_extraction.md
+++ b/docs/source/en/tasks/image_feature_extraction.md
@@ -84,7 +84,7 @@ If you want to get the last hidden states before pooling, avoid passing any valu
 
 ```python
 pipe = pipeline(task="image-feature-extraction", model_name="google/vit-base-patch16-224", device=DEVICE)
-output = pipe(image_real)
+outputs = pipe(image_real)
 ```
 
 Since the outputs are unpooled, we get the last hidden states where the first dimension is the batch size, and the last two are the embedding shape.
diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md
index 041efb06c575..28bd98457ee0 100644
--- a/docs/source/en/tasks/image_text_to_text.md
+++ b/docs/source/en/tasks/image_text_to_text.md
@@ -229,7 +229,7 @@ Now let's call the `model_inference` function we created and stream the values.
 ```python
 generator = model_inference(
     user_prompt="And what is in this image?",
-    chat_history=messages,
+    chat_history=messages[:2],
     max_new_tokens=100,
     images=images
 )
diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
index 17fb363df8e2..c1ccafb6fc5d 100644
--- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
+++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
@@ -17,7 +17,7 @@ rendered properly in your Markdown viewer.
 
 [[open-in-colab]]
 
-Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between it's outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this.
+Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between its outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this.
 
 This guide demonstrates how you can distill a [fine-tuned ViT model](https://huggingface.co/merve/vit-mobilenet-beans-224) (teacher model) to a [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (student model) using the [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) of 🤗 Transformers.
 
diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md
index 06eb45eda991..18b12f216663 100644
--- a/docs/source/en/tasks/multiple_choice.md
+++ b/docs/source/en/tasks/multiple_choice.md
@@ -419,7 +419,7 @@ Get the class with the highest probability:
 ```py
 >>> predicted_class = logits.argmax().item()
 >>> predicted_class
-'0'
+0
 ```
 
 
@@ -448,7 +448,7 @@ Get the class with the highest probability:
 ```py
 >>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
 >>> predicted_class
-'0'
+0
 ```
 
 
diff --git a/docs/source/en/tasks/prompting.md b/docs/source/en/tasks/prompting.md
index 4e30fb1e0ee3..146ec328df0c 100644
--- a/docs/source/en/tasks/prompting.md
+++ b/docs/source/en/tasks/prompting.md
@@ -80,7 +80,7 @@ Run inference with decoder-only models with the `text-generation` pipeline:
 >>> prompt = "Hello, I'm a language model"
 
 >>> generator(prompt, max_length = 30)
-[{'generated_text': "Hello, I'm a language model programmer so you can use some of my stuff. But you also need some sort of a C program to run."}]
+[{'generated_text': "Hello, I'm a language model. Not a programming language at all: it's pretty simple.\n\nWhen I write a function, I mean"}]
 ```
 
 To run inference with an encoder-decoder, use the `text2text-generation` pipeline:
@@ -258,7 +258,7 @@ also be a suitable location for instructions. Typically, it's better to place th
 
 >>> for seq in sequences:
 ...     print(f"{seq['generated_text']}")
-Permaculture is an ecological design mimicking natural ecosystems to meet basic needs and prepare for climate change. It is based on traditional knowledge and scientific understanding.
+"Permaculture is an ecological design method that mimics natural ecosystems' diversity, functionality, and resilience using modern technology and indigenous knowledge. It aims to help"
 ```
 
 #### Question answering
@@ -284,7 +284,7 @@ the leading word or phrase (`"Answer:"`) to nudge the model to start generating
 
 >>> for seq in sequences:
 ...     print(f"Result: {seq['generated_text']}")
-Result: Modern tools often used to make gazpacho include
+"Result: Modern tools are used, such as immersion blenders"
 ```
 
 #### Reasoning
@@ -309,7 +309,7 @@ Let's try if we can make a model reason about a simple arithmetics task with a b
 >>> for seq in sequences:
 ...     print(f"Result: {seq['generated_text']}")
 Result: 
-There are a total of 5 groups, so there are 5 x 4=20 students in the class.
+There are a total of 50 students in the class (5 groups x 4 students per group = 20 groups, and 
 ```
 
 Correct! Let's increase the complexity a little and see if we can still get away with a basic prompt:
diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md
index 998010e67ca9..41d7fd48cf81 100644
--- a/docs/source/en/tasks/question_answering.md
+++ b/docs/source/en/tasks/question_answering.md
@@ -325,7 +325,7 @@ or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/no
 
 Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance.
 
-If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course!
+If you have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course!
 
 ## Inference
 
@@ -397,7 +397,7 @@ Tokenize the text and return TensorFlow tensors:
 >>> from transformers import AutoTokenizer
 
 >>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
->>> inputs = tokenizer(question, text, return_tensors="tf")
+>>> inputs = tokenizer(question, context, return_tensors="tf")
 ```
 
 Pass your inputs to the model and return the `logits`:
diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md
index 7d7ecf1fbab6..e16dd17dfe1f 100644
--- a/docs/source/en/tasks/summarization.md
+++ b/docs/source/en/tasks/summarization.md
@@ -283,7 +283,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]:
 ```py
 >>> from transformers.keras_callbacks import KerasMetricCallback
 
->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
 ```
 
 Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]:
diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md
index 426ba1c340fb..922cdc724117 100644
--- a/docs/source/en/tasks/translation.md
+++ b/docs/source/en/tasks/translation.md
@@ -290,7 +290,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]:
 ```py
 >>> from transformers.keras_callbacks import KerasMetricCallback
 
->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
 ```
 
 Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]:
diff --git a/docs/source/en/tasks/video_text_to_text.md b/docs/source/en/tasks/video_text_to_text.md
index fcc1c86e8bd7..4c10907e4571 100644
--- a/docs/source/en/tasks/video_text_to_text.md
+++ b/docs/source/en/tasks/video_text_to_text.md
@@ -47,7 +47,7 @@ model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
 processor = LlavaProcessor.from_pretrained(model_id)
 
 model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
-model.to("cuda")
+model.to("cuda") # can also be xpu, mps, npu etc. depending on your hardware accelerator
 ```
 
 Some models directly consume the `