Commit da339e4

Merge branch 'habana_main' into Enc_dec_mss
jkaniecki authored Jan 13, 2025
2 parents f93b319 + c245ef0 commit da339e4
Showing 6 changed files with 139 additions and 13 deletions.
10 changes: 10 additions & 0 deletions .github/actionlint.yaml
@@ -0,0 +1,10 @@
self-hosted-runner:
  # Labels of self-hosted runner in array of strings.
  labels:
    - generic-runner
paths:
  .github/workflows/trigger_jenkins.yml:
    ignore:
      - shellcheck reported issue in this script: SC2116:.+
      - shellcheck reported issue in this script: SC2086:.+
      - shellcheck reported issue in this script: SC2001:.+
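The entries under ignore are regular expressions that actionlint matches against the findings its built-in shellcheck pass raises for the run: blocks of trigger_jenkins.yml (SC2116: useless echo, SC2086: unquoted expansion, SC2001: sed where parameter expansion would do). A minimal sketch of how such a pattern filters a finding — the sample message below is made up for illustration:

    import re

    # Hypothetical shellcheck finding, in the shape actionlint reports them.
    finding = ("shellcheck reported issue in this script: "
               "SC2086:info:3:14: Double quote to prevent globbing and word splitting")

    pattern = re.compile(r"shellcheck reported issue in this script: SC2086:.+")
    print(bool(pattern.match(finding)))  # True -> the finding is suppressed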
107 changes: 102 additions & 5 deletions .github/workflows/trigger_jenkins.yml
@@ -6,11 +6,108 @@ on:
permissions:
  pull-requests: write
jobs:
  TriggerJenkinsTests:
  DependencyReview:
    name: Dependency Review
    runs-on: ubuntu-latest
    steps:
      - name: Trigger Jenkins Tests
      - name: 'Checkout Repository'
        uses: actions/checkout@v4
      - name: 'Dependency Review'
        uses: actions/dependency-review-action@v4
        with:
          fail-on-severity: high
  CodeQLScan:
    name: CodeQL Scan
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: python
          build-mode: none
      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3
        with:
          category: "/language:python"
          upload: "never"
  CalculateJobs:
    runs-on: generic-runner
    name: Calculate Tests To Trigger
    needs: [DependencyReview,CodeQLScan]
    outputs:
      tests_list: ${{ steps.tests.outputs.tests_list }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install YQ
        run: |
          wget https://github.com/mikefarah/yq/releases/download/v4.14.1/yq_linux_amd64.tar.gz -O - |\
          tar xz && sudo mv yq_linux_amd64 /usr/bin/yq
      - name: Calculate Tests
        id: tests
        run: |
          test_list=$(yq -oj e .jenkins/test_config.yaml | jq -c "[.stages[].steps[]]")
          echo "tests_list=${test_list}" >> "$GITHUB_OUTPUT"
  TestRun:
    name: Test / ${{matrix.tests.name}}
    needs: [CalculateJobs]
    runs-on: generic-runner
    strategy:
      fail-fast: false
      matrix:
        tests: ${{ fromJson(needs.CalculateJobs.outputs.tests_list) }}
    env:
      USERNAME: ${{ secrets.SWUSERNAME }}
      PASSWORD: ${{ secrets.SWPASSWORD }}
      POD_TEMPLATE: ${{ secrets.POD_TEMPLATE }}
      TEST_COMMAND: ${{ matrix.tests.command }}
    steps:
      - name: Download Hlctl
        run: |
          curl --show-error --silent ${{ secrets.HLCTL_ADDRESS }} | bash &> /dev/null
      - name: Config Hlctl
        run: |
          ${{ secrets.HLCTL_COMMAND }} &> /dev/null
      - name: Create Pod Template
        env:
          TARGET_BRANCH: ${{ github.base_ref }}
          RELEASED_SYNAPSE_VERSION: ${{ vars.RELEASED_SYNAPSE_VERSION }}
          BASE_BRANCH: ${{github.head_ref}}
        run: |
          if [[ $TARGET_BRANCH == "habana_main" ]]; then
            synapse_version=${RELEASED_SYNAPSE_VERSION#v}
          elif [[ $TARGET_BRANCH =~ v*.*.* ]]; then
            synapse_version=${TARGET_BRANCH#v}
          else
            echo "Cant Calculate Synapse Version, Failing The Test"
            exit 1
          fi
          synapse_build=$(curl "https://dms.habana-labs.com/api/v1.1/branch/info/v$synapse_version" | jq -r ".release_id")
          pt_version=${{ vars.PT_VERSION }}
          BUILD_TAG="Github-vLLM-Fork-${{ github.event.number }}-${{github.run_number}}"
          safe_cmd=${TEST_COMMAND//&/\\&}
          echo "Writing Pod Template To File"
          echo "${POD_TEMPLATE}" > pod.yml
          sed -i "s/##VERSION##/${synapse_version}/g" pod.yml
          sed -i "s/##BUILD##/${synapse_build}/g" pod.yml
          sed -i "s/##BUILD_TAG##/${BUILD_TAG}/g" pod.yml
          sed -i "s/##PYTORCH_VERSION##/${pt_version}/g" pod.yml
          sed -i "s|##GIT_BRANCH##|$BASE_BRANCH|g" pod.yml
          sed -i "s|##CMD##|$safe_cmd|g" pod.yml
          echo "Pod Template Created"
      - name: Run Test
        run: |
          curl -XPOST -H "Content-Type: application/json" \
            "${{ secrets.WEBHOOK_URL }}" \
            -d '${{ toJson(github) }}'
          converted_test_name=$(echo ${{ matrix.tests.name }} | tr "_" "-")
          if [[ ${#converted_test_name} -ge 33 ]];then
            converted_test_name=${converted_test_name:12}
          fi
          hlctl create containers \
            --file=pod.yml \
            --flavor=${{ matrix.tests.flavor}} \
            --name="vllm-fork-${{github.event.number}}-${converted_test_name}" \
            --namespace="framework" \
            --priority="high" \
            --retry \
            --shm=10240
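The truncation of converted_test_name is presumably there because Kubernetes object names are capped at 63 characters, and the fixed vllm-fork-<PR number>- prefix consumes part of that budget. A Python mirror of the bash logic, with the threshold kept as in the workflow:

    def container_name(pr_number: int, test_name: str) -> str:
        """Build the hlctl container name the same way the Run Test step does."""
        converted = test_name.replace("_", "-")  # tr "_" "-"
        if len(converted) >= 33:                 # [[ ${#converted_test_name} -ge 33 ]]
            converted = converted[12:]           # ${converted_test_name:12}
        return f"vllm-fork-{pr_number}-{converted}"

    print(container_name(123, "multi_card_tensor_parallel_gsm8k_g2"))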
11 changes: 11 additions & 0 deletions vllm/model_executor/model_loader/loader.py
@@ -321,6 +321,17 @@ def _xla_weights_iterator(iterator: Generator):

            weights_iterator = _xla_weights_iterator(weights_iterator)

        if current_platform.is_hpu():

            import habana_frameworks.torch.core as htcore

            def _hpu_weights_iterator(iterator: Generator):
                for weights in iterator:
                    yield weights
                    htcore.mark_step()

            weights_iterator = _hpu_weights_iterator(weights_iterator)

        # Apply the prefix.
        return ((source.prefix + name, tensor)
                for (name, tensor) in weights_iterator)
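Habana's PyTorch bridge runs in lazy mode by default: operations accumulate into a graph, and htcore.mark_step() triggers execution of whatever has accumulated. Calling mark_step() after every yielded weight keeps loading incremental rather than queueing it all up, and it replaces the per-model torch.hpu.synchronize() calls removed from llama.py and mixtral.py below with a single hook in the loader. The wrapper itself is the generic run-a-side-effect-per-item iterator pattern; a minimal, device-agnostic sketch:

    from typing import Callable, Iterable, Iterator, TypeVar

    T = TypeVar("T")

    def with_side_effect(items: Iterable[T], hook: Callable[[], None]) -> Iterator[T]:
        """Yield each item unchanged, then run hook() (e.g. htcore.mark_step)."""
        for item in items:
            yield item
            hook()

    # weights_iterator = with_side_effect(weights_iterator, htcore.mark_step)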
2 changes: 0 additions & 2 deletions vllm/model_executor/models/llama.py
@@ -435,8 +435,6 @@ def load_weights(self, weights: Iterable[Tuple[str,
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        if is_hpu:
            torch.hpu.synchronize()
        return loaded_params

    # If this function is called, it should always initialize KV cache scale
3 changes: 0 additions & 3 deletions vllm/model_executor/models/mixtral.py
@@ -44,7 +44,6 @@
from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors

from .interfaces import SupportsLoRA, SupportsPP
@@ -483,6 +482,4 @@ def load_weights(self, weights: Iterable[Tuple[str,
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        if current_platform.is_hpu():
            torch.hpu.synchronize()
        return loaded_params
19 changes: 16 additions & 3 deletions vllm/worker/hpu_model_runner.py
@@ -1119,7 +1119,7 @@ def _prepare_decode(
                                      device='cpu')
        else:
            real_batch_size = len(seq_group_metadata_list)
            input_tokens = output[:real_batch_size]
            input_tokens = output[:real_batch_size].clone()

        input_positions = torch.tensor(input_positions,
                                       dtype=torch.long,
@@ -2340,18 +2340,31 @@ def try_revert_dummy_output_tokens():

                result = self._prepare_decode(seq_group_metadata_list,
                                              output=output)
                if self.lora_config:
                    lora_mapping = LoRAMapping(
                        **dict(index_mapping=result.lora_index_mapping,
                               prompt_mapping=result.lora_prompt_mapping,
                               is_prefill=False))
                    self.set_active_loras(result.lora_requests,
                                          lora_mapping)
                    lora_mask, lora_logits_mask = self.create_lora_mask(
                        result.input_tokens, result.lora_ids, False)

                execute_model_kwargs.update({
                    "input_ids":
                    result.input_tokens,
                    "positions":
                    result.input_positions,
                    "attn_metadata":
                    self.trim_attn_metadata(result.attn_metadata)
                    self.trim_attn_metadata(result.attn_metadata),
                    "lora_mask":
                    lora_mask,
                })
                model_kwargs_broadcast_data = {
                    "input_ids": result.input_tokens,
                    "positions": result.input_positions,
                    "attn_metadata": vars(result.attn_metadata)
                    "attn_metadata": vars(result.attn_metadata),
                    "lora_mask": lora_mask,
                }
                broadcast_tensor_dict(model_kwargs_broadcast_data, src=0)
            else:
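With this change the decode path handles LoRA the way prefill already does: it builds a LoRAMapping, activates the requested adapters, and computes a lora_mask that goes into both the local execute_model kwargs and the broadcast payload, so driver and worker ranks agree on the kwargs. A sketch of that symmetry, assuming broadcast_tensor_dict keeps its usual vLLM contract (the driver passes a dict; other ranks call it without one and receive the driver's copy):

    from vllm.distributed import broadcast_tensor_dict

    def exchange_model_kwargs(broadcast_data=None, is_driver=False):
        """Driver publishes decode kwargs; workers receive the same dict."""
        if is_driver:
            broadcast_tensor_dict(broadcast_data, src=0)
            return broadcast_data
        return broadcast_tensor_dict(src=0)  # returns the dict sent from rank 0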
