instructlab · alimaredia · Nov 25, 2024 · RobotSail · Nov 26, 2024 · RobotSail
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -188,14 +188,26 @@ jobs:
           # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
           # and we know that it will be written into a directory created by `mktemp -d`. 
           # Given this information, we can use the following command to find the file:
-          log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
-          mv "${log_file}" training-log.jsonl
+          log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' | sort -n | cut -d' ' -f2-)
+          phase_num=1;  # results above are sorted oldest-first by mtime, so numbering follows the order the phases finished writing
+          for log_file in $log_files; do  # intentional word splitting: one path per line, mktemp paths are not expected to contain spaces
+              mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
+              ((phase_num++))
+          done
+
+      - name: Upload training logs Phase 1
+        uses: actions/upload-artifact@v4
+        with:
+          name: phase-1-training-log.jsonl  # must be distinct from the Phase 2 artifact and equal to the name the Phase 1 download step requests
+          path: ./instructlab/phase-1-training-log.jsonl
+          retention-days: 1
+          overwrite: true
 
-      - name: Upload training logs
+      - name: Upload training logs Phase 2
         uses: actions/upload-artifact@v4
         with:
-          name: training-log.jsonl
-          path: ./instructlab/training-log.jsonl
+          name: phase-2-training-log.jsonl  # same artifact name the "Download loss data Phase 2" step requests
+          path: ./instructlab/phase-2-training-log.jsonl  # presumably the second file renamed by the preceding loop; confirm that step's working directory
           retention-days: 1
           overwrite: true
 
@@ -269,24 +281,31 @@ jobs:
           label: ${{ needs.start-large-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
 
-      - name: Download loss data
-        id: download-logs
+      - name: Download loss data Phase 1
+        id: phase-1-download-logs  # later read as steps.phase-1-download-logs.outputs.download-path
         uses: actions/download-artifact@v4
         with:
-          name: training-log.jsonl
+          name: phase-1-training-log.jsonl  # must equal the artifact name published by the Phase 1 upload step
+          path: downloaded-data  # both phases download into this same directory; the file names differ per phase
+
+      - name: Download loss data Phase 2
+        id: phase-2-download-logs  # later read as steps.phase-2-download-logs.outputs.download-path
+        uses: actions/download-artifact@v4
+        with:
+          name: phase-2-training-log.jsonl  # must equal the artifact name published by the Phase 2 upload step
           path: downloaded-data
 
       - name: Install dependencies
         run: |
           pip install -r requirements-dev.txt
 
-      - name: Try to upload to s3
-        id: upload-s3
+      - name: Try to upload Phase 1 to s3
+        id: phase-1-upload-s3  # the "Check Phase 1 S3 upload status" step inspects this step's outcome
         continue-on-error: true
         run: |
-          output_file='./test.md' 
+          output_file='./phase-1-test.md'   # per-phase markdown report written by create-loss-graph.py via --output-file
           python scripts/create-loss-graph.py  \
-            --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
+            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
             --output-file "${output_file}" \
             --aws-region "${{ vars.AWS_REGION }}" \
             --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
@@ -295,10 +314,34 @@ jobs:
             --head-sha "${{ github.event.pull_request.head.sha }}" \
             --origin-repository "${{ github.repository }}"
 
-
+
+          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"  # append the generated report to the job summary exactly once
+
-      - name: Check S3 upload status
-        if: steps.upload-s3.outcome == 'failure'
+      - name: Try to upload Phase 2 to s3
+        id: phase-2-upload-s3  # the "Check Phase 2 S3 upload status" step inspects this step's outcome
+        continue-on-error: true
+        run: |
+          output_file='./phase-2-test.md'  # per-phase markdown report written by create-loss-graph.py via --output-file
+          python scripts/create-loss-graph.py  \
+            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+            --output-file "${output_file}" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${{ github.event.pull_request.base.ref }}" \
+            --pr-number "${{ github.event.pull_request.number }}" \
+            --head-sha "${{ github.event.pull_request.head.sha }}" \
+            --origin-repository "${{ github.repository }}"
+
-
+          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"  # append the generated report to the job summary exactly once
+
+      - name: Check Phase 1 S3 upload status
+        if: steps.phase-1-upload-s3.outcome == 'failure'  # the upload step sets continue-on-error, so its outcome is checked explicitly
+        run: |
+          echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"  # output_file is not defined in this step's shell, so only the fixed message is written
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 2 S3 upload status
+        if: steps.phase-2-upload-s3.outcome == 'failure'  # the upload step sets continue-on-error, so its outcome is checked explicitly
         run: |
           echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
           echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
 
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"