Skip to content

Commit

Permalink
ci: Upload phase 1 & phase 2 training logs for loss graphs
Browse files Browse the repository at this point in the history
Signed-off-by: Ali Maredia <[email protected]>
  • Loading branch information
alimaredia committed Nov 25, 2024
1 parent 84c0f72 commit f261630
Showing 1 changed file with 58 additions and 15 deletions.
73 changes: 58 additions & 15 deletions .github/workflows/e2e-nvidia-l40s-x4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -188,14 +188,26 @@ jobs:
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
# and we know that it will be written into a directory created by `mktemp -d`.
# Given this information, we can use the following command to find the file:
log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
mv "${log_file}" training-log.jsonl
log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
phase_num=1;
for log_file in $log_files; do
mv "${log_file}" phase-${phase_num}-training-log.jsonl
((phase_num++))
done
- name: Upload training logs Phase 1
uses: actions/upload-artifact@v4
with:
name: phase-2-training-log.jsonl
path: ./instructlab/phase-1-training-log.jsonl
retention-days: 1
overwrite: true

- name: Upload training logs
- name: Upload training logs Phase 2
uses: actions/upload-artifact@v4
with:
name: training-log.jsonl
path: ./instructlab/training-log.jsonl
name: phase-2-training-log.jsonl
path: ./instructlab/phase-2-training-log.jsonl
retention-days: 1
overwrite: true

Expand Down Expand Up @@ -269,24 +281,31 @@ jobs:
label: ${{ needs.start-large-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

- name: Download loss data
id: download-logs
- name: Download loss data Phase 1
id: phase-1-download-logs
uses: actions/download-artifact@v4
with:
name: training-log.jsonl
name: phase-1-training-log.jsonl
path: downloaded-data

- name: Download loss data Phase 2
id: phase-2-download-logs
uses: actions/download-artifact@v4
with:
name: phase-2-training-log.jsonl
path: downloaded-data

- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Try to upload to s3
id: upload-s3
- name: Try to upload Phase 1 to s3
id: phase-1-upload-s3
continue-on-error: true
run: |
output_file='./test.md'
output_file='./phase-1-test.md'
python scripts/create-loss-graph.py \
--log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
--output-file "${output_file}" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
Expand All @@ -295,10 +314,34 @@ jobs:
--head-sha "${{ github.event.pull_request.head.sha }}" \
--origin-repository "${{ github.repository }}"
- name: Check S3 upload status
if: steps.upload-s3.outcome == 'failure'
- name: Try to upload Phase 2 to s3
id: phase-2-upload-s3
continue-on-error: true
run: |
output_file='./phase-2-test.md'
python scripts/create-loss-graph.py \
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
--output-file "${output_file}" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${{ github.event.pull_request.base.ref }}" \
--pr-number "${{ github.event.pull_request.number }}" \
--head-sha "${{ github.event.pull_request.head.sha }}" \
--origin-repository "${{ github.repository }}"
- name: Check Phase 1 S3 upload status
if: steps.phase-1-upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"

- name: Check Phase 2 S3 upload status
if: steps.phase-2-upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"

0 comments on commit f261630

Please sign in to comment.