# Workflow run: "Fix a regression with checkpointing optimizer state" #101
#
# Workflow file for this run:

# Display name of this workflow.
name: Main

# Runs are grouped per workflow and git ref; starting a new run in a group
# cancels the run that is still in progress for that group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
# Triggers: pull requests targeting `main`, pushes to `main`, and pushes of
# tags matching `v*.*.*`.
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
    tags:
      - 'v*.*.*'
# Workflow-level environment, visible to every job and step below.
env:
  # Change this to invalidate existing cache.
  CACHE_PREFIX: v0
  PYTHONPATH: ./src/
  # AWS credentials are read from repository secrets.
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
jobs:
  # CPU checks: one job per entry of `matrix.task`. Each entry supplies the
  # job's display name and the shell command to run inside the virtualenv.
  checks:
    name: ${{ matrix.task.name }}
    # TODO: change to 'ubuntu-latest' once repo is public (will have more RAM then), and update the torch
    # install command in the setup-venv action.
    runs-on: [macos-13]
    timeout-minutes: 5
    strategy:
      # Let the remaining tasks finish even if one of them fails.
      fail-fast: false
      matrix:
        python: ['3.10']
        task:
          - name: Lint
            run: make lint-check

          # The FSDP and checkpoint tests are excluded here because they run
          # as their own tasks below.
          - name: Test
            run: |
              pytest -v --color=yes --durations=3 src/test/ \
                --ignore-glob='src/test/distributed/fsdp*' \
                --ignore-glob='src/test/distributed/checkpoint*'

          - name: Test checkpoint
            run: |
              pytest -v --color=yes --durations=3 src/test/distributed/checkpoint*

          - name: Test FSDP
            run: |
              pytest -v --color=yes --durations=3 src/test/distributed/fsdp/

          - name: Type check
            run: make type-check

          - name: Build
            run: make build

          - name: Style
            run: make style-check

        # Additional combination: lint once more on Python 3.8.
        include:
          - python: '3.8'
            task:
              name: Lint (min Python)
              run: make lint-check

    steps:
      - uses: actions/checkout@v3

      # Repository-local composite action; receives the Python version and
      # the cache prefix.
      - name: Setup Python environment
        uses: ./.github/actions/setup-venv
        with:
          python-version: ${{ matrix.python }}
          cache-prefix: ${{ env.CACHE_PREFIX }}

      # Only the type-check task uses the mypy cache. The primary key ends in
      # the commit SHA; the restore keys fall back to the same ref, then to
      # any ref with the same pyproject.toml hash.
      - name: Restore mypy cache
        if: matrix.task.name == 'Type check'
        uses: actions/cache@v3
        with:
          path: .mypy_cache
          key: mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-${{ github.ref }}-${{ github.sha }}
          restore-keys: |
            mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-${{ github.ref }}
            mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}

      # Runs the matrix entry's command with the virtualenv activated.
      - name: ${{ matrix.task.name }}
        run: |
          . .venv/bin/activate
          ${{ matrix.task.run }}

      # Only the Build task uploads the contents of `dist`.
      - name: Upload package distribution files
        if: matrix.task.name == 'Build'
        uses: actions/upload-artifact@v3
        with:
          name: package
          path: dist

      # Runs even when an earlier step failed.
      - name: Clean up
        if: always()
        run: |
          . .venv/bin/activate
          pip uninstall -y ai2-olmo-core

  # GPU checks: one job per entry of `matrix.task`. Each job passes a Beaker
  # spec to the Beaker run action; the spec clones the repository, checks out
  # the commit under test, installs the package and runs the task's command.
  gpu_checks:
    name: ${{ matrix.task.name }}
    runs-on: ubuntu-latest
    timeout-minutes: 8
    env:
      BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
      BEAKER_IMAGE: olmo-torch2-test
      BEAKER_WORKSPACE: ai2/llm-testing
    strategy:
      fail-fast: false
      matrix:
        task:
          - name: Test (GPU)
            run: pytest -v --color=yes --durations=3 -m gpu src/test/ --ignore-glob='src/test/distributed/fsdp*' --ignore-glob='src/test/distributed/checkpoint*'

          - name: Test checkpoint (GPU)
            run: pytest -v --color=yes --durations=3 -m gpu src/test/distributed/checkpoint*

          - name: Test FSDP (GPU)
            run: pytest -v --color=yes --durations=3 -m gpu src/test/distributed/fsdp/

    steps:
      # Exactly one of the next two steps runs; either way COMMIT_SHA is
      # exported to the later steps through the GITHUB_ENV file.
      - name: Determine current commit SHA (pull request)
        if: github.event_name == 'pull_request'
        run: |
          echo "COMMIT_SHA=${{ github.event.pull_request.head.sha }}" >> "$GITHUB_ENV"

      - name: Determine current commit SHA (push)
        if: github.event_name != 'pull_request'
        run: |
          echo "COMMIT_SHA=$GITHUB_SHA" >> "$GITHUB_ENV"

      # Skipped when the BEAKER_TOKEN secret is empty.
      # NOTE(review): the action reference was garbled to "[email protected]"
      # in the source; restored as beaker-run-action@v1.2 - confirm the pinned version.
      - name: GPU Tests
        uses: allenai/beaker-run-action@v1.2
        if: env.BEAKER_TOKEN != ''
        with:
          spec: |
            version: v2
            description: OLMo-core ${{ matrix.task.name }}
            budget: ai2/oe-training
            tasks:
              - name: tests
                image:
                  beaker: ${{ env.BEAKER_IMAGE }}
                context:
                  priority: low
                  preemptible: true
                resources:
                  gpuCount: 2
                constraints:
                  cluster:
                    - ai2/general-cirrascale
                    - ai2/general-cirrascale-a100-80g-ib
                    - ai2/allennlp-cirrascale
                    - ai2/allennlp-elanding-a100-40g
                    - ai2/pluto-cirrascale
                    - ai2/jupiter-cirrascale
                envVars:
                  - name: CUBLAS_WORKSPACE_CONFIG
                    value: ":16:8"
                  - name: TOKENIZERS_PARALLELISM
                    value: "false"
                  - name: AWS_ACCESS_KEY_ID
                    secret: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    secret: AWS_SECRET_ACCESS_KEY
                command:
                  - "bash"
                  - "-c"
                  - "git clone https://github.com/allenai/OLMo-core.git && cd OLMo-core && git checkout ${{ env.COMMIT_SHA }} && pip install -e .[all] && ${{ matrix.task.run }}"
                result:
                  path: /unused
          token: ${{ env.BEAKER_TOKEN }}
          workspace: ${{ env.BEAKER_WORKSPACE }}