# Fix a regression with checkpointing optimizer state #100
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Main

# One active run per workflow + ref: a newer run on the same ref cancels the
# one that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Triggers: pull requests targeting main, pushes to main, and pushes of
# tags matching 'v*.*.*'.
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
    tags:
      - 'v*.*.*'
# Workflow-level environment, visible to every job below.
env:
  # Change this to invalidate existing cache.
  CACHE_PREFIX: v0
  # Lets tools run from the repository root import from ./src/.
  PYTHONPATH: ./src/
  # AWS credentials are read from repository secrets; they are empty when
  # the secrets are not available to the run.
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
jobs:
  # CPU-side checks. Each matrix entry supplies a display name and the shell
  # command to run inside the project's virtual environment.
  checks:
    name: ${{ matrix.task.name }}
    # TODO: change to 'ubuntu-latest' once repo is public (will have more RAM then), and update the torch
    # install command in the setup-venv action.
    runs-on: [macos-13]
    timeout-minutes: 5
    strategy:
      # Let every task report its own result instead of being cancelled when
      # a sibling task fails.
      fail-fast: false
      matrix:
        # Quoted so that 3.10 is not parsed as the float 3.1.
        python: ['3.10']
        task:
          - name: Lint
            run: make lint-check

          # General test suite; the FSDP and checkpoint tests are excluded
          # here because they run as their own matrix entries below.
          - name: Test
            run: |
              pytest -v --color=yes --durations=3 src/test/ \
                --ignore-glob='src/test/distributed/fsdp*' \
                --ignore-glob='src/test/distributed/checkpoint*'

          - name: Test checkpoint
            run: |
              pytest -v --color=yes --durations=3 src/test/distributed/checkpoint*

          - name: Test FSDP
            run: |
              pytest -v --color=yes --durations=3 src/test/distributed/fsdp/

          - name: Type check
            run: make type-check

          - name: Build
            run: make build

          - name: Style
            run: make style-check

        # Extra entry: run the linter on the minimum Python version as well.
        include:
          - python: '3.8'
            task:
              name: Lint (min Python)
              run: make lint-check

    steps:
      - uses: actions/checkout@v4

      # Local composite action; the later steps activate .venv, so it is
      # presumably what creates that environment (verify in the action).
      - name: Setup Python environment
        uses: ./.github/actions/setup-venv
        with:
          python-version: ${{ matrix.python }}
          cache-prefix: ${{ env.CACHE_PREFIX }}

      # Only the type-check task benefits from the mypy cache. The key is
      # unique per commit; restore-keys fall back to the same ref, then to
      # any ref with the same pyproject.toml hash.
      - name: Restore mypy cache
        if: matrix.task.name == 'Type check'
        uses: actions/cache@v4
        with:
          path: .mypy_cache
          key: mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-${{ github.ref }}-${{ github.sha }}
          restore-keys: |
            mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-${{ github.ref }}
            mypy-${{ env.CACHE_PREFIX }}-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}

      # Run the matrix entry's command inside the virtual environment.
      - name: ${{ matrix.task.name }}
        run: |
          . .venv/bin/activate
          ${{ matrix.task.run }}

      # v4 is required: GitHub no longer accepts the deprecated v3 artifact
      # actions. Only the single 'Build' entry uploads, so the artifact name
      # is unique within the run.
      - name: Upload package distribution files
        if: matrix.task.name == 'Build'
        uses: actions/upload-artifact@v4
        with:
          name: package
          path: dist

      # Runs even when earlier steps failed, so the package is always
      # uninstalled from the virtual environment before the job ends.
      - name: Clean up
        if: always()
        run: |
          . .venv/bin/activate
          pip uninstall -y ai2-olmo-core

  # GPU tests. The runner itself only submits a Beaker experiment; the tests
  # execute remotely inside the Beaker image.
  gpu_checks:
    name: ${{ matrix.task.name }}
    runs-on: ubuntu-latest
    timeout-minutes: 8
    env:
      BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
      BEAKER_IMAGE: olmo-torch2-test
      BEAKER_WORKSPACE: ai2/llm-testing
    strategy:
      fail-fast: false
      matrix:
        # Each command is spliced into a double-quoted string in the Beaker
        # spec below, so it must stay on one line and contain no double quotes.
        task:
          - name: Test (GPU)
            run: pytest -v --color=yes --durations=3 -m gpu src/test/ --ignore-glob='src/test/distributed/fsdp*' --ignore-glob='src/test/distributed/checkpoint*'

          - name: Test checkpoint (GPU)
            run: pytest -v --color=yes --durations=3 -m gpu src/test/distributed/checkpoint*

          - name: Test FSDP (GPU)
            run: pytest -v --color=yes --durations=3 -m gpu src/test/distributed/fsdp/

    steps:
      # Exactly one of the next two steps runs and exports COMMIT_SHA: the
      # PR head commit for pull_request events, GITHUB_SHA otherwise.
      - name: Determine current commit SHA (pull request)
        if: github.event_name == 'pull_request'
        run: |
          echo "COMMIT_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV

      - name: Determine current commit SHA (push)
        if: github.event_name != 'pull_request'
        run: |
          echo "COMMIT_SHA=$GITHUB_SHA" >> $GITHUB_ENV

      # Skipped when no Beaker token is available to the run.
      # NOTE(review): the action reference was obscured in the reviewed copy
      # of this file; 'beaker-run-action@v1.2' is assumed -- confirm the tag.
      - name: GPU Tests
        uses: allenai/beaker-run-action@v1.2
        if: env.BEAKER_TOKEN != ''
        with:
          # Beaker experiment spec, passed to the action as a literal string.
          spec: |
            version: v2
            description: OLMo-core ${{ matrix.task.name }}
            budget: ai2/oe-training
            tasks:
              - name: tests
                image:
                  beaker: ${{ env.BEAKER_IMAGE }}
                context:
                  priority: low
                  preemptible: true
                resources:
                  gpuCount: 2
                constraints:
                  cluster:
                    - ai2/general-cirrascale
                    - ai2/general-cirrascale-a100-80g-ib
                    - ai2/allennlp-cirrascale
                    - ai2/allennlp-elanding-a100-40g
                    - ai2/pluto-cirrascale
                    - ai2/jupiter-cirrascale
                envVars:
                  - name: CUBLAS_WORKSPACE_CONFIG
                    value: ":16:8"
                  - name: TOKENIZERS_PARALLELISM
                    value: "false"
                  - name: AWS_ACCESS_KEY_ID
                    secret: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    secret: AWS_SECRET_ACCESS_KEY
                command:
                  - "bash"
                  - "-c"
                  - "git clone https://github.com/allenai/OLMo-core.git && cd OLMo-core && git checkout ${{ env.COMMIT_SHA }} && pip install -e '.[all]' && ${{ matrix.task.run }}"
                result:
                  path: /unused
          token: ${{ env.BEAKER_TOKEN }}
          workspace: ${{ env.BEAKER_WORKSPACE }}