From eb252f689c79f1998ea8f4f2e5dc1d45fb7b8b56 Mon Sep 17 00:00:00 2001 From: Shane A Date: Thu, 11 Jul 2024 11:17:23 -0700 Subject: [PATCH 1/6] Fix hang in dependency installation, install only train dependencies --- docker/Dockerfile.lumi | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.lumi b/docker/Dockerfile.lumi index 2a330f146..7097a8610 100644 --- a/docker/Dockerfile.lumi +++ b/docker/Dockerfile.lumi @@ -95,8 +95,10 @@ RUN cd /opt && \ # Install more dependencies COPY pyproject.toml . -RUN mkdir olmo && touch olmo/__init__.py && \ - pip install --no-cache-dir .[all] && \ +RUN mkdir olmo && \ + touch olmo/__init__.py && \ + echo 'VERSION = "0.1.0"' > olmo/version.py && \ + pip install --no-cache-dir .[train] && \ pip uninstall -y ai2-olmo && \ rm -rf olmo/ From 202015f30f5e4200e875c2cba7c456f59b40f9db Mon Sep 17 00:00:00 2001 From: Shane A Date: Thu, 11 Jul 2024 13:02:21 -0700 Subject: [PATCH 2/6] Update rocm and torch versions --- docker/Dockerfile.lumi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.lumi b/docker/Dockerfile.lumi index 7097a8610..b759c0e28 100644 --- a/docker/Dockerfile.lumi +++ b/docker/Dockerfile.lumi @@ -47,8 +47,8 @@ RUN apt-get install google-cloud-cli # Install ROCm RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \ wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \ - echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/5.6/ubuntu jammy main" >> /etc/apt/sources.list.d/amdgpu.list && \ - echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.6 jammy main" >> /etc/apt/sources.list.d/rocm.list && \ + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.0.2/ubuntu jammy main" >> /etc/apt/sources.list.d/amdgpu.list && \ + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.0.2 jammy main" >> /etc/apt/sources.list.d/rocm.list && \ echo 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' > /etc/apt/preferences.d/rocm-pin-600 && \ apt-get update && \ apt-get install -y rccl rccl-dev rocm-libs rocm-gdb rocm-dev rocm-developer-tools rocm-hip-runtime-dev rocm-utils rocm-hip-sdk && \ @@ -83,7 +83,7 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH # Install torch RUN pip install --upgrade pip -RUN pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/rocm5.6 +RUN pip install --no-cache-dir torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/rocm6.0 # Install DeepSpeed RUN pip install --no-cache-dir mpi4py From 2174f9701a49a2ddb5f2b982771fbac79a07361f Mon Sep 17 00:00:00 2001 From: Shane A Date: Thu, 11 Jul 2024 13:03:21 -0700 Subject: [PATCH 3/6] Update awscli install --- docker/Dockerfile.lumi | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.lumi b/docker/Dockerfile.lumi index b759c0e28..fe2b1e475 100644 --- a/docker/Dockerfile.lumi +++ b/docker/Dockerfile.lumi @@ -21,9 +21,9 @@ RUN apt-get install -y \ vim \ fish \ wget \ + unzip \ parallel \ s3cmd \ - awscli \ htop \ wget \ fish \ @@ -38,6 +38,11 @@ RUN apt-get install -y \ gdb \ apt-utils +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" +RUN unzip awscliv2.zip +RUN ./aws/install +RUN rm -r ./aws/ + # Install Google tools RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - From 60ef2182d61f8869392804e0a749cf7eada4c26d Mon Sep 17 00:00:00 2001 From: Shane A Date: Thu, 11 Jul 2024 13:04:03 -0700 Subject: [PATCH 4/6] Turn off flash attention --- docker/Dockerfile.lumi | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile.lumi b/docker/Dockerfile.lumi index fe2b1e475..160ff8849 100644 --- a/docker/Dockerfile.lumi +++ b/docker/Dockerfile.lumi @@ -110,14 +110,14 @@ RUN mkdir olmo && \ RUN pip install --no-cache-dir py-spy RUN pip install --no-cache-dir wandb --upgrade +# # Install flash attention (for MI200 series!) +# RUN cd /opt && \ +# git clone --recursive https://github.com/ROCm/flash-attention.git && \ +# cd flash-attention && \ +# GPU_ARCHS="gfx90a" pip install . + # Cleanup RUN apt-get autoremove RUN rm -rf /opt/mpich-3.1.4 /opt/aws-ofi-rccl /opt/DeepSpeed RUN apt-get clean -RUN pip cache purge - -# Install flash attention (for MI200 series!) -RUN cd /opt && \ - git clone --recursive https://github.com/ROCm/flash-attention.git && \ - cd flash-attention && \ - GPU_ARCHS="gfx90a" pip install . +RUN pip cache purge \ No newline at end of file From 8ddb3a8eda7832b3faeb90e09666eecf272994db Mon Sep 17 00:00:00 2001 From: Shane A Date: Thu, 11 Jul 2024 13:05:04 -0700 Subject: [PATCH 5/6] Lock Ubuntu to jammy --- docker/Dockerfile.lumi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.lumi b/docker/Dockerfile.lumi index 160ff8849..7950e9539 100644 --- a/docker/Dockerfile.lumi +++ b/docker/Dockerfile.lumi @@ -1,4 +1,4 @@ -FROM ubuntu:latest +FROM ubuntu:jammy ENV DEBIAN_FRONTEND=noninteractive ENV LC_ALL=C.UTF-8 @@ -47,7 +47,7 @@ RUN rm -r ./aws/ RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - RUN apt-get update -RUN apt-get install google-cloud-cli +RUN apt-get install -y google-cloud-cli # Install ROCm RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \ From e0efc20dd38810fe8f01bc8e9cc7bf682a0576bb Mon Sep 17 00:00:00 2001 From: Shane A Date: Thu, 11 Jul 2024 13:16:09 -0700 Subject: [PATCH 6/6] Update to torch 2.3.1 --- docker/Dockerfile.lumi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.lumi b/docker/Dockerfile.lumi index 7950e9539..5c99998ff 100644 --- a/docker/Dockerfile.lumi +++ b/docker/Dockerfile.lumi @@ -88,7 +88,7 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH # Install torch RUN pip install --upgrade pip -RUN pip install --no-cache-dir torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/rocm6.0 +RUN pip install --no-cache-dir torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/rocm6.0 # Install DeepSpeed RUN pip install --no-cache-dir mpi4py