Commit
Showing 44 changed files with 2,171 additions and 131 deletions.
@@ -0,0 +1,5 @@
[html]
directory = coverage

[run]
data_file = .coverage_$LOCAL_RANK
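The data_file = .coverage_$LOCAL_RANK setting relies on coverage.py's environment-variable expansion so that each torchrun worker writes its own data file instead of overwriting a shared .coverage. As a minimal sketch (not part of this commit), the per-rank files could later be merged into the HTML directory configured above with the coverage.py API:

import glob
from coverage import Coverage

# Sketch only: merge the per-rank data files (.coverage_0, .coverage_1, ...) written by
# the torchrun workers, then emit the HTML report into the directory set in [html].
cov = Coverage()                        # reads the [run]/[html] sections of the config file
cov.combine(glob.glob(".coverage_*"))   # explicitly pass the per-rank files to merge
cov.html_report(directory="coverage")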
@@ -0,0 +1,33 @@
name: approve

on:
  pull_request:
    branches: [ "main" ]
    types: [opened, synchronize, reopened]

jobs:
  approve:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Code
        uses: actions/checkout@v2

      - name: Get PR Approvals
        run: |
          PR_NUMBER=$(jq --raw-output .number "$GITHUB_EVENT_PATH")
          APPROVERS=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
            "https://api.github.com/repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
            | jq -r '.[] | select(.state == "APPROVED") | .user.login')
          echo "APPROVERS=$APPROVERS" >> $GITHUB_ENV

      - name: Check for Specific Approver
        run: |
          SPECIFIC_APPROVER="aoyulong"
          if echo "$APPROVERS" | grep -q "$SPECIFIC_APPROVER"; then
            echo "Specific approver has approved the PR."
          else
            echo "The PR has not been approved by the specific approver."
            exit 1
          fi
@@ -1,48 +1,54 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 
-name: flagscale-test
+name: test
 
 on:
   push:
-    branches: [ "main", "add_CICD" ]
+    branches: [ "main" ]
   pull_request:
     branches: [ "main" ]
 
 jobs:
-  container-test-job:
+  test:
     runs-on: self-hosted
     container:
-      image: localhost:5000/flagscale_cicd:v1.1
-      env:
-        NODE_ENV: development
+      image: localhost:5000/flagscale_cicd:v1.3
       ports:
         - 80
-      options: --gpus all --hostname flagscale_cicd
+      volumes:
+        - /home/flagscale_cicd/flask/static:/workspace/report
+      options: --gpus all --hostname flagscale_cicd
     steps:
-      - name: checkout-code
+      - name: Checkout Code
         uses: actions/checkout@v2
 
-      - name: unit_test-megatron
+      - name: Megatron Unit Test
         run: |
-          export PYTHONPATH=./megatron:$PYTHONPATH
-          export PYTHONPATH=./../../FlagScale/:$PYTHONPATH
           cd megatron
-          # torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/fusions
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/pipeline_parallel
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/*.py
-      - name: unit_test-flagscale
+          export PYTHONPATH=..:$PYTHONPATH
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/data
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/dist_checkpointing
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/fusions
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/models
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/pipeline_parallel
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/tensor_parallel
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/transformer
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/*.py
+      - name: Megatron Unit Test Coverage Online Report
+        run: |
+          echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-megatron/index.html"
+      - name: Flagscale Unit Test
         run: |
           export PYTHONPATH=./flagscale:$PYTHONPATH
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/launcher
-      - name: functional_test-flagscale
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-flagscale --cov=flagscale -q -x tests/unit_tests/launcher
+      - name: Flagscale Unit Test Coverage Online Report
+        run: |
+          echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-flagscale/index.html"
+      - name: Flagscale Functional Test
         run: |
           python run.py --config-path tests/functional_tests/aquila/conf --config-name config action=test
           pytest -s tests/functional_tests/test_result.py --test_reaults_path=./tests/functional_tests/aquila/test_result
Empty file.
@@ -0,0 +1,40 @@
FROM nvcr.io/nvidia/pytorch:24.05-py3

ENV DEBIAN_FRONTEND noninteractive
ENV TZ=Asia/Shanghai

##############################################################################
# Change apt source to Ksyun
##############################################################################
RUN sed -i "s#\S\+#http://apt.ksyun.cn/ubuntu/#2" /etc/apt/sources.list && \
    > /etc/apt/apt.conf.d/docker-clean && \
    > /etc/dpkg/dpkg.cfg.d/pkg-config-hook-config

##############################################################################
# Install basic utilities
##############################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl wget vim tmux less unzip \
        htop iftop iotop ca-certificates openssh-client openssh-server \
        rsync iputils-ping net-tools \
        tzdata psmisc screen && \
    apt-get clean

##############################################################################
# SSH configuration (not secure, only for development purpose)
##############################################################################
RUN mkdir -p /run/sshd && \
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    echo "StrictHostKeyChecking no\n" >> /etc/ssh/ssh_config

##############################################################################
# Install Miniconda
##############################################################################
RUN mkdir -p ~/miniconda3 && \
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh && \
    bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 || { echo 'Miniconda installation failed' ; exit 1; } && \
    rm -rf ~/miniconda3/miniconda.sh && \
    ~/miniconda3/bin/conda init bash || { echo 'conda init failed' ; exit 1; } && \
    ~/miniconda3/bin/conda config --set auto_activate_base false || { echo 'conda config failed' ; exit 1; }
@@ -0,0 +1,43 @@
defaults:
  - train: demo
  - _self_

experiment:
  exp_name: aquila2
  exp_dir: ./outputs
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 1
    nproc_per_node: 8
  envs:
    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
    CUDA_DEVICE_MAX_CONNECTIONS: 1
  auto_tuner:
    space:
      data_parallel_size: "auto"
      use_distributed_optimizer: [true, false]
      tensor_model_parallel_size: [2, 4, 8]
      sequence_parallel: [true]
      pipeline_model_parallel_size: "auto"
      num_layers_per_virtual_pipeline_stage: [1]
      context_parallel_size: "auto"
      expert_model_parallel_size: [1]
      micro_batch_size: "auto"
      use_recompute: [true]
      recompute_method: "auto"
      recompute_granularity: "auto"
      recompute_num_layers: "auto"
    control:
      max_time_per_task: 300
      train_iters: 5
      max_time: 600

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
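For orientation, the space block above is the grid the auto tuner searches; dimensions set to "auto" are filled in from the cluster size (nnodes x nproc_per_node = 8 GPUs in this config). The following is an illustrative sketch, not FlagScale's actual tuner logic, of how such a space can be expanded into candidate strategies and pruned to layouts that tile the available GPUs; the function name enumerate_strategies and the concrete "auto" expansions are assumptions.

import itertools

WORLD_SIZE = 8  # nnodes * nproc_per_node from the runner section above

# Hypothetical expansion of the "auto" dimensions; the real tuner derives these itself.
space = {
    "tensor_model_parallel_size": [2, 4, 8],
    "pipeline_model_parallel_size": [1, 2, 4],   # "auto"
    "micro_batch_size": [1, 2, 4],               # "auto"
    "use_distributed_optimizer": [True, False],
}

def enumerate_strategies(space, world_size=WORLD_SIZE):
    keys = list(space)
    for values in itertools.product(*(space[k] for k in keys)):
        strategy = dict(zip(keys, values))
        tp = strategy["tensor_model_parallel_size"]
        pp = strategy["pipeline_model_parallel_size"]
        if world_size % (tp * pp):
            continue  # skip layouts that do not evenly tile the available GPUs
        strategy["data_parallel_size"] = world_size // (tp * pp)
        yield strategy

print(sum(1 for _ in enumerate_strategies(space)))  # number of candidate strategies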
@@ -0,0 +1 @@
from .tuner import AutoTuner
@@ -0,0 +1,97 @@
import os
import copy


class Generator:

    def __init__(self, config):
        self.config = config
        # TODO: Just a temporary solution; needs to be configured by the user
        if "args_mapping" in config.experiment.auto_tuner:
            self.args_mapping = config.experiment.auto_tuner.args_mapping
        else:
            self.args_mapping = {
                "data_parallel_size": "data_parallel_size",
                "use_distributed_optimizer": "use_distributed_optimizer",
                "tensor_model_parallel_size": "tensor_model_parallel_size",
                "sequence_parallel": "sequence_parallel",
                "pipeline_model_parallel_size": "pipeline_model_parallel_size",
                "num_layers_per_virtual_pipeline_stage":
                    "num_layers_per_virtual_pipeline_stage",
                "recompute_method": "recompute_method",
                "recompute_granularity": "recompute_granularity",
                "recompute_num_layers": "recompute_num_layers",
                "micro_batch_size": "micro_batch_size",
                "context_parallel_size": "context_parallel_size",
                "expert_model_parallel_size": "expert_model_parallel_size",
            }

    def _set_value(self, strategy, config):
        for key, value in self.args_mapping.items():
            if key in ["micro_batch_size"]:
                config.train.model[value] = strategy[key]
            elif key in ["data_parallel_size"]:
                continue
            else:
                if strategy[key] is None:
                    if value in config.train.system:
                        del config.train.system[value]
                    continue
                config.train.system[value] = strategy[key]

    def gen(self, strategy):
        config = copy.deepcopy(self.config)
        self._set_value(strategy, config)

        # Logging interval should be 1
        config.train.system.logging.log_interval = 1

        # Set redirect and tee
        config.experiment.runner.tee = 3
        config.experiment.runner.redirects = 3

        # auto_tune should be true: it skips checkpoint saving when training ends and reports memory every iteration
        config.train.system.auto_tune = True

        # Del lr_warmup_samples and train_samples to run megatron.
        assert "optimizer" in config.train.model
        assert "lr_scheduler" in config.train.model.optimizer
        if "lr_warmup_samples" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_warmup_samples
        # Del lr_decay_samples to run megatron.
        if "lr_decay_samples" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_decay_samples
        # Del rampup_batch_size to run megatron.
        if "rampup_batch_size" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.rampup_batch_size
        # Del lr_warmup_fraction to run megatron.
        if "lr_warmup_fraction" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_warmup_fraction

        if "train_samples" in config.train.model:
            del config.train.model.train_samples

        # Del checkpoint load
        if "checkpoint" in config.train.system:
            if "load" in config.train.system.checkpoint:
                del config.train.system.checkpoint.load
            if "save_interval" in config.train.system.checkpoint:
                config.train.system.checkpoint.save_interval = 2000

        # Set train_iters of each task
        if "control" in config.experiment.auto_tuner:
            config.train.model.train_iters = config.experiment.auto_tuner.control.get(
                "train_iters", 5)
        else:
            config.train.model.train_iters = 5

        # log dir
        config.experiment.exp_dir = os.path.join(config.experiment.exp_dir,
                                                 "auto_tuner",
                                                 f"task_{strategy['idx']}")

        return config

    def gen_best_task(self, strategy, config):
        self._set_value(strategy, config)
        return config
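A minimal usage sketch of Generator under assumed inputs: the OmegaConf config below only mocks the sections the class touches (experiment, train.model, train.system), the import path is hypothetical, and the strategy values are placeholders rather than anything produced by the tuner.

from omegaconf import OmegaConf
from generator import Generator  # hypothetical import path for the class above

# Toy config with just the fields Generator reads or rewrites.
config = OmegaConf.create({
    "experiment": {
        "exp_dir": "./outputs",
        "runner": {},
        "auto_tuner": {"control": {"train_iters": 5}},
    },
    "train": {
        "model": {
            "micro_batch_size": 1,
            "optimizer": {"lr_scheduler": {"lr_warmup_samples": 100}},
        },
        "system": {
            "logging": {"log_interval": 10},
            "checkpoint": {"load": "./ckpt", "save_interval": 500},
        },
    },
})

# One candidate strategy; every key of the default args_mapping must be present.
strategy = {
    "idx": 0,
    "data_parallel_size": 1,
    "use_distributed_optimizer": True,
    "tensor_model_parallel_size": 4,
    "sequence_parallel": True,
    "pipeline_model_parallel_size": 2,
    "num_layers_per_virtual_pipeline_stage": None,
    "recompute_method": "uniform",
    "recompute_granularity": "full",
    "recompute_num_layers": 1,
    "micro_batch_size": 2,
    "context_parallel_size": 1,
    "expert_model_parallel_size": 1,
}

task_config = Generator(config).gen(strategy)
print(task_config.experiment.exp_dir)       # ./outputs/auto_tuner/task_0
print(task_config.train.system.checkpoint)  # load removed, save_interval forced to 2000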
@@ -0,0 +1 @@
from .pruner import Pruner