Skip to content

Commit

Permalink
feat: Add GPU support (#849)
Browse files Browse the repository at this point in the history
  • Loading branch information
andrei-stoian-zama authored Sep 2, 2024
1 parent ee58a68 commit 945aead
Show file tree
Hide file tree
Showing 26 changed files with 1,694 additions and 2,369 deletions.
174 changes: 174 additions & 0 deletions .github/workflows/refresh-notebooks-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# Manually-triggered workflow: re-executes the GPU notebooks on an EC2 GPU
# instance and opens a PR with the refreshed outputs.
name: Refresh Notebooks GPU

# Manual trigger only — GPU instances are costly, so no schedule/push runs.
on:
  workflow_dispatch:

concurrency:
  # One run per ref/event/workflow triple; never cancel an in-flight refresh.
  group: "${{ github.ref }}-${{ github.event_name }}-${{ github.workflow }}"
  cancel-in-progress: false

env:
  # Direct link to this workflow run, used in the final Slack report.
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
  # Provision a self-hosted GPU runner (p3.2xlarge) on AWS EC2.
  start-runner-linux:
    name: Start EC2 runner
    runs-on: ubuntu-20.04
    outputs:
      label-38: ${{ steps.start-ec2-runner-38.outputs.label }}
      # Default to '' so downstream `if:` checks see a falsy value when the
      # start step never produced an instance id.
      ec2-instance-id-38: ${{ steps.start-ec2-runner-38.outputs.ec2-instance-id || '' }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Start EC2 runner python 38
        id: start-ec2-runner-38
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e
        with:
          mode: start
          github-token: ${{ secrets.EC2_RUNNER_BOT_TOKEN }}
          ec2-image-id: ${{ secrets.AWS_EC2_AMI }}
          # p3.2xlarge: GPU instance type for notebook execution.
          ec2-instance-type: "p3.2xlarge"
          subnet-id: ${{ secrets.AWS_EC2_SUBNET_ID }}
          security-group-id: ${{ secrets.AWS_EC2_SECURITY_GROUP_ID }}

refresh-notebooks:
needs: [start-runner-linux]

runs-on: ${{ needs.start-runner-linux.outputs.label-38 }}
# Run in a clean container
container:
image: ubuntu:20.04
defaults:
run:
shell: bash
env:
PIP_INDEX_URL: ${{ secrets.PIP_INDEX_URL }}
PIP_EXTRA_INDEX_URL: ${{ secrets.PIP_EXTRA_INDEX_URL }}
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}

steps:
- name: Add masks
run: |
echo "::add-mask::${{ secrets.INTERNAL_PYPI_URL_FOR_MASK }}"
echo "::add-mask::${{ secrets.INTERNAL_REPO_URL_FOR_MASK }}"
# Replace default archive.ubuntu.com from docker image with fr mirror
# original archive showed performance issues and is farther away
- name: Docker container related setup and git installation
run: |
TZ=Europe/Paris
echo "TZ=${TZ}" >> "$GITHUB_ENV"
ln -snf /usr/share/zoneinfo/${TZ} /etc/localtime && echo ${TZ} > /etc/timezone
sed -i 's|^deb http://archive|deb http://fr.archive|g' /etc/apt/sources.list
apt update && apt install git git-lfs -y
- name: Checkout Code
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
lfs: true

- name: 'Set up Python 3.8'
uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f
with:
python-version: '3.8'

- name: Install dependencies
run: |
./script/make_utils/setup_os_deps.sh
make setup_env
- name: Refresh Notebooks
run: |
make jupyter_execute_gpu
- name: Prepare PR Body
run: |
SUCCESSFUL_NOTEBOOKS=$(cat ./successful_notebooks.txt | tr '\n' ' ' | sed 's/ /\\n- /g')
FAILED_NOTEBOOKS=$(cat ./failed_notebooks.txt | tr '\n' ' ' | sed 's/ /\\n- /g')
PR_BODY="Automatic PR with notebook refresh for ${{ github.ref_name }}.\\n"
PR_BODY+="## Successful Notebooks\\n- $SUCCESSFUL_NOTEBOOKS\\n"
PR_BODY+="## Failed Notebooks\\n- $FAILED_NOTEBOOKS"
echo "PR_BODY=${PR_BODY}" >> "$GITHUB_ENV"
- name: Open PR
uses: peter-evans/create-pull-request@c5a7806660adbe173f04e3e038b0ccdcd758773c
with:
token: ${{ secrets.BOT_TOKEN }}
commit-message: "chore: refresh notebooks"
branch: "refresh-notebooks-for-${{ github.ref_name }}"
base: "${{ github.ref_name }}"
title: "Refresh notebooks for ${{ github.ref_name }}"
body: ${{ env.PR_BODY }}
add-paths: |
docs/**/*.ipynb
stop-runner-linux:
name: Stop EC2 runner
needs: [refresh-notebooks, start-runner-linux]
runs-on: ubuntu-20.04
if: ${{ always() && (needs.start-runner-linux.result != 'skipped') }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}

- name: Stop EC2 runner python 38
uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e
if: ${{ always() && needs.start-runner-linux.outputs.ec2-instance-id-38 }}
with:
github-token: ${{ secrets.EC2_RUNNER_BOT_TOKEN }}
label: ${{ needs.start-runner-linux.outputs.label-38 }}
ec2-instance-id: ${{ needs.start-runner-linux.outputs.ec2-instance-id-38 }}
mode: stop

send-report:
if: ${{ always() }}
needs:
[
start-runner-linux,
refresh-notebooks,
stop-runner-linux,
]

name: Send Slack notification
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

- name: Prepare whole job status
if: ${{ always() }}
continue-on-error: true
env:
NEEDS_JSON: ${{ toJSON(needs) }}
run: |
echo "${NEEDS_JSON}" > /tmp/needs_context.json
JOB_STATUS=$(python3 ./script/actions_utils/actions_combine_status.py \
--needs_context_json /tmp/needs_context.json)
echo "JOB_STATUS=${JOB_STATUS}" >> "$GITHUB_ENV"
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_COLOR: ${{ env.JOB_STATUS || 'failure' }}
SLACK_MESSAGE: "Full run finished with status ${{ env.JOB_STATUS || 'failure' }} \
(${{ env.ACTION_RUN_URL }})\n\
- start-runner-linux: ${{ needs.start-runner-linux.result || 'Did not run.'}}\n\n\
- refresh-notebooks: ${{ needs.refresh-notebooks.result || 'Did not run.' }}\n\n\
- stop-runner-linux: ${{ needs.stop-runner-linux.result || 'Did not run.'}}"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,11 @@ jupyter_execute:
poetry run env ./script/make_utils/jupyter.sh --run_all_notebooks
"$(MAKE)" finalize_nb

# Run every GPU-enabled notebook through jupyter.sh (--run_all_notebooks_gpu),
# then call finalize_nb to sanitize the executed notebooks.
# NOTE(review): recipe lines must be tab-indented in the committed Makefile;
# indentation is not visible in this view — confirm against the repository.
.PHONY: jupyter_execute_gpu # Execute all GPU jupyter notebooks sequentially and sanitize
jupyter_execute_gpu:
poetry run env ./script/make_utils/jupyter.sh --run_all_notebooks_gpu
"$(MAKE)" finalize_nb

.PHONY: jupyter_execute_one # Execute one jupyter notebook and sanitize
jupyter_execute_one:
poetry run env ./script/make_utils/jupyter.sh --run_notebook "$${NOTEBOOK}"
Expand Down
20 changes: 20 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def pytest_addoption(parser):
help="To do longer tests.",
)

parser.addoption(
"--use_gpu",
action="store_true",
help="Force GPU compilation and execution in tests.",
)

parser.addoption(
"--no-flaky", action="store_true", default=False, help="Don't run known flaky tests."
)
Expand Down Expand Up @@ -275,6 +281,20 @@ def is_weekly_option(request):
return is_weekly


@pytest.fixture
def get_device_for_compilation(request):
    """Provide a callable selecting the hardware device for circuit compilation in tests.

    The returned callable takes an ``fhe_mode`` string and returns either
    ``"cuda"`` or ``"cpu"``.
    """

    def _select_device(fhe_mode):
        # Only actual FHE execution may target the GPU; every other mode
        # compiles for CPU regardless of the --use_gpu flag.
        if fhe_mode != "execute":
            return "cpu"
        return "cuda" if request.config.getoption("--use_gpu") else "cpu"

    return _select_device


# Method is not ideal as some MLIR can contain TLUs but not the associated graph
# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/2381
def check_graph_input_has_no_tlu_impl(graph: CPGraph):
Expand Down
1 change: 1 addition & 0 deletions docs/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- [Production deployment](guides/client_server.md)
- [Hybrid models](guides/hybrid-models.md)
- [Serialization](guides/serialization.md)
- [GPU acceleration](guides/using_gpu.md)

## Tutorials

Expand Down
33 changes: 23 additions & 10 deletions docs/advanced_examples/ClientServer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Compilation device override, was 'cpu' -> change to 'cuda'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand All @@ -102,6 +109,8 @@
}
],
"source": [
"from concrete.compiler import check_gpu_available\n",
"\n",
"# Let's first get some data and train a model.\n",
"X, y = load_breast_cancer(return_X_y=True)\n",
"\n",
Expand All @@ -114,10 +123,14 @@
"if platform.system() == \"Darwin\":\n",
" n_estimators = 9\n",
"\n",
"\n",
"use_gpu_if_available = False\n",
"device = \"cuda\" if use_gpu_if_available and check_gpu_available() else \"cpu\"\n",
"\n",
"# Train the model and compile it\n",
"model_dev = XGBClassifier(n_bits=2, n_estimators=n_estimators, max_depth=3)\n",
"model_dev.fit(X_model_owner, y_model_owner)\n",
"model_dev.compile(X_model_owner)\n",
"model_dev.compile(X_model_owner, device=device)\n",
"\n",
"print(\"Model trained and compiled.\")"
]
Expand Down Expand Up @@ -145,9 +158,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"total 72K\r\n",
"-rw-r--r-- 1 root root 59K Jun 20 17:23 client.zip\r\n",
"-rw-r--r-- 1 root root 8.6K Jun 20 17:23 server.zip\r\n"
"total 64K\r\n",
"-rw-rw-r-- 1 stoiana stoiana 59K août 31 14:34 client.zip\r\n",
"-rw-rw-r-- 1 stoiana stoiana 2,6K août 31 14:34 server.zip\r\n"
]
}
],
Expand Down Expand Up @@ -175,8 +188,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"total 12K\r\n",
"-rw-r--r-- 1 root root 8.6K Jun 20 17:23 server.zip\r\n"
"total 4,0K\r\n",
"-rw-rw-r-- 1 stoiana stoiana 2,6K août 31 14:34 server.zip\r\n"
]
}
],
Expand All @@ -196,7 +209,7 @@
"output_type": "stream",
"text": [
"total 60K\r\n",
"-rw-r--r-- 1 root root 59K Jun 20 17:23 client.zip\r\n"
"-rw-rw-r-- 1 stoiana stoiana 59K août 31 14:34 client.zip\r\n"
]
}
],
Expand All @@ -222,7 +235,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"KeySetCache: miss, regenerating /tmp/tmpywfsyoq6/12032042141691665494\n"
"KeySetCache: miss, regenerating /tmp/tmpqbqd0vq6/3497183917896914639\n"
]
}
],
Expand All @@ -243,7 +256,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation keys size: 29.86 MB\n"
"Evaluation keys size: 29.14 MB\n"
]
}
],
Expand All @@ -267,7 +280,7 @@
"output_type": "stream",
"text": [
"Encrypted data is 4.10 times larger than the clear data\n",
"The average execution time is 0.36 seconds per sample.\n"
"The average execution time is 0.55 seconds per sample.\n"
]
}
],
Expand Down
Loading

0 comments on commit 945aead

Please sign in to comment.