From 6e0925bf31638c80ef1b951264a4ab770bbee859 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 13:56:44 +0000 Subject: [PATCH 01/42] migrate from circleci to github actions --- .circleci/config.yml | 346 --------------------------- .github/workflows/build_and_test.yml | 204 ++++++++++++++++ 2 files changed, 204 insertions(+), 346 deletions(-) delete mode 100644 .circleci/config.yml create mode 100644 .github/workflows/build_and_test.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 07e685b8f..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,346 +0,0 @@ -version: 2.1 - -orbs: - python: circleci/python@2 - -commands: - run_chatgpt_api_test: - parameters: - inference_engine: - type: string - model_id: - type: string - expected_output: - type: string - prompt: - type: string - steps: - - run: - name: Run chatgpt api integration test (<>, <>) - command: | - source env/bin/activate - - # Set CLANG=1 for tinygrad only - if [ "<>" = "tinygrad" ]; then - pip install llvmlite - export TOKENIZERS_PARALLELISM=true SUPPORT_BF16=0 CLANG=1 - fi - - # Start first instance - HF_HOME="$(pwd)/.hf_cache_node1" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine <> \ - --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 \ - --chatgpt-api-response-timeout 900 --disable-tui > output1.log & - PID1=$! - tail -f output1.log & - TAIL1=$! - - # Start second instance - HF_HOME="$(pwd)/.hf_cache_node2" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine <> \ - --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 \ - --chatgpt-api-response-timeout 900 --disable-tui > output2.log & - PID2=$! - tail -f output2.log & - TAIL2=$! - - # Remember to kill the tail processes at the end - trap 'kill $TAIL1 $TAIL2' EXIT - - # Wait for discovery - sleep 10 - - # Function to check if processes are still running - check_processes() { - if ! kill -0 $PID1 2>/dev/null; then - echo "First instance (PID $PID1) died unexpectedly. Log output:" - cat output1.log - exit 1 - fi - if ! kill -0 $PID2 2>/dev/null; then - echo "Second instance (PID $PID2) died unexpectedly. Log output:" - cat output2.log - exit 1 - fi - } - - # Check processes before proceeding - check_processes - - echo "Sending request to first instance..." - response_1=$(curl -s http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "<>", - "messages": [{"role": "user", "content": "<>"}], - "temperature": 0.7 - }') - echo "Response 1: $response_1" - - # Check processes after first response - check_processes - - echo "Sending request to second instance..." 
- response_2=$(curl -s http://localhost:8001/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "<>", - "messages": [{"role": "user", "content": "<>"}], - "temperature": 0.7 - }') - echo "Response 2: $response_2" - - # Check processes after second response - check_processes - - # Stop both instances - kill $PID1 $PID2 - - echo "" - # Extract content using jq and check if it contains expected output - content1=$(echo "$response_1" | jq -r '.choices[0].message.content') - content2=$(echo "$response_2" | jq -r '.choices[0].message.content') - - if [[ "$content1" != *"<>"* ]] || [[ "$content2" != *"<>"* ]]; then - echo "Test failed: Response does not match '<>'" - echo "Response 1 content: $content1" - echo "" - echo "Response 2 content: $content2" - echo "Output of first instance:" - cat output1.log - echo "Output of second instance:" - cat output2.log - exit 1 - else - echo "Test passed: Response from both nodes matches '<>'" - fi - -jobs: - unit_test: - macos: - xcode: "16.0.0" - resource_class: m2pro.large - steps: - - checkout - - run: - name: Set up Python - command: | - brew install python@3.12 - python3.12 -m venv env - source env/bin/activate - - run: - name: Install dependencies - command: | - source env/bin/activate - pip install --upgrade pip - pip install . - - run: - name: Run tests - command: | - source env/bin/activate - # set TEMPERATURE to 0 for deterministic sampling - echo "Running inference engine tests..." - METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 METAL_XCODE=1 TEMPERATURE=0 python3 -m exo.inference.test_inference_engine - echo "Running tokenizer tests..." - python3 ./test/test_tokenizers.py - python3 ./test/test_model_helpers.py - - discovery_integration_test: - macos: - xcode: "16.0.0" - steps: - - checkout - - run: - name: Set up Python - command: | - brew install python@3.12 - python3.12 -m venv env - source env/bin/activate - - run: - name: Install dependencies - command: | - source env/bin/activate - pip install --upgrade pip - pip install . - - run: - name: Run discovery integration test - command: | - source env/bin/activate - DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --disable-tui > output1.log 2>&1 & - PID1=$! - DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --disable-tui > output2.log 2>&1 & - PID2=$! - sleep 10 - kill $PID1 $PID2 - if grep -q "Peer statuses: {\\'node2\\': \\'is_connected=True, health_check=True" output1.log && ! grep -q "Failed to connect peers:" output1.log && grep -q "Peer statuses: {\\'node1\\': \\'is_connected=True, health_check=True" output2.log && ! grep -q "Failed to connect peers:" output2.log; then - echo "Test passed: Both instances discovered each other" - exit 0 - else - echo "Test failed: Devices did not discover each other" - echo "Output of first instance:" - cat output1.log - echo "Output of second instance:" - cat output2.log - exit 1 - fi - - chatgpt_api_integration_test_mlx: - macos: - xcode: "16.0.0" - resource_class: m2pro.large - steps: - - checkout - - run: - name: Set up Python - command: | - brew install python@3.12 - python3.12 -m venv env - source env/bin/activate - - run: - name: Install dependencies - command: | - source env/bin/activate - pip install --upgrade pip - pip install . - - run_chatgpt_api_test: - inference_engine: mlx - model_id: llama-3.2-1b - prompt: "Keep responses concise. Who was the king of pop?" 
- expected_output: "Michael Jackson" - - chatgpt_api_integration_test_dummy: - macos: - xcode: "16.0.0" - resource_class: m2pro.large - steps: - - checkout - - run: - name: Set up Python - command: | - brew install python@3.12 - python3.12 -m venv env - source env/bin/activate - - run: - name: Install dependencies - command: | - source env/bin/activate - pip install --upgrade pip - pip install . - - run_chatgpt_api_test: - inference_engine: dummy - model_id: dummy - prompt: "Dummy prompt." - expected_output: "dummy" - - chatgpt_api_integration_test_tinygrad: - macos: - xcode: "16.0.0" - resource_class: m2pro.large - steps: - - checkout - - run: - name: Set up Python - command: | - brew install python@3.12 - python3.12 -m venv env - source env/bin/activate - - run: - name: Install dependencies - command: | - source env/bin/activate - pip install --upgrade pip - pip install . - - run_chatgpt_api_test: - inference_engine: tinygrad - model_id: llama-3.2-1b - prompt: "Keep responses concise. Who was the king of pop?" - expected_output: "Michael Jackson" - - measure_pip_sizes: - macos: - xcode: "16.0.0" - steps: - - checkout - - run: - name: Set up Python - command: | - brew install python@3.12 - python3.12 -m venv env - source env/bin/activate - - run: - name: Install dependencies and measure sizes - command: | - source env/bin/activate - pip install --upgrade pip - pip install . - python ./extra/pipsize.py --json ./pipsize.json - - store_artifacts: - path: ./pipsize.json - destination: pip-sizes.json - - check_line_count: - docker: - - image: cimg/python:3.10 - steps: - - checkout - - - run: - name: Setup git for PR comparison - command: | - if [[ -n "$CIRCLE_PULL_REQUEST" ]]; then - PR_NUMBER=$(echo $CIRCLE_PULL_REQUEST | rev | cut -d'/' -f1 | rev) - BASE_BRANCH=$(curl -s -H "Circle-Token: $CIRCLE_TOKEN" \ - "https://circleci.com/api/v2/project/github/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pipeline/$CIRCLE_WORKFLOW_ID" \ - | jq -r '.target_branch') - - git clone -b $BASE_BRANCH --single-branch \ - https://github.com/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME.git \ - base_branch - fi - - - run: - name: Install dependencies - command: | - python -m pip install --upgrade pip - pip install tabulate - - - run: - name: Run line count check - command: | - if [[ -n "$CIRCLE_PULL_REQUEST" ]]; then - python extra/line_counter.py base_branch . - else - python extra/line_counter.py . 
- fi - - - store_artifacts: - path: line-count-snapshot.json - destination: line-count-snapshot.json - - - store_artifacts: - path: line-count-diff.json - destination: line-count-diff.json - - - run: - name: Create test results directory - command: | - mkdir -p test-results/line-count - cp line-count-*.json test-results/line-count/ - - - store_test_results: - path: test-results - -workflows: - version: 2 - build_and_test: - jobs: - - check_line_count: - filters: - branches: - only: /.*/ - tags: - only: /.*/ - - unit_test - - discovery_integration_test - - chatgpt_api_integration_test_mlx - - chatgpt_api_integration_test_tinygrad - - chatgpt_api_integration_test_dummy - - measure_pip_sizes diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml new file mode 100644 index 000000000..02289150a --- /dev/null +++ b/.github/workflows/build_and_test.yml @@ -0,0 +1,204 @@ +name: Build and Test + +on: + push: + branches: [ '*' ] + tags: [ '*' ] + pull_request: + branches: [ '*' ] + +env: + PYTHON_VERSION: "3.12" + +jobs: + check_line_count: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tabulate + + - name: Run line count check + run: | + if [[ -n "${{ github.event.pull_request }}" ]]; then + git fetch origin ${{ github.base_ref }} + git clone -b ${{ github.base_ref }} --single-branch \ + https://github.com/${{ github.repository }}.git base_branch + python extra/line_counter.py base_branch . + else + python extra/line_counter.py . + fi + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: line-count-results + path: | + line-count-snapshot.json + line-count-diff.json + + unit_test: + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + python -m venv env + source env/bin/activate + pip install --upgrade pip + pip install . + + - name: Run tests + run: | + source env/bin/activate + # set TEMPERATURE to 0 for deterministic sampling + echo "Running inference engine tests..." + METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 METAL_XCODE=1 TEMPERATURE=0 python3 -m exo.inference.test_inference_engine + echo "Running tokenizer tests..." + python3 ./test/test_tokenizers.py + python3 ./test/test_model_helpers.py + + discovery_integration_test: + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + python -m venv env + source env/bin/activate + pip install --upgrade pip + pip install . + + - name: Run discovery integration test + run: | + source env/bin/activate + DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --disable-tui > output1.log 2>&1 & + PID1=$! + DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --disable-tui > output2.log 2>&1 & + PID2=$! + sleep 10 + kill $PID1 $PID2 + if grep -q "Peer statuses: {\\'node2\\': \\'is_connected=True, health_check=True" output1.log && ! 
grep -q "Failed to connect peers:" output1.log && grep -q "Peer statuses: {\\'node1\\': \\'is_connected=True, health_check=True" output2.log && ! grep -q "Failed to connect peers:" output2.log; then + echo "Test passed: Both instances discovered each other" + exit 0 + else + echo "Test failed: Devices did not discover each other" + echo "Output of first instance:" + cat output1.log + echo "Output of second instance:" + cat output2.log + exit 1 + fi + + chatgpt_api_tests: + runs-on: macos-14 + strategy: + matrix: + inference_engine: [mlx, tinygrad, dummy] + include: + - inference_engine: mlx + model_id: llama-3.2-1b + prompt: "Keep responses concise. Who was the king of pop?" + expected_output: "Michael Jackson" + - inference_engine: tinygrad + model_id: llama-3.2-1b + prompt: "Keep responses concise. Who was the king of pop?" + expected_output: "Michael Jackson" + - inference_engine: dummy + model_id: dummy + prompt: "Dummy prompt." + expected_output: "dummy" + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + python -m venv env + source env/bin/activate + pip install --upgrade pip + pip install . + if [ "${{ matrix.inference_engine }}" = "tinygrad" ]; then + pip install llvmlite + fi + + - name: Run ChatGPT API test + env: + TOKENIZERS_PARALLELISM: ${{ matrix.inference_engine == 'tinygrad' && 'true' || 'false' }} + SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '0' || '' }} + CLANG: ${{ matrix.inference_engine == 'tinygrad' && '1' || '' }} + run: | + source env/bin/activate + + # Start first instance + HF_HOME="$(pwd)/.hf_cache_node1" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine ${{ matrix.inference_engine }} \ + --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 \ + --chatgpt-api-response-timeout 900 --disable-tui > output1.log & + PID1=$! + tail -f output1.log & + TAIL1=$! + + # Start second instance + HF_HOME="$(pwd)/.hf_cache_node2" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine ${{ matrix.inference_engine }} \ + --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 \ + --chatgpt-api-response-timeout 900 --disable-tui > output2.log & + PID2=$! + tail -f output2.log & + TAIL2=$! + + # Remember to kill the tail processes at the end + trap 'kill $TAIL1 $TAIL2' EXIT + + # Rest of the test script remains the same as in your CircleCI config + # ... (Copy the remaining test logic from the CircleCI config) + + measure_pip_sizes: + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies and measure sizes + run: | + python -m venv env + source env/bin/activate + pip install --upgrade pip + pip install . 
+ python ./extra/pipsize.py --json ./pipsize.json + + - name: Upload pip sizes artifact + uses: actions/upload-artifact@v4 + with: + name: pip-sizes + path: ./pipsize.json From df832e20df02ad93a9ea46a88a82c7ab6214ecba Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 14:00:37 +0000 Subject: [PATCH 02/42] macos 15 --- .github/workflows/build_and_test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 02289150a..9bcba4070 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -48,7 +48,7 @@ jobs: line-count-diff.json unit_test: - runs-on: macos-14 + runs-on: macos-15 steps: - uses: actions/checkout@v4 @@ -75,7 +75,7 @@ jobs: python3 ./test/test_model_helpers.py discovery_integration_test: - runs-on: macos-14 + runs-on: macos-15 steps: - uses: actions/checkout@v4 @@ -113,7 +113,7 @@ jobs: fi chatgpt_api_tests: - runs-on: macos-14 + runs-on: macos-15 strategy: matrix: inference_engine: [mlx, tinygrad, dummy] @@ -180,7 +180,7 @@ jobs: # ... (Copy the remaining test logic from the CircleCI config) measure_pip_sizes: - runs-on: macos-14 + runs-on: macos-15 steps: - uses: actions/checkout@v4 From 62c9ec96962512bf2766364602549c839f02dfb2 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 14:02:56 +0000 Subject: [PATCH 03/42] github env vars --- extra/line_counter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extra/line_counter.py b/extra/line_counter.py index e5bf04289..01e7ee20e 100644 --- a/extra/line_counter.py +++ b/extra/line_counter.py @@ -74,9 +74,9 @@ def gen_diff(table_old, table_new): def create_json_report(table, is_diff=False): timestamp = datetime.now(timezone.utc).isoformat() - commit_sha = os.environ.get('CIRCLE_SHA1', 'unknown') - branch = os.environ.get('CIRCLE_BRANCH', 'unknown') - pr_number = os.environ.get('CIRCLE_PR_NUMBER', '') + commit_sha = os.environ.get('GITHUB_SHA', 'unknown') + branch = os.environ.get('GITHUB_REF_NAME', 'unknown') + pr_number = os.environ.get('GITHUB_EVENT_NUMBER', '') if is_diff: files = [{ From 550f70b1864d0ef08147b11a1955880a252d4dbe Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 14:06:12 +0000 Subject: [PATCH 04/42] job --- .github/workflows/build_and_test.yml | 69 +++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9bcba4070..04eba6d22 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -176,8 +176,73 @@ jobs: # Remember to kill the tail processes at the end trap 'kill $TAIL1 $TAIL2' EXIT - # Rest of the test script remains the same as in your CircleCI config - # ... (Copy the remaining test logic from the CircleCI config) + # Wait for discovery + sleep 10 + + # Function to check if processes are still running + check_processes() { + if ! kill -0 $PID1 2>/dev/null; then + echo "First instance (PID $PID1) died unexpectedly. Log output:" + cat output1.log + exit 1 + fi + if ! kill -0 $PID2 2>/dev/null; then + echo "Second instance (PID $PID2) died unexpectedly. Log output:" + cat output2.log + exit 1 + fi + } + + # Check processes before proceeding + check_processes + + echo "Sending request to first instance..." 
+ response_1=$(curl -s http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "${{ matrix.model_id }}", + "messages": [{"role": "user", "content": "${{ matrix.prompt }}"}], + "temperature": 0.7 + }') + echo "Response 1: $response_1" + + # Check processes after first response + check_processes + + echo "Sending request to second instance..." + response_2=$(curl -s http://localhost:8001/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "${{ matrix.model_id }}", + "messages": [{"role": "user", "content": "${{ matrix.prompt }}"}], + "temperature": 0.7 + }') + echo "Response 2: $response_2" + + # Check processes after second response + check_processes + + # Stop both instances + kill $PID1 $PID2 + + echo "" + # Extract content using jq and check if it contains expected output + content1=$(echo "$response_1" | jq -r '.choices[0].message.content') + content2=$(echo "$response_2" | jq -r '.choices[0].message.content') + + if [[ "$content1" != *"${{ matrix.expected_output }}"* ]] || [[ "$content2" != *"${{ matrix.expected_output }}"* ]]; then + echo "Test failed: Response does not match '${{ matrix.expected_output }}'" + echo "Response 1 content: $content1" + echo "" + echo "Response 2 content: $content2" + echo "Output of first instance:" + cat output1.log + echo "Output of second instance:" + cat output2.log + exit 1 + else + echo "Test passed: Response from both nodes matches '${{ matrix.expected_output }}'" + fi measure_pip_sizes: runs-on: macos-15 From 66c7c3386926ae2541a09b4e325328dc4d4a1fae Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 14:46:56 +0000 Subject: [PATCH 05/42] run tinygrad and discovery integratrion tests on linux --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 04eba6d22..ab8bd2052 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -75,7 +75,7 @@ jobs: python3 ./test/test_model_helpers.py discovery_integration_test: - runs-on: macos-15 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -113,7 +113,7 @@ jobs: fi chatgpt_api_tests: - runs-on: macos-15 + runs-on: ${{ matrix.inference_engine == 'tinygrad' && 'ubuntu-latest' || 'macos-15' }} strategy: matrix: inference_engine: [mlx, tinygrad, dummy] From 0a0c058b8024f7f738194fe2430e0980a32ee23a Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 14:52:07 +0000 Subject: [PATCH 06/42] more robust discovery log check --- .github/workflows/build_and_test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ab8bd2052..705027aae 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -100,7 +100,10 @@ jobs: PID2=$! sleep 10 kill $PID1 $PID2 - if grep -q "Peer statuses: {\\'node2\\': \\'is_connected=True, health_check=True" output1.log && ! grep -q "Failed to connect peers:" output1.log && grep -q "Peer statuses: {\\'node1\\': \\'is_connected=True, health_check=True" output2.log && ! grep -q "Failed to connect peers:" output2.log; then + if grep -q "Peer statuses: {.*'node2': 'is_connected=True, health_check=True" output1.log && \ + ! grep -q "Failed to connect peers:" output1.log && \ + grep -q "Peer statuses: {.*'node1': 'is_connected=True, health_check=True" output2.log && \ + ! 
grep -q "Failed to connect peers:" output2.log; then echo "Test passed: Both instances discovered each other" exit 0 else From 4940f5269cf275e8ac8b9135a23dc0f71da3090e Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 15:00:26 +0000 Subject: [PATCH 07/42] check discovery on integration tests too --- .github/workflows/build_and_test.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 705027aae..8ea096751 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -116,7 +116,7 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ matrix.inference_engine == 'tinygrad' && 'ubuntu-latest' || 'macos-15' }} + runs-on: ${{ matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy' ? 'ubuntu-latest' : 'macos-15' }} strategy: matrix: inference_engine: [mlx, tinygrad, dummy] @@ -179,8 +179,20 @@ jobs: # Remember to kill the tail processes at the end trap 'kill $TAIL1 $TAIL2' EXIT - # Wait for discovery + # Wait for discovery and verify peer connections sleep 10 + if ! grep -q "Peer statuses: {.*'node2': 'is_connected=True, health_check=True" output1.log || \ + grep -q "Failed to connect peers:" output1.log || \ + ! grep -q "Peer statuses: {.*'node1': 'is_connected=True, health_check=True" output2.log || \ + grep -q "Failed to connect peers:" output2.log; then + echo "Test failed: Nodes did not discover each other properly" + echo "Output of first instance:" + cat output1.log + echo "Output of second instance:" + cat output2.log + exit 1 + fi + echo "Peer discovery successful" # Function to check if processes are still running check_processes() { From 4feaf73142a850cc104cbb6c0ef27c89b72c77e0 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 16:25:31 +0000 Subject: [PATCH 08/42] cond --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8ea096751..e6152c401 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -116,7 +116,7 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy' ? 'ubuntu-latest' : 'macos-15' }} + runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest' || 'macos-15' }} strategy: matrix: inference_engine: [mlx, tinygrad, dummy] From 3bbca5723e206e0ca0daa99ac75a2939cc4e8675 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 16:39:39 +0000 Subject: [PATCH 09/42] give this a goh --- .github/workflows/build_and_test.yml | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e6152c401..e7068f0b0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -9,6 +9,8 @@ on: env: PYTHON_VERSION: "3.12" + TOKENIZERS_PARALLELISM: "false" + PYTHONPATH: "." 
jobs: check_line_count: @@ -49,6 +51,7 @@ jobs: unit_test: runs-on: macos-15 + timeout-minutes: 20 steps: - uses: actions/checkout@v4 @@ -57,20 +60,29 @@ jobs: with: python-version: ${{ env.PYTHON_VERSION }} + # - name: Cache python packages + # uses: actions/cache@v4 + # with: + # path: ${{ env.Python3_ROOT_DIR }}/lib/python3.12/site-packages + # key: testing-packages-${{ hashFiles('**/setup.py') }} + - name: Install dependencies run: | python -m venv env source env/bin/activate pip install --upgrade pip + pip install llvmlite pip install . + - name: Basic import test + run: | + source env/bin/activate + python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))" + - name: Run tests run: | source env/bin/activate - # set TEMPERATURE to 0 for deterministic sampling - echo "Running inference engine tests..." - METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 METAL_XCODE=1 TEMPERATURE=0 python3 -m exo.inference.test_inference_engine - echo "Running tokenizer tests..." + METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=1 METAL_XCODE=1 TEMPERATURE=0 python3 -m exo.inference.test_inference_engine python3 ./test/test_tokenizers.py python3 ./test/test_model_helpers.py From a52ac61835b2baa2c6b1703ae33d06e829c9f40f Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 16:42:22 +0000 Subject: [PATCH 10/42] tooonygrad --- .github/workflows/build_and_test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7068f0b0..17d9b5cad 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -167,8 +167,6 @@ jobs: - name: Run ChatGPT API test env: TOKENIZERS_PARALLELISM: ${{ matrix.inference_engine == 'tinygrad' && 'true' || 'false' }} - SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '0' || '' }} - CLANG: ${{ matrix.inference_engine == 'tinygrad' && '1' || '' }} run: | source env/bin/activate From be18c96cef00fae061fcde3529c1aecfcbd14f5b Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 16:51:54 +0000 Subject: [PATCH 11/42] disable mlx test for now..plan to run this on a self-hosted runner --- .github/workflows/build_and_test.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 17d9b5cad..31eeba249 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -131,12 +131,13 @@ jobs: runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest' || 'macos-15' }} strategy: matrix: - inference_engine: [mlx, tinygrad, dummy] + # inference_engine: [mlx, tinygrad, dummy] + inference_engine: [tinygrad, dummy] include: - - inference_engine: mlx - model_id: llama-3.2-1b - prompt: "Keep responses concise. Who was the king of pop?" - expected_output: "Michael Jackson" + # - inference_engine: mlx + # model_id: llama-3.2-1b + # prompt: "Keep responses concise. Who was the king of pop?" + # expected_output: "Michael Jackson" - inference_engine: tinygrad model_id: llama-3.2-1b prompt: "Keep responses concise. Who was the king of pop?" 
From b6529c204770f06d7a6b2ed706b668eedaab88af Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 16:59:18 +0000 Subject: [PATCH 12/42] clang for tinygrad --- .github/workflows/build_and_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 31eeba249..8ef7cd244 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -168,6 +168,8 @@ jobs: - name: Run ChatGPT API test env: TOKENIZERS_PARALLELISM: ${{ matrix.inference_engine == 'tinygrad' && 'true' || 'false' }} + SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '0' || '0' }} + CLANG: ${{ matrix.inference_engine == 'tinygrad' && '1' || '0' }} run: | source env/bin/activate From db7d3a5f7430ff294a1dee8950534fdaddec3da5 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:03:16 +0000 Subject: [PATCH 13/42] add another chatgpt api integration test for tinygrad on metal --- .github/workflows/build_and_test.yml | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8ef7cd244..3ea443db9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -128,24 +128,25 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest' || 'macos-15' }} + runs-on: ${{ (matrix.inference_engine == 'tinygrad' && matrix.backend != 'metal') || matrix.inference_engine == 'dummy' ? 'ubuntu-latest' : 'macos-14' }} strategy: matrix: - # inference_engine: [mlx, tinygrad, dummy] - inference_engine: [tinygrad, dummy] include: - # - inference_engine: mlx - # model_id: llama-3.2-1b - # prompt: "Keep responses concise. Who was the king of pop?" - # expected_output: "Michael Jackson" - inference_engine: tinygrad model_id: llama-3.2-1b prompt: "Keep responses concise. Who was the king of pop?" expected_output: "Michael Jackson" + backend: cpu + - inference_engine: tinygrad + model_id: llama-3.2-1b + prompt: "Keep responses concise. Who was the king of pop?" + expected_output: "Michael Jackson" + backend: metal - inference_engine: dummy model_id: dummy prompt: "Dummy prompt." 
expected_output: "dummy" + backend: cpu steps: - uses: actions/checkout@v4 @@ -170,6 +171,11 @@ jobs: TOKENIZERS_PARALLELISM: ${{ matrix.inference_engine == 'tinygrad' && 'true' || 'false' }} SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '0' || '0' }} CLANG: ${{ matrix.inference_engine == 'tinygrad' && '1' || '0' }} + METAL: ${{ matrix.backend == 'metal' && '1' || '0' }} + METAL_DEVICE_WRAPPER_TYPE: ${{ matrix.backend == 'metal' && '1' || '0' }} + METAL_DEBUG_ERROR_MODE: ${{ matrix.backend == 'metal' && '1' || '0' }} + METAL_XCODE: ${{ matrix.backend == 'metal' && '1' || '0' }} + run: | source env/bin/activate From 0fe8065ffbf65687e50f9c07d2d061e7d9e7de6f Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:06:48 +0000 Subject: [PATCH 14/42] remove tinygrad macos --- .github/workflows/build_and_test.yml | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3ea443db9..8ef7cd244 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -128,25 +128,24 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ (matrix.inference_engine == 'tinygrad' && matrix.backend != 'metal') || matrix.inference_engine == 'dummy' ? 'ubuntu-latest' : 'macos-14' }} + runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest' || 'macos-15' }} strategy: matrix: + # inference_engine: [mlx, tinygrad, dummy] + inference_engine: [tinygrad, dummy] include: + # - inference_engine: mlx + # model_id: llama-3.2-1b + # prompt: "Keep responses concise. Who was the king of pop?" + # expected_output: "Michael Jackson" - inference_engine: tinygrad model_id: llama-3.2-1b prompt: "Keep responses concise. Who was the king of pop?" expected_output: "Michael Jackson" - backend: cpu - - inference_engine: tinygrad - model_id: llama-3.2-1b - prompt: "Keep responses concise. Who was the king of pop?" - expected_output: "Michael Jackson" - backend: metal - inference_engine: dummy model_id: dummy prompt: "Dummy prompt." 
expected_output: "dummy" - backend: cpu steps: - uses: actions/checkout@v4 @@ -171,11 +170,6 @@ jobs: TOKENIZERS_PARALLELISM: ${{ matrix.inference_engine == 'tinygrad' && 'true' || 'false' }} SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '0' || '0' }} CLANG: ${{ matrix.inference_engine == 'tinygrad' && '1' || '0' }} - METAL: ${{ matrix.backend == 'metal' && '1' || '0' }} - METAL_DEVICE_WRAPPER_TYPE: ${{ matrix.backend == 'metal' && '1' || '0' }} - METAL_DEBUG_ERROR_MODE: ${{ matrix.backend == 'metal' && '1' || '0' }} - METAL_XCODE: ${{ matrix.backend == 'metal' && '1' || '0' }} - run: | source env/bin/activate From 099917a1deaf62a5922d612f72ffa99659724323 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:13:56 +0000 Subject: [PATCH 15/42] prio loopback over container virtual --- exo/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exo/helpers.py b/exo/helpers.py index 943d7f514..fb497d188 100644 --- a/exo/helpers.py +++ b/exo/helpers.py @@ -240,11 +240,11 @@ def get_interface_priority_and_type(ifname: str) -> Tuple[int, str]: # Local container/virtual interfaces if (ifname.startswith(('docker', 'br-', 'veth', 'cni', 'flannel', 'calico', 'weave')) or 'bridge' in ifname): - return (7, "Container Virtual") + return (6, "Container Virtual") # Loopback interface if ifname.startswith('lo'): - return (6, "Loopback") + return (7, "Loopback") # Thunderbolt/10GbE detection if ifname.startswith(('tb', 'nx', 'ten')): From 32ba0b4cd25f986aea7fa64db5afd596d410bddf Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:21:25 +0000 Subject: [PATCH 16/42] teeenygrad --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8ef7cd244..52fc7daad 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -168,7 +168,7 @@ jobs: - name: Run ChatGPT API test env: TOKENIZERS_PARALLELISM: ${{ matrix.inference_engine == 'tinygrad' && 'true' || 'false' }} - SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '0' || '0' }} + SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '1' || '0' }} CLANG: ${{ matrix.inference_engine == 'tinygrad' && '1' || '0' }} run: | source env/bin/activate From e645b2b6b3d02607a12d09da6c40d7f85853bf71 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:30:52 +0000 Subject: [PATCH 17/42] run on beefy machine --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 52fc7daad..39a939c24 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -128,7 +128,7 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest' || 'macos-15' }} + runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} strategy: matrix: # inference_engine: [mlx, tinygrad, dummy] @@ -168,7 +168,7 @@ jobs: - name: Run ChatGPT API test env: TOKENIZERS_PARALLELISM: ${{ matrix.inference_engine == 'tinygrad' && 'true' || 'false' }} - SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '1' || '0' }} + SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '0' || '0' }} CLANG: ${{ matrix.inference_engine == 'tinygrad' && 
'1' || '0' }} run: | source env/bin/activate From 8eaf9d74c36dd24e90b7cdb261c7b36f9ff95c62 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:47:38 +0000 Subject: [PATCH 18/42] group and labels --- .github/workflows/build_and_test.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 39a939c24..ca0b3b36a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -50,7 +50,9 @@ jobs: line-count-diff.json unit_test: - runs-on: macos-15 + runs-on: + group: Default + labels: macos-15 timeout-minutes: 20 steps: - uses: actions/checkout@v4 @@ -128,7 +130,9 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} + runs-on: + group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'Default Larger Runners' || 'Default' }} + labels: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} strategy: matrix: # inference_engine: [mlx, tinygrad, dummy] @@ -273,7 +277,9 @@ jobs: fi measure_pip_sizes: - runs-on: macos-15 + runs-on: + group: Default + labels: macos-15 steps: - uses: actions/checkout@v4 From 8cf9a871d3a7d844fc2e0d02cfacdea140f3cf1e Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:49:49 +0000 Subject: [PATCH 19/42] t --- .github/workflows/build_and_test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ca0b3b36a..b80be1385 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -51,7 +51,6 @@ jobs: unit_test: runs-on: - group: Default labels: macos-15 timeout-minutes: 20 steps: @@ -278,7 +277,6 @@ jobs: measure_pip_sizes: runs-on: - group: Default labels: macos-15 steps: - uses: actions/checkout@v4 From 9fbce8c10ad2977b750ab5dc9ecbe25ad3f6da9e Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:51:20 +0000 Subject: [PATCH 20/42] t From f4ffdcfef26495709bd9ee66e5f4c9178b11fb1b Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:53:39 +0000 Subject: [PATCH 21/42] t --- .github/workflows/build_and_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b80be1385..feab6c4c3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -130,7 +130,6 @@ jobs: chatgpt_api_tests: runs-on: - group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'Default Larger Runners' || 'Default' }} labels: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} strategy: matrix: From e873d273cdc7fa27aaa183b4df56a82f4f3ea481 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:58:31 +0000 Subject: [PATCH 22/42] t --- .github/workflows/build_and_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index feab6c4c3..7e94ed4f6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -130,7 +130,8 @@ jobs: chatgpt_api_tests: runs-on: - labels: ${{ (matrix.inference_engine == 'tinygrad' || 
matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} + # group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'Default Larger Runners' || 'Default' }} + group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} strategy: matrix: # inference_engine: [mlx, tinygrad, dummy] From de99933e4ad5a60b1cd13a762def454549431dc6 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 17:59:33 +0000 Subject: [PATCH 23/42] a --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7e94ed4f6..d9dcaf646 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -129,9 +129,9 @@ jobs: fi chatgpt_api_tests: - runs-on: + runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} # group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'Default Larger Runners' || 'Default' }} - group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} + # group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} strategy: matrix: # inference_engine: [mlx, tinygrad, dummy] From 0d95fe380d9c9334b03d3ac12332d1a22c09820a Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 18:00:28 +0000 Subject: [PATCH 24/42] e From 135a2276279f3347ed4601e2e777ec0d140eddda Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 6 Dec 2024 18:02:51 +0000 Subject: [PATCH 25/42] t --- .github/workflows/build_and_test.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d9dcaf646..8ef7cd244 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -50,8 +50,7 @@ jobs: line-count-diff.json unit_test: - runs-on: - labels: macos-15 + runs-on: macos-15 timeout-minutes: 20 steps: - uses: actions/checkout@v4 @@ -129,9 +128,7 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} - # group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'Default Larger Runners' || 'Default' }} - # group: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest-large' || 'macos-15' }} + runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest' || 'macos-15' }} strategy: matrix: # inference_engine: [mlx, tinygrad, dummy] @@ -276,8 +273,7 @@ jobs: fi measure_pip_sizes: - runs-on: - labels: macos-15 + runs-on: macos-15 steps: - uses: actions/checkout@v4 From 5abdf6a9d88a536882ed1f468d46902272092ca3 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 18:24:56 +0000 Subject: [PATCH 26/42] new dash --- extra/dashboard/dashboard.py | 1270 +++--------------------------- extra/dashboard/mock_data.json | 54 ++ extra/dashboard/requirements.txt | 7 +- 3 files changed, 181 insertions(+), 1150 deletions(-) create mode 100644 extra/dashboard/mock_data.json diff --git a/extra/dashboard/dashboard.py 
b/extra/dashboard/dashboard.py index 0ef6d3edd..df047c3f7 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -1,1147 +1,127 @@ -import os +import dash +from dash import html, dcc +import plotly.graph_objs as go +from dash.dependencies import Input, Output +import boto3 import json -import logging -import asyncio -import aiohttp -import pandas as pd -import plotly.express as px -from typing import List, Dict, Optional -from pathlib import Path -from plotly.subplots import make_subplots -import plotly.graph_objects as go -import time -import pygame.mixer -from datetime import datetime - -class AsyncCircleCIClient: - def __init__(self, token: str, project_slug: str): - self.token = token - self.project_slug = project_slug - self.base_url = "https://circleci.com/api/v2" - self.headers = { - "Circle-Token": token, - "Accept": "application/json" - } - self.logger = logging.getLogger("CircleCI") - - async def get_json(self, session: aiohttp.ClientSession, url: str, params: Dict = None) -> Dict: - async with session.get(url, params=params) as response: - response.raise_for_status() - return await response.json() - - async def get_recent_pipelines( - self, - session: aiohttp.ClientSession, - org_slug: str = None, - page_token: str = None, - limit: int = None, - branch: str = None - ): - """ - Get recent pipelines for a project with pagination support - """ - params = { - "branch": branch, - "page-token": page_token - } - - # Remove None values - params = {k: v for k, v in params.items() if v is not None} - - url = f"{self.base_url}/project/{self.project_slug}/pipeline" - data = await self.get_json(session, url, params) - pipelines = data["items"] - - next_page_token = data.get("next_page_token") - - # If we have a limit, check if we need more pages - if limit and len(pipelines) >= limit: - return pipelines - - # If there are more pages and we haven't hit the limit, recursively get them - if next_page_token: - next_pipelines = await self.get_recent_pipelines( - session, - org_slug, - page_token=next_page_token, - limit=limit - len(pipelines) if limit else None, # Adjust limit for next page - branch=branch - ) - pipelines.extend(next_pipelines) - - return pipelines - - async def get_workflow_jobs(self, session: aiohttp.ClientSession, pipeline_id: str) -> List[Dict]: - self.logger.debug(f"Fetching workflows for pipeline {pipeline_id}") - url = f"{self.base_url}/pipeline/{pipeline_id}/workflow" - workflows_data = await self.get_json(session, url) - workflows = workflows_data["items"] - - # Fetch all jobs for all workflows in parallel - jobs_tasks = [] - for workflow in workflows: - url = f"{self.base_url}/workflow/{workflow['id']}/job" - jobs_tasks.append(self.get_json(session, url)) - - jobs_responses = await asyncio.gather(*jobs_tasks, return_exceptions=True) - - all_jobs = [] - for jobs_data in jobs_responses: - if isinstance(jobs_data, Exception): - continue - all_jobs.extend(jobs_data["items"]) - - return all_jobs - - async def get_artifacts(self, session: aiohttp.ClientSession, job_number: str) -> List[Dict]: - url = f"{self.base_url}/project/{self.project_slug}/{job_number}/artifacts" - data = await self.get_json(session, url) - return data["items"] - -class PackageSizeTracker: - def __init__(self, token: str, project_slug: str, debug: bool = False): - self.setup_logging(debug) - self.client = AsyncCircleCIClient(token, project_slug) - self.logger = logging.getLogger("PackageSizeTracker") - self.last_data_hash = None - self.debug = debug - - # Initialize pygame mixer 
- pygame.mixer.init() - - # Sound file paths - can use MP3 files with pygame - sounds_dir = Path(__file__).parent / "sounds" - self.sounds = { - 'lines_up': sounds_dir / "gta5_wasted.mp3", - 'lines_down': sounds_dir / "pokemon_evolve.mp3", - 'tokens_up': sounds_dir / "pokemon_evolve.mp3", - 'tokens_down': sounds_dir / "gta5_wasted.mp3", - 'size_up': sounds_dir / "gta5_wasted.mp3", - 'size_down': sounds_dir / "pokemon_evolve.mp3" - } - - def test_sound_effects(self): - """Test all sound effects with a small delay between each""" - self.logger.info("Testing sound effects...") - for sound_key in self.sounds: - self.logger.info(f"Playing {sound_key}") - self._play_sound(sound_key) - time.sleep(1) # Wait 1 second between sounds - - def setup_logging(self, debug: bool): - level = logging.DEBUG if debug else logging.INFO - logging.basicConfig( - level=level, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - datefmt='%H:%M:%S' - ) - - def extract_commit_info(self, pipeline: Dict) -> Optional[Dict]: - try: - # Extract from github_app first (preferred) - if 'trigger_parameters' in pipeline and 'github_app' in pipeline['trigger_parameters']: - github_app = pipeline['trigger_parameters']['github_app'] - return { - 'commit_hash': github_app.get('checkout_sha'), - 'web_url': f"{github_app.get('repo_url')}/commit/{github_app.get('checkout_sha')}", - 'branch': github_app.get('branch', 'unknown'), - 'author': { - 'name': github_app.get('commit_author_name'), - 'email': github_app.get('commit_author_email'), - 'username': github_app.get('user_username') - }, - 'message': github_app.get('commit_message') - } - - # Fallback to git parameters - if 'trigger_parameters' in pipeline and 'git' in pipeline['trigger_parameters']: - git = pipeline['trigger_parameters']['git'] - return { - 'commit_hash': git.get('checkout_sha'), - 'web_url': f"{git.get('repo_url')}/commit/{git.get('checkout_sha')}", - 'branch': git.get('branch', 'unknown'), - 'author': { - 'name': git.get('commit_author_name'), - 'email': git.get('commit_author_email'), - 'username': git.get('author_login') - }, - 'message': git.get('commit_message') - } - - self.logger.warning(f"Could not find commit info in pipeline {pipeline['id']}") - return None - - except Exception as e: - self.logger.error(f"Error extracting commit info: {str(e)}") - return None - - async def process_pipeline(self, session: aiohttp.ClientSession, pipeline: Dict) -> Optional[Dict]: - try: - commit_info = self.extract_commit_info(pipeline) - if not commit_info: - return None - - data_point = { - "commit_hash": commit_info['commit_hash'], - "commit_url": commit_info['web_url'], - "timestamp": pipeline.get("created_at", pipeline.get("updated_at")), - "pipeline_status": pipeline.get("state", "unknown"), - "branch": commit_info['branch'], - "author": commit_info['author'], - "commit_message": commit_info['message'] - } - - jobs = await self.client.get_workflow_jobs(session, pipeline["id"]) - - # Get package size data - size_job = next( - (j for j in jobs if j["name"] == "measure_pip_sizes" and j["status"] == "success"), - None - ) - - # Get line count data - linecount_job = next( - (j for j in jobs if j["name"] == "check_line_count" and j["status"] == "success"), - None - ) - - # Get benchmark data from runner job - benchmark_job = next( - (j for j in jobs if j["name"] == "runner" and j["status"] == "success"), - None - ) - - # Return None if no relevant jobs found - if not size_job and not linecount_job and not benchmark_job: - self.logger.debug(f"No relevant jobs 
found for pipeline {pipeline['id']}") - return None - - # Process benchmark data if available - if benchmark_job: - benchmark_artifacts = await self.client.get_artifacts(session, benchmark_job["job_number"]) - benchmark_report = next( - (a for a in benchmark_artifacts if a["path"].endswith("benchmark.json")), - None - ) - if benchmark_report: - benchmark_data = await self.client.get_json(session, benchmark_report["url"]) - data_point.update({ - "tokens_per_second": benchmark_data["tokens_per_second"], - "time_to_first_token": benchmark_data.get("time_to_first_token", 0) - }) - self.logger.info( - f"Processed benchmark data for pipeline {pipeline['id']}: " - f"commit {commit_info['commit_hash'][:7]}, " - f"tokens/s {benchmark_data['tokens_per_second']:.2f}" - ) - - # Process size data if available - if size_job: - size_artifacts = await self.client.get_artifacts(session, size_job["job_number"]) - size_report = next( - (a for a in size_artifacts if a["path"].endswith("pip-sizes.json")), - None - ) - if size_report: - size_data = await self.client.get_json(session, size_report["url"]) - data_point.update({ - "total_size_mb": size_data["total_size_mb"], - "packages": size_data["packages"] - }) - self.logger.info( - f"Processed size data for pipeline {pipeline['id']}: " - f"commit {commit_info['commit_hash'][:7]}, " - f"size {size_data['total_size_mb']:.2f}MB" - ) - - # Process linecount data if available - if linecount_job: - linecount_artifacts = await self.client.get_artifacts(session, linecount_job["job_number"]) - linecount_report = next( - (a for a in linecount_artifacts if a["path"].endswith("line-count-snapshot.json")), - None - ) - if linecount_report: - linecount_data = await self.client.get_json(session, linecount_report["url"]) - data_point.update({ - "total_lines": linecount_data["total_lines"], - "total_files": linecount_data["total_files"], - "files": linecount_data["files"] - }) - self.logger.info( - f"Processed line count data for pipeline {pipeline['id']}: " - f"commit {commit_info['commit_hash'][:7]}, " - f"lines {linecount_data['total_lines']:,}" - ) - - return data_point - - except Exception as e: - self.logger.error(f"Error processing pipeline {pipeline['id']}: {str(e)}") - return None - - async def process_pipeline_batch( - self, - session: aiohttp.ClientSession, - pipelines: List[Dict], - batch_size: int = 5 - ) -> List[Dict]: - """ - Process a batch of pipelines with rate limiting. 
- - Args: - session: aiohttp client session - pipelines: List of pipelines to process - batch_size: Number of pipelines to process in parallel - - Returns: - List of processed pipeline data points - """ - data_points = [] - - for i in range(0, len(pipelines), batch_size): - batch = pipelines[i:i + batch_size] - - # Process batch in parallel - tasks = [self.process_pipeline(session, pipeline) for pipeline in batch] - batch_results = await asyncio.gather(*tasks) - - # Filter out None results - batch_data = [r for r in batch_results if r is not None] - data_points.extend(batch_data) - - # Add delay between batches if there are more to process - if i + batch_size < len(pipelines): - await asyncio.sleep(1) # 1 second delay between batches - - return data_points - - async def collect_data(self) -> List[Dict]: - self.logger.info("Starting data collection...") - async with aiohttp.ClientSession(headers=self.client.headers) as session: - # Get pipelines from main branch - main_pipelines = await self.client.get_recent_pipelines( - session, - org_slug=self.client.project_slug, - limit=20, - branch="main" - ) - - # Add delay between branch requests - await asyncio.sleep(2) - - # Get pipelines from circleci branch - circleci_pipelines = await self.client.get_recent_pipelines( - session, - org_slug=self.client.project_slug, - limit=20, - branch="circleci" - ) - - # Combine pipelines and sort by created_at date - pipelines = main_pipelines + circleci_pipelines - pipelines.sort( - key=lambda x: datetime.fromisoformat( - x.get("created_at", x.get("updated_at")).replace('Z', '+00:00') - ), - reverse=True # Most recent first - ) - - self.logger.info(f"Found {len(pipelines)} recent pipelines") - - # Process pipelines in batches - data_points = await self.process_pipeline_batch(session, pipelines) - - # Sort by timestamp - data_points.sort( - key=lambda x: datetime.fromisoformat( - x.get("timestamp").replace('Z', '+00:00') - ), - reverse=True # Most recent first - ) - - return data_points - - def generate_report(self, data: List[Dict], output_dir: str = "reports") -> Optional[str]: - self.logger.info("Generating report...") - if not data: - self.logger.error("No data to generate report from!") - return None - - # Get latest pipeline status based on errors - latest_main_pipeline = next((d for d in data if d.get('branch') == 'main'), None) - latest_pipeline_status = 'success' if latest_main_pipeline and not latest_main_pipeline.get('errors') else 'failure' - - # Log the pipeline status - if latest_main_pipeline: - self.logger.info( - f"Latest main branch pipeline status: {latest_pipeline_status} " - f"(commit: {latest_main_pipeline['commit_hash'][:7]})" - ) - else: - self.logger.warning("No pipeline data found for main branch") - - # Convert output_dir to Path object - output_dir = Path(output_dir) - - # Create output directory if it doesn't exist - output_dir.mkdir(parents=True, exist_ok=True) - - # Create separate dataframes for each metric - df_size = pd.DataFrame([d for d in data if 'total_size_mb' in d]) - df_lines = pd.DataFrame([d for d in data if 'total_lines' in d]) - df_benchmark = pd.DataFrame([d for d in data if 'tokens_per_second' in d]) - - # Create a single figure with subplots - fig = make_subplots( - rows=3, cols=2, - subplot_titles=('', 'Package Size', '', 'Line Count', '', 'Tokens per Second'), - vertical_spacing=0.2, - column_widths=[0.2, 0.8], - specs=[[{"type": "indicator"}, {"type": "scatter"}], - [None, {"type": "scatter"}], - [None, {"type": "scatter"}]] - ) - - # Add package size trace 
if we have data
-        if not df_size.empty:
-            df_size['timestamp'] = pd.to_datetime(df_size['timestamp'])
-            df_size = df_size.sort_values('timestamp')
-
-            fig.add_trace(
-                go.Scatter(
-                    x=df_size['timestamp'],
-                    y=df_size['total_size_mb'],
-                    mode='lines+markers',
-                    name='Package Size',
-                    customdata=df_size[['commit_hash', 'commit_url']].values,
-                    hovertemplate="<br>".join([
-                        "Size: %{y:.2f}MB",
-                        "Date: %{x}",
-                        "Commit: %{customdata[0]}",
-                        "<extra></extra>"
-                    ])
-                ),
-                row=1, col=2
-            )
-            fig.update_yaxes(title_text="Size (MB)", row=1, col=2)
-
-        # Add line count trace if we have data
-        if not df_lines.empty:
-            df_lines['timestamp'] = pd.to_datetime(df_lines['timestamp'])
-            df_lines = df_lines.sort_values('timestamp')
-
-            fig.add_trace(
-                go.Scatter(
-                    x=df_lines['timestamp'],
-                    y=df_lines['total_lines'],
-                    mode='lines+markers',
-                    name='Line Count',
-                    customdata=df_lines[['commit_hash', 'commit_url']].values,
-                    hovertemplate="<br>".join([
-                        "Lines: %{y:,.0f}",
-                        "Date: %{x}",
-                        "Commit: %{customdata[0]}",
-                        "<extra></extra>"
-                    ])
-                ),
-                row=2, col=2
-            )
-            fig.update_yaxes(title_text="Total Lines", row=2, col=2)
-
-        # Add tokens per second trace if we have data
-        if not df_benchmark.empty:
-            df_benchmark['timestamp'] = pd.to_datetime(df_benchmark['timestamp'])
-            df_benchmark = df_benchmark.sort_values('timestamp')
-
-            fig.add_trace(
-                go.Scatter(
-                    x=df_benchmark['timestamp'],
-                    y=df_benchmark['tokens_per_second'],
-                    mode='lines+markers',
-                    name='Tokens/Second',
-                    customdata=df_benchmark[['commit_hash', 'commit_url']].values,
-                    hovertemplate="<br>".join([
-                        "Tokens/s: %{y:.2f}",
-                        "Date: %{x}",
-                        "Commit: %{customdata[0]}",
-                        "<extra></extra>"
-                    ])
-                ),
-                row=3, col=2
-            )
-            fig.update_yaxes(title_text="Tokens per Second", row=3, col=2)
-
-        # Update layout
-        fig.update_layout(
-            height=800,
-            showlegend=False,
-            title_text="Package Metrics Dashboard",
-            title_x=0.5,
-            plot_bgcolor='white',
-            paper_bgcolor='white',
-            font=dict(size=12),
-            hovermode='x unified'
-        )
-
-        # Update the dashboard HTML with date range picker
-        dashboard_html = f"""
-        [dashboard HTML template markup unrecoverable; it contained a date-range picker header, a
-         Pipeline Status banner ({'✓ Pipeline Passing' if latest_pipeline_status == 'success' else '✗ Pipeline Failing'}),
-         and panels for Package Size, Line Count and Tokens per Second]
- - - - - - - - - """ - - # Write the dashboard - dashboard_path = output_dir / "dashboard.html" - with open(dashboard_path, "w") as f: - f.write(dashboard_html) - - # Generate summary with available metrics - latest_data = {} - - if not df_size.empty: - latest = df_size.iloc[-1] - previous = df_size.iloc[-2] if len(df_size) > 1 else latest - size_change = float(latest['total_size_mb'] - previous['total_size_mb']) - latest_data.update({ - 'timestamp': latest['timestamp'].isoformat(), - 'commit_hash': latest['commit_hash'], - 'commit_url': latest['commit_url'], - 'total_size_mb': float(latest['total_size_mb']), - 'size_change_mb': size_change, - 'packages': latest.get('packages', []) - }) - - if not df_lines.empty: - latest = df_lines.iloc[-1] - previous = df_lines.iloc[-2] if len(df_lines) > 1 else latest - linecount_change = int(latest['total_lines'] - previous['total_lines']) - if not latest_data: # Only add timestamp and commit info if not already added - latest_data.update({ - 'timestamp': latest['timestamp'].isoformat(), - 'commit_hash': latest['commit_hash'], - 'commit_url': latest['commit_url'], - }) - latest_data.update({ - 'total_lines': int(latest['total_lines']), - 'linecount_change': linecount_change - }) - - if not df_benchmark.empty: - latest = df_benchmark.iloc[-1] - previous = df_benchmark.iloc[-2] if len(df_benchmark) > 1 else latest - tokens_change = float(latest['tokens_per_second'] - previous['tokens_per_second']) - if not latest_data: # Only add timestamp and commit info if not already added - latest_data.update({ - 'timestamp': latest['timestamp'].isoformat(), - 'commit_hash': latest['commit_hash'], - 'commit_url': latest['commit_url'], - }) - latest_data.update({ - 'tokens_per_second': float(latest['tokens_per_second']), - 'tokens_change': tokens_change - }) - - if latest_data: - with open(output_dir / 'latest_data.json', 'w') as f: - json.dump(latest_data, f, indent=2) - - self._print_summary(latest_data) - self.logger.info(f"Report generated in {output_dir}") - return str(output_dir) - - return None - - def _print_summary(self, latest_data: Dict): - print("\n=== Package Size Summary ===") - print(f"Timestamp: {latest_data['timestamp']}") - print(f"Commit: {latest_data['commit_hash'][:7]}") - - if 'total_size_mb' in latest_data: - print(f"Total Size: {latest_data['total_size_mb']:.2f}MB") - change = latest_data['size_change_mb'] - change_symbol = "↓" if change <= 0 else "↑" - print(f"Change: {change_symbol} {abs(change):.2f}MB") - - if latest_data.get('packages'): - print("\nTop 5 Largest Packages:") - sorted_packages = sorted(latest_data['packages'], key=lambda x: x['size_mb'], reverse=True) - for pkg in sorted_packages[:5]: - print(f"- {pkg['name']}: {pkg['size_mb']:.2f}MB") - - if 'total_lines' in latest_data: - print("\nLine Count Stats:") - print(f"Total Lines: {latest_data['total_lines']:,}") - change = latest_data['linecount_change'] - change_symbol = "↓" if change <= 0 else "↑" - print(f"Change: {change_symbol} {abs(change):,}") - - if 'tokens_per_second' in latest_data: - print("\nBenchmark Stats:") - print(f"Tokens per Second: {latest_data['tokens_per_second']:.2f}") - if 'time_to_first_token' in latest_data: - print(f"Time to First Token: {latest_data['time_to_first_token']:.3f}s") - - print("\n") - - def _calculate_data_hash(self, data: List[Dict]) -> str: - """Calculate a hash of the data to detect changes""" - return hash(str(sorted([ - (d.get('commit_hash'), d.get('timestamp')) - for d in data - ]))) - - def _play_sound(self, sound_key: str): - 
"""Play a specific notification sound using pygame""" - try: - sound_path = self.sounds.get(sound_key) - if sound_path and sound_path.exists(): - sound = pygame.mixer.Sound(str(sound_path)) - sound.play() - # Wait for the sound to finish playing - pygame.time.wait(int(sound.get_length() * 1000)) - else: - self.logger.warning(f"Sound file not found: {sound_key} at {sound_path}") - except Exception as e: - self.logger.error(f"Failed to play sound {sound_key}: {e}") - - def _check_metrics_changes(self, current_data: List[Dict], previous_data: List[Dict]): - # Sort data by timestamp in descending order (most recent first) - def sort_by_timestamp(data): - return sorted( - data, - key=lambda x: x.get('timestamp', ''), - reverse=True # Most recent first - ) - - current_data = sort_by_timestamp(current_data) - previous_data = sort_by_timestamp(previous_data) - - # Helper to find latest entry with a specific metric - def find_latest_with_metric(data: List[Dict], metric: str) -> Optional[Dict]: - return next((d for d in data if metric in d), None) - - # Check line count changes - current_lines = find_latest_with_metric(current_data, 'total_lines') - previous_lines = find_latest_with_metric(previous_data, 'total_lines') - - if current_lines and previous_lines: - diff = current_lines['total_lines'] - previous_lines['total_lines'] - self.logger.debug(f"Lines of code diff: {diff}") - if diff > 0: - self.logger.info(f"Lines of code increased by {diff:,}") - self._play_sound('lines_up') - elif diff < 0: - self.logger.info(f"Lines of code decreased by {abs(diff):,}") - self._play_sound('lines_down') - else: - self.logger.debug("No lines of code data found") - - # Check tokens per second changes - current_tokens = find_latest_with_metric(current_data, 'tokens_per_second') - previous_tokens = find_latest_with_metric(previous_data, 'tokens_per_second') - - if current_tokens and previous_tokens: - diff = current_tokens['tokens_per_second'] - previous_tokens['tokens_per_second'] - self.logger.debug(f"Tokens per second diff: {diff}") - if diff > 0: - self.logger.info(f"Tokens per second increased by {diff:.2f}") - self._play_sound('tokens_up') - elif diff < 0: - self.logger.info(f"Tokens per second decreased by {abs(diff):.2f}") - self._play_sound('tokens_down') - else: - self.logger.debug("No tokens per second data found") - - # Check package size changes - current_size = find_latest_with_metric(current_data, 'total_size_mb') - previous_size = find_latest_with_metric(previous_data, 'total_size_mb') - - if current_size and previous_size: - diff = current_size['total_size_mb'] - previous_size['total_size_mb'] - self.logger.debug(f"Package size diff: {diff:.2f}MB") - if diff > 0: - self.logger.info(f"Package size increased by {diff:.2f}MB") - self._play_sound('size_up') - elif diff < 0: - self.logger.info(f"Package size decreased by {abs(diff):.2f}MB") - self._play_sound('size_down') - else: - self.logger.debug("No package size data found") - - async def run_dashboard(self, update_interval: int = 10): - """Run the dashboard with periodic updates""" - try: - update_interval = float(update_interval) - self.logger.debug(f"Update interval type: {type(update_interval)}, value: {update_interval}") - except ValueError as e: - self.logger.error(f"Failed to convert update_interval to float: {update_interval}") - raise - - self.logger.info(f"Starting real-time dashboard with {update_interval}s updates") - previous_data = None - - while True: - try: - start_time = time.time() - - # Collect new data - current_data = await 
self.collect_data() - if not current_data: - self.logger.warning("No data collected") - await asyncio.sleep(update_interval) - continue - - # Generate report - report_path = self.generate_report(current_data) - if report_path: - self.logger.info( - f"Dashboard updated at {datetime.now().strftime('%H:%M:%S')}" - ) - - print("Curr:", len(current_data)) - print("Prev:", len(previous_data) if previous_data else "None") - if previous_data: - # Check for metric changes and play appropriate sounds - self.logger.debug(f"Checking metrics changes between {len(current_data)} current and {len(previous_data)} previous data points") - self._check_metrics_changes(current_data, previous_data) - - # Update previous data - previous_data = current_data.copy() # Make a copy to prevent reference issues - - # Calculate sleep time - elapsed = float(time.time() - start_time) - sleep_time = max(0.0, update_interval - elapsed) - await asyncio.sleep(sleep_time) - - except Exception as e: - self.logger.error(f"Error in dashboard update loop: {e}", exc_info=True) - if self.debug: - raise - await asyncio.sleep(update_interval) - -async def main(): - token = os.getenv("CIRCLECI_TOKEN") - project_slug = os.getenv("CIRCLECI_PROJECT_SLUG") - debug = os.getenv("DEBUG", "").lower() in ("true", "1", "yes") - - try: - # Get update interval from environment or use default - update_interval = float(os.getenv("UPDATE_INTERVAL", "10")) - print(f"Update interval type: {type(update_interval)}, value: {update_interval}") # Debug print - except ValueError as e: - print(f"Error converting UPDATE_INTERVAL to float: {os.getenv('UPDATE_INTERVAL')}") - update_interval = 10.0 - - if not token or not project_slug: - print("Error: Please set CIRCLECI_TOKEN and CIRCLECI_PROJECT_SLUG environment variables") - return - - tracker = PackageSizeTracker(token, project_slug, debug) - - try: - await tracker.run_dashboard(update_interval) - except KeyboardInterrupt: - print("\nDashboard stopped by user") - except Exception as e: - logging.error(f"Error: {str(e)}", exc_info=True) - if debug: - raise +from collections import defaultdict +import os -if __name__ == "__main__": - asyncio.run(main()) +s3 = boto3.client('s3') +BUCKET_NAME = 'exo-benchmarks' + +def load_mock_data(): + current_dir = os.path.dirname(os.path.abspath(__file__)) + mock_data_path = os.path.join(current_dir, 'mock_data.json') + with open(mock_data_path, 'r') as f: + return json.load(f) + +def load_data_from_s3(): + # For testing, use mock data if environment variable is set + if os.getenv('USE_MOCK_DATA'): + return load_mock_data() + + config_data = defaultdict(list) + + paginator = s3.get_paginator('list_objects_v2') + for page in paginator.paginate(Bucket=BUCKET_NAME): + for obj in page.get('Contents', []): + key = obj['Key'] + config_name = key.split('/')[0] + response = s3.get_object(Bucket=BUCKET_NAME, Key=key) + data = json.loads(response['Body'].read().decode('utf-8')) + print(f"Processing object: {obj['Key']}: {data}") + config_data[config_name].append({ + 'timestamp': data.get('timestamp', obj['LastModified'].strftime('%Y-%m-%dT%H:%M:%S')), + 'prompt_tps': data.get('prompt_tps', 0), + 'generation_tps': data.get('generation_tps', 0), + 'commit': data.get('commit', ''), + 'run_id': data.get('run_id', '') + }) + + for config in config_data: + config_data[config].sort(key=lambda x: x['timestamp']) + + return config_data + +app = dash.Dash(__name__) + +app.layout = html.Div([ + html.H1('Benchmark Performance Dashboard'), + html.Div(id='graphs-container'), + dcc.Interval( + 
id='interval-component', + interval=300000, # Update every 5 minutes + n_intervals=0 + ) +]) + +@app.callback( + Output('graphs-container', 'children'), + Input('interval-component', 'n_intervals') +) +def update_graphs(n): + config_data = load_data_from_s3() + graphs = [] + + for config_name, data in config_data.items(): + timestamps = [d['timestamp'] for d in data] + prompt_tps = [d['prompt_tps'] for d in data] + generation_tps = [d['generation_tps'] for d in data] + commits = [d['commit'] for d in data] + run_ids = [d['run_id'] for d in data] + + fig = go.Figure() + + fig.add_trace(go.Scatter( + x=timestamps, + y=prompt_tps, + name='Prompt TPS', + mode='lines+markers', + hovertemplate='Commit: %{text}
TPS: %{y}', + text=commits, + customdata=run_ids + )) + + fig.add_trace(go.Scatter( + x=timestamps, + y=generation_tps, + name='Generation TPS', + mode='lines+markers', + hovertemplate='Commit: %{text}
TPS: %{y}', + text=commits, + customdata=run_ids + )) + + fig.update_layout( + title=f'Performance Metrics - {config_name}', + xaxis_title='Timestamp', + yaxis_title='Tokens per Second', + hovermode='x unified', + clickmode='event' + ) + + graphs.append(html.Div([ + dcc.Graph( + figure=fig, + id={'type': 'dynamic-graph', 'index': config_name}, + config={'displayModeBar': True} + ) + ])) + + return graphs + +@app.callback( + Output('_', 'children'), + Input({'type': 'dynamic-graph', 'index': dash.ALL}, 'clickData') +) +def handle_click(clickData): + if clickData and clickData['points'][0].get('customdata'): + run_id = clickData['points'][0]['customdata'] + url = f'https://github.com/exo-explore/exo/actions/runs/{run_id}' + import webbrowser + webbrowser.open_new_tab(url) + return dash.no_update + +if __name__ == '__main__': + app.run_server(debug=True) diff --git a/extra/dashboard/mock_data.json b/extra/dashboard/mock_data.json new file mode 100644 index 000000000..f6738eeda --- /dev/null +++ b/extra/dashboard/mock_data.json @@ -0,0 +1,54 @@ +{ + "config1": [ + { + "timestamp": "2024-03-01T10:00:00", + "prompt_tps": 150.5, + "generation_tps": 120.3, + "commit": "abc123", + "run_id": "12345678", + "configuration": { + "M4_Pro_16GB": 1, + "M4_Pro_24GB": 2, + "M4_32GB": 1 + } + }, + { + "timestamp": "2024-03-02T10:00:00", + "prompt_tps": 155.2, + "generation_tps": 125.1, + "commit": "def456", + "run_id": "23456789", + "configuration": { + "M4_Pro_16GB": 1, + "M4_Pro_24GB": 2, + "M4_32GB": 1 + } + } + ], + "config2": [ + { + "timestamp": "2024-03-01T10:00:00", + "prompt_tps": 140.8, + "generation_tps": 110.5, + "commit": "ghi789", + "run_id": "34567890", + "configuration": { + "M4_Pro_16GB": 1, + "M4_Pro_24GB": 2, + "M4_32GB": 1 + } + }, + { + "timestamp": "2024-03-02T10:00:00", + "prompt_tps": 145.6, + "generation_tps": 115.2, + "commit": "jkl012", + "run_id": "45678901", + "configuration": { + "M4_Pro_16GB": 1, + "M4_Pro_24GB": 2, + "M4_32GB": 1 + } + } + ] +} \ No newline at end of file diff --git a/extra/dashboard/requirements.txt b/extra/dashboard/requirements.txt index 7b978bc0e..5f1621133 100644 --- a/extra/dashboard/requirements.txt +++ b/extra/dashboard/requirements.txt @@ -1,5 +1,2 @@ -plotly -pandas -requests -aiohttp -pygame \ No newline at end of file +boto3==1.35.76 +dash==2.18.2 From 2f0b974dd5938de0ecbe84408336e8ceadfbfc14 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 18:25:42 +0000 Subject: [PATCH 27/42] test depot runner --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8ef7cd244..e5b0e97f2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -14,7 +14,7 @@ env: jobs: check_line_count: - runs-on: ubuntu-latest + runs-on: depot-ubuntu-22.04-4 steps: - uses: actions/checkout@v4 with: From 61deb32404cbfc9e08c181de867d9d5cd64c344a Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 18:43:08 +0000 Subject: [PATCH 28/42] t From b1a386af02c493167f2eab35f786d5d35d1395c3 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 18:48:14 +0000 Subject: [PATCH 29/42] add back mlx, use depot runners --- .github/workflows/build_and_test.yml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e5b0e97f2..dfa0e56b2 100644 --- 
a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -50,7 +50,7 @@ jobs: line-count-diff.json unit_test: - runs-on: macos-15 + runs-on: depot-macos-latest timeout-minutes: 20 steps: - uses: actions/checkout@v4 @@ -87,7 +87,7 @@ jobs: python3 ./test/test_model_helpers.py discovery_integration_test: - runs-on: ubuntu-latest + runs-on: depot-ubuntu-22.04-4 steps: - uses: actions/checkout@v4 @@ -128,16 +128,15 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'ubuntu-latest' || 'macos-15' }} + runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'depot-ubuntu-22.04-4' || 'depot-macos-latest' }} strategy: matrix: - # inference_engine: [mlx, tinygrad, dummy] - inference_engine: [tinygrad, dummy] + inference_engine: [mlx, tinygrad, dummy] include: - # - inference_engine: mlx - # model_id: llama-3.2-1b - # prompt: "Keep responses concise. Who was the king of pop?" - # expected_output: "Michael Jackson" + - inference_engine: mlx + model_id: llama-3.2-1b + prompt: "Keep responses concise. Who was the king of pop?" + expected_output: "Michael Jackson" - inference_engine: tinygrad model_id: llama-3.2-1b prompt: "Keep responses concise. Who was the king of pop?" @@ -273,7 +272,7 @@ jobs: fi measure_pip_sizes: - runs-on: macos-15 + runs-on: depot-macos-latest steps: - uses: actions/checkout@v4 From a44bf6fdc4f3375407df8d8ef05b6617b77d2ba7 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 18:51:08 +0000 Subject: [PATCH 30/42] t From 45b3582f131eea24bb6e40d85f2986b1cecec4ed Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 19:02:59 +0000 Subject: [PATCH 31/42] tiny tweaks --- .github/workflows/build_and_test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index dfa0e56b2..c78504c1e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -167,8 +167,11 @@ jobs: - name: Run ChatGPT API test env: TOKENIZERS_PARALLELISM: ${{ matrix.inference_engine == 'tinygrad' && 'true' || 'false' }} - SUPPORT_BF16: ${{ matrix.inference_engine == 'tinygrad' && '0' || '0' }} + SUPPORT_BF16: '0' CLANG: ${{ matrix.inference_engine == 'tinygrad' && '1' || '0' }} + METAL_DEBUG_ERROR_MODE: '0' + METAL_DEVICE_WRAPPER_TYPE: '1' + METAL_XCODE: '1' run: | source env/bin/activate From 8f259e7c1efde4bbccab8888f473bec105523064 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 20:03:56 +0000 Subject: [PATCH 32/42] own runner test --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c78504c1e..b03430c89 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -128,7 +128,7 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'depot-ubuntu-22.04-4' || 'depot-macos-latest' }} + runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'M4PRO_GPU16_24GB' || 'M4PRO_GPU16_24GB' }} strategy: matrix: inference_engine: [mlx, tinygrad, dummy] From 750bfb9d1025e0685f726b277ddcffa3725eca63 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 20:09:36 +0000 Subject: [PATCH 33/42] use depot runners --- 
.github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b03430c89..c78504c1e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -128,7 +128,7 @@ jobs: fi chatgpt_api_tests: - runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'M4PRO_GPU16_24GB' || 'M4PRO_GPU16_24GB' }} + runs-on: ${{ (matrix.inference_engine == 'tinygrad' || matrix.inference_engine == 'dummy') && 'depot-ubuntu-22.04-4' || 'depot-macos-latest' }} strategy: matrix: inference_engine: [mlx, tinygrad, dummy] From d953f6f538a958f4a32b397cc6430c15b3e419e7 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 21:17:06 +0000 Subject: [PATCH 34/42] add model to benchmark key --- extra/dashboard/dashboard.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/extra/dashboard/dashboard.py b/extra/dashboard/dashboard.py index df047c3f7..0d1f11faf 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -20,14 +20,17 @@ def load_data_from_s3(): # For testing, use mock data if environment variable is set if os.getenv('USE_MOCK_DATA'): return load_mock_data() - + config_data = defaultdict(list) - + paginator = s3.get_paginator('list_objects_v2') for page in paginator.paginate(Bucket=BUCKET_NAME): for obj in page.get('Contents', []): key = obj['Key'] - config_name = key.split('/')[0] + key_parts = key.split('/') + if len(key_parts) < 2: + continue + config_name = f"{key_parts[0]}/{key_parts[1]}" # Include both config and model response = s3.get_object(Bucket=BUCKET_NAME, Key=key) data = json.loads(response['Body'].read().decode('utf-8')) print(f"Processing object: {obj['Key']}: {data}") @@ -38,10 +41,10 @@ def load_data_from_s3(): 'commit': data.get('commit', ''), 'run_id': data.get('run_id', '') }) - + for config in config_data: config_data[config].sort(key=lambda x: x['timestamp']) - + return config_data app = dash.Dash(__name__) @@ -63,16 +66,16 @@ def load_data_from_s3(): def update_graphs(n): config_data = load_data_from_s3() graphs = [] - + for config_name, data in config_data.items(): timestamps = [d['timestamp'] for d in data] prompt_tps = [d['prompt_tps'] for d in data] generation_tps = [d['generation_tps'] for d in data] commits = [d['commit'] for d in data] run_ids = [d['run_id'] for d in data] - + fig = go.Figure() - + fig.add_trace(go.Scatter( x=timestamps, y=prompt_tps, @@ -82,7 +85,7 @@ def update_graphs(n): text=commits, customdata=run_ids )) - + fig.add_trace(go.Scatter( x=timestamps, y=generation_tps, @@ -92,7 +95,7 @@ def update_graphs(n): text=commits, customdata=run_ids )) - + fig.update_layout( title=f'Performance Metrics - {config_name}', xaxis_title='Timestamp', @@ -100,7 +103,7 @@ def update_graphs(n): hovermode='x unified', clickmode='event' ) - + graphs.append(html.Div([ dcc.Graph( figure=fig, @@ -108,7 +111,7 @@ def update_graphs(n): config={'displayModeBar': True} ) ])) - + return graphs @app.callback( From 54d3c823b94bc4fc1fe5d08948213ad4919b5bca Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Sun, 8 Dec 2024 21:53:40 +0000 Subject: [PATCH 35/42] dash sounds --- .../{sounds => assets}/gta5_wasted.mp3 | 0 .../{sounds => assets}/pokemon_evolve.mp3 | 0 extra/dashboard/dashboard.py | 153 ++++++++++++++++-- extra/dashboard/requirements.txt | 2 + 4 files changed, 139 insertions(+), 16 deletions(-) rename 
extra/dashboard/{sounds => assets}/gta5_wasted.mp3 (100%) rename extra/dashboard/{sounds => assets}/pokemon_evolve.mp3 (100%) diff --git a/extra/dashboard/sounds/gta5_wasted.mp3 b/extra/dashboard/assets/gta5_wasted.mp3 similarity index 100% rename from extra/dashboard/sounds/gta5_wasted.mp3 rename to extra/dashboard/assets/gta5_wasted.mp3 diff --git a/extra/dashboard/sounds/pokemon_evolve.mp3 b/extra/dashboard/assets/pokemon_evolve.mp3 similarity index 100% rename from extra/dashboard/sounds/pokemon_evolve.mp3 rename to extra/dashboard/assets/pokemon_evolve.mp3 diff --git a/extra/dashboard/dashboard.py b/extra/dashboard/dashboard.py index 0d1f11faf..009a9efc2 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -1,11 +1,15 @@ import dash -from dash import html, dcc +from dash import html, dcc, ctx import plotly.graph_objs as go -from dash.dependencies import Input, Output +from dash.dependencies import Input, Output, State import boto3 import json from collections import defaultdict import os +import base64 +import numpy as np +from plotly.subplots import make_subplots +import plotly.express as px s3 = boto3.client('s3') BUCKET_NAME = 'exo-benchmarks' @@ -51,21 +55,55 @@ def load_data_from_s3(): app.layout = html.Div([ html.H1('Benchmark Performance Dashboard'), + html.Button('Test Sound', id='test-sound-button', n_clicks=0), html.Div(id='graphs-container'), + html.Audio(id='success-sound', src='assets/pokemon_evolve.mp3', preload="auto", style={'display': 'none'}), + html.Audio(id='failure-sound', src='assets/gta5_wasted.mp3', preload="auto", style={'display': 'none'}), + html.Audio(id='startup-sound', src='assets/pokemon_evolve.mp3', preload="auto", style={'display': 'none'}), + html.Div(id='audio-trigger', style={'display': 'none'}), + dcc.Store(id='previous-data', storage_type='memory'), dcc.Interval( id='interval-component', - interval=300000, # Update every 5 minutes + interval=10000, # Update every 10 seconds n_intervals=0 ) ]) @app.callback( - Output('graphs-container', 'children'), - Input('interval-component', 'n_intervals') + [Output('graphs-container', 'children'), + Output('previous-data', 'data'), + Output('audio-trigger', 'children')], + [Input('interval-component', 'n_intervals')], + [State('previous-data', 'data')] ) -def update_graphs(n): +def update_graphs(n, previous_data): config_data = load_data_from_s3() graphs = [] + trigger_sound = None + + if previous_data: + for config_name, data in config_data.items(): + if config_name in previous_data and data and previous_data[config_name]: + current_prompt_tps = data[-1]['prompt_tps'] + previous_prompt_tps = previous_data[config_name][-1]['prompt_tps'] + + # Add clear logging for TPS changes + if current_prompt_tps != previous_prompt_tps: + print("\n" + "="*50) + print(f"Config: {config_name}") + print(f"Previous TPS: {previous_prompt_tps}") + print(f"Current TPS: {current_prompt_tps}") + print(f"Change: {current_prompt_tps - previous_prompt_tps}") + + if current_prompt_tps > previous_prompt_tps: + print("🔼 TPS INCREASED - Should play success sound") + trigger_sound = 'success' + elif current_prompt_tps < previous_prompt_tps: + print("🔽 TPS DECREASED - Should play failure sound") + trigger_sound = 'failure' + + if current_prompt_tps != previous_prompt_tps: + print("="*50 + "\n") for config_name, data in config_data.items(): timestamps = [d['timestamp'] for d in data] @@ -74,8 +112,12 @@ def update_graphs(n): commits = [d['commit'] for d in data] run_ids = [d['run_id'] for d in data] - fig = 
go.Figure() + # Create subplot with 2 columns + fig = make_subplots(rows=1, cols=2, + subplot_titles=('Performance Over Time', 'Generation TPS Distribution'), + column_widths=[0.7, 0.3]) + # Time series plot (left) fig.add_trace(go.Scatter( x=timestamps, y=prompt_tps, @@ -84,7 +126,7 @@ def update_graphs(n): hovertemplate='Commit: %{text}
TPS: %{y}', text=commits, customdata=run_ids - )) + ), row=1, col=1) fig.add_trace(go.Scatter( x=timestamps, @@ -94,16 +136,55 @@ def update_graphs(n): hovertemplate='Commit: %{text}
TPS: %{y}', text=commits, customdata=run_ids - )) + ), row=1, col=1) + + # Calculate statistics + gen_tps_array = np.array(generation_tps) + stats = { + 'Mean': np.mean(gen_tps_array), + 'Std Dev': np.std(gen_tps_array), + 'Min': np.min(gen_tps_array), + 'Max': np.max(gen_tps_array) + } + + # Histogram plot (right) + fig.add_trace(go.Histogram( + x=generation_tps, + name='Generation TPS Distribution', + nbinsx=10, + showlegend=False + ), row=1, col=2) + + # Add statistics as annotations + stats_text = '
'.join([f'{k}: {v:.2f}' for k, v in stats.items()]) + fig.add_annotation( + x=0.98, + y=0.98, + xref='paper', + yref='paper', + text=stats_text, + showarrow=False, + font=dict(size=12), + align='left', + bgcolor='rgba(255, 255, 255, 0.8)', + bordercolor='black', + borderwidth=1 + ) fig.update_layout( title=f'Performance Metrics - {config_name}', - xaxis_title='Timestamp', - yaxis_title='Tokens per Second', + height=500, + showlegend=True, hovermode='x unified', clickmode='event' ) + # Update x and y axis labels + fig.update_xaxes(title_text='Timestamp', row=1, col=1) + fig.update_xaxes(title_text='Generation TPS', row=1, col=2) + fig.update_yaxes(title_text='Tokens per Second', row=1, col=1) + fig.update_yaxes(title_text='Count', row=1, col=2) + graphs.append(html.Div([ dcc.Graph( figure=fig, @@ -112,19 +193,59 @@ def update_graphs(n): ) ])) - return graphs + return graphs, config_data, trigger_sound @app.callback( - Output('_', 'children'), - Input({'type': 'dynamic-graph', 'index': dash.ALL}, 'clickData') + Output('graphs-container', 'children', allow_duplicate=True), + Input({'type': 'dynamic-graph', 'index': dash.ALL}, 'clickData'), + prevent_initial_call=True ) def handle_click(clickData): - if clickData and clickData['points'][0].get('customdata'): - run_id = clickData['points'][0]['customdata'] + if clickData and clickData[0] and clickData[0]['points'][0].get('customdata'): + run_id = clickData[0]['points'][0]['customdata'] url = f'https://github.com/exo-explore/exo/actions/runs/{run_id}' import webbrowser webbrowser.open_new_tab(url) return dash.no_update +app.clientside_callback( + """ + function(trigger, test_clicks) { + if (!trigger && !test_clicks) return window.dash_clientside.no_update; + + if (test_clicks > 0 && dash_clientside.callback_context.triggered[0].prop_id.includes('test-sound-button')) { + console.log('Test button clicked'); + const audio = document.getElementById('startup-sound'); + if (audio) { + audio.currentTime = 0; + audio.play().catch(e => console.log('Error playing audio:', e)); + } + } else if (trigger) { + console.log('Audio trigger received:', trigger); + if (trigger === 'success') { + console.log('Playing success sound'); + const audio = document.getElementById('success-sound'); + if (audio) { + audio.currentTime = 0; + audio.play().catch(e => console.log('Error playing success sound:', e)); + } + } else if (trigger === 'failure') { + console.log('Playing failure sound'); + const audio = document.getElementById('failure-sound'); + if (audio) { + audio.currentTime = 0; + audio.play().catch(e => console.log('Error playing failure sound:', e)); + } + } + } + return window.dash_clientside.no_update; + } + """, + Output('audio-trigger', 'children', allow_duplicate=True), + [Input('audio-trigger', 'children'), + Input('test-sound-button', 'n_clicks')], + prevent_initial_call=True +) + if __name__ == '__main__': app.run_server(debug=True) diff --git a/extra/dashboard/requirements.txt b/extra/dashboard/requirements.txt index 5f1621133..5a49ed1a8 100644 --- a/extra/dashboard/requirements.txt +++ b/extra/dashboard/requirements.txt @@ -1,2 +1,4 @@ boto3==1.35.76 dash==2.18.2 +numpy +pandas From 16651a350639ebebb0c9accd950c938d72afea6b Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Wed, 11 Dec 2024 11:04:25 +0000 Subject: [PATCH 36/42] dashboard tweaks --- extra/dashboard/dashboard.py | 40 +++++++++++++++--------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/extra/dashboard/dashboard.py b/extra/dashboard/dashboard.py index 
009a9efc2..56bdda785 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -84,30 +84,29 @@ def update_graphs(n, previous_data): if previous_data: for config_name, data in config_data.items(): if config_name in previous_data and data and previous_data[config_name]: - current_prompt_tps = data[-1]['prompt_tps'] - previous_prompt_tps = previous_data[config_name][-1]['prompt_tps'] + current_generation_tps = data[-1]['generation_tps'] + previous_generation_tps = previous_data[config_name][-1]['generation_tps'] # Add clear logging for TPS changes - if current_prompt_tps != previous_prompt_tps: + if current_generation_tps != previous_generation_tps: print("\n" + "="*50) print(f"Config: {config_name}") - print(f"Previous TPS: {previous_prompt_tps}") - print(f"Current TPS: {current_prompt_tps}") - print(f"Change: {current_prompt_tps - previous_prompt_tps}") + print(f"Previous Generation TPS: {previous_generation_tps}") + print(f"Current Generation TPS: {current_generation_tps}") + print(f"Change: {current_generation_tps - previous_generation_tps}") - if current_prompt_tps > previous_prompt_tps: - print("🔼 TPS INCREASED - Should play success sound") + if current_generation_tps > previous_generation_tps: + print("🔼 Generation TPS INCREASED - Should play success sound") trigger_sound = 'success' - elif current_prompt_tps < previous_prompt_tps: - print("🔽 TPS DECREASED - Should play failure sound") + elif current_generation_tps < previous_generation_tps: + print("🔽 Generation TPS DECREASED - Should play failure sound") trigger_sound = 'failure' - if current_prompt_tps != previous_prompt_tps: + if current_generation_tps != previous_generation_tps: print("="*50 + "\n") for config_name, data in config_data.items(): timestamps = [d['timestamp'] for d in data] - prompt_tps = [d['prompt_tps'] for d in data] generation_tps = [d['generation_tps'] for d in data] commits = [d['commit'] for d in data] run_ids = [d['run_id'] for d in data] @@ -118,16 +117,6 @@ def update_graphs(n, previous_data): column_widths=[0.7, 0.3]) # Time series plot (left) - fig.add_trace(go.Scatter( - x=timestamps, - y=prompt_tps, - name='Prompt TPS', - mode='lines+markers', - hovertemplate='Commit: %{text}
TPS: %{y}', - text=commits, - customdata=run_ids - ), row=1, col=1) - fig.add_trace(go.Scatter( x=timestamps, y=generation_tps, @@ -135,7 +124,9 @@ def update_graphs(n, previous_data): mode='lines+markers', hovertemplate='Commit: %{text}
TPS: %{y}', text=commits, - customdata=run_ids + customdata=run_ids, + line=dict(color='#2196F3', width=2), + marker=dict(color='#2196F3') ), row=1, col=1) # Calculate statistics @@ -152,7 +143,8 @@ def update_graphs(n, previous_data): x=generation_tps, name='Generation TPS Distribution', nbinsx=10, - showlegend=False + showlegend=False, + marker=dict(color='#2196F3') ), row=1, col=2) # Add statistics as annotations From 2dbb5e177e9db9e64804927a6111438d2072a0b8 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Wed, 11 Dec 2024 15:37:10 +0000 Subject: [PATCH 37/42] more robust configure_mlx.sh --- configure_mlx.sh | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/configure_mlx.sh b/configure_mlx.sh index 8a5b67378..f3469c820 100755 --- a/configure_mlx.sh +++ b/configure_mlx.sh @@ -3,10 +3,27 @@ # Get the total memory in MB TOTAL_MEM_MB=$(($(sysctl -n hw.memsize) / 1024 / 1024)) -# Set WIRED_LIMIT_MB to 80% -WIRED_LIMIT_MB=$(($TOTAL_MEM_MB * 80 / 100)) -# Set WIRED_LWM_MB to 70% -WIRED_LWM_MB=$(($TOTAL_MEM_MB * 70 / 100)) +# Calculate 80% and TOTAL_MEM_GB-5GB in MB +EIGHTY_PERCENT=$(($TOTAL_MEM_MB * 80 / 100)) +MINUS_5GB=$((($TOTAL_MEM_MB - 5120))) + +# Calculate 70% and TOTAL_MEM_GB-8GB in MB +SEVENTY_PERCENT=$(($TOTAL_MEM_MB * 70 / 100)) +MINUS_8GB=$((($TOTAL_MEM_MB - 8192))) + +# Set WIRED_LIMIT_MB to higher value +if [ $EIGHTY_PERCENT -gt $MINUS_5GB ]; then + WIRED_LIMIT_MB=$EIGHTY_PERCENT +else + WIRED_LIMIT_MB=$MINUS_5GB +fi + +# Set WIRED_LWM_MB to higher value +if [ $SEVENTY_PERCENT -gt $MINUS_8GB ]; then + WIRED_LWM_MB=$SEVENTY_PERCENT +else + WIRED_LWM_MB=$MINUS_8GB +fi # Display the calculated values echo "Total memory: $TOTAL_MEM_MB MB" From f12487b81a6b0afed9a4ec37df2d7f43e4b8e6d4 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Thu, 12 Dec 2024 14:32:52 +0000 Subject: [PATCH 38/42] add --generate option to upload best.json to s3 with best benchmark results --- extra/dashboard/dashboard.py | 54 ++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/extra/dashboard/dashboard.py b/extra/dashboard/dashboard.py index 56bdda785..c460c5955 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -43,7 +43,14 @@ def load_data_from_s3(): 'prompt_tps': data.get('prompt_tps', 0), 'generation_tps': data.get('generation_tps', 0), 'commit': data.get('commit', ''), - 'run_id': data.get('run_id', '') + 'run_id': data.get('run_id', ''), + 'model': data.get('model', ''), + 'branch': data.get('branch', ''), + 'configuration': data.get('configuration', {}), + 'prompt_len': data.get('prompt_len', 0), + 'ttft': data.get('ttft', 0), + 'response_len': data.get('response_len', 0), + 'total_time': data.get('total_time', 0) }) for config in config_data: @@ -51,6 +58,31 @@ def load_data_from_s3(): return config_data +def get_best_benchmarks(): + config_data = load_data_from_s3() + best_results = {} + + for config_name, data in config_data.items(): + if not data: + continue + + # Split config_name into config and model + config, model = config_name.split('/') + + # Find the entry with the highest generation_tps + best_result = max(data, key=lambda x: x['generation_tps']) + + # Create result dictionary with all original data plus config/model info + result = dict(best_result) # Make a copy of all data from the best run + result.update({ + 'config': config, + 'model': model, + }) + + best_results[config_name] = result + + return best_results + app = dash.Dash(__name__) app.layout = html.Div([ @@ 
-240,4 +272,22 @@ def handle_click(clickData): ) if __name__ == '__main__': - app.run_server(debug=True) + import sys + if '--generate' in sys.argv: + best_benchmarks = get_best_benchmarks() + print(json.dumps(best_benchmarks, indent=2)) + + # Upload best benchmarks to S3 + try: + s3.put_object( + Bucket=BUCKET_NAME, + Key='best.json', + Body=json.dumps(best_benchmarks, indent=2), + ContentType='application/json' + ) + print("Successfully uploaded best.json to S3") + print(f"Public URL: https://{BUCKET_NAME}.s3.amazonaws.com/best.json") + except Exception as e: + print(f"Error uploading to S3: {e}") + else: + app.run_server(debug=True) From 6016e1185fbe1cea0bbf1106499e1d575ba57ce0 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Thu, 12 Dec 2024 18:50:34 +0000 Subject: [PATCH 39/42] 100x faster dashboard --- extra/dashboard/dashboard.py | 134 +++++++++++++++++++------------ extra/dashboard/requirements.txt | 3 +- 2 files changed, 83 insertions(+), 54 deletions(-) diff --git a/extra/dashboard/dashboard.py b/extra/dashboard/dashboard.py index c460c5955..fdab4980d 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -2,7 +2,9 @@ from dash import html, dcc, ctx import plotly.graph_objs as go from dash.dependencies import Input, Output, State -import boto3 +import aioboto3 +import asyncio +from aiohttp import ClientSession import json from collections import defaultdict import os @@ -11,7 +13,9 @@ from plotly.subplots import make_subplots import plotly.express as px -s3 = boto3.client('s3') +# Replace boto3 client with aioboto3 session +session = aioboto3.Session() + BUCKET_NAME = 'exo-benchmarks' def load_mock_data(): @@ -20,46 +24,67 @@ def load_mock_data(): with open(mock_data_path, 'r') as f: return json.load(f) -def load_data_from_s3(): +async def load_data_from_s3(): # For testing, use mock data if environment variable is set if os.getenv('USE_MOCK_DATA'): return load_mock_data() config_data = defaultdict(list) - paginator = s3.get_paginator('list_objects_v2') - for page in paginator.paginate(Bucket=BUCKET_NAME): - for obj in page.get('Contents', []): - key = obj['Key'] - key_parts = key.split('/') - if len(key_parts) < 2: - continue - config_name = f"{key_parts[0]}/{key_parts[1]}" # Include both config and model - response = s3.get_object(Bucket=BUCKET_NAME, Key=key) - data = json.loads(response['Body'].read().decode('utf-8')) - print(f"Processing object: {obj['Key']}: {data}") - config_data[config_name].append({ - 'timestamp': data.get('timestamp', obj['LastModified'].strftime('%Y-%m-%dT%H:%M:%S')), - 'prompt_tps': data.get('prompt_tps', 0), - 'generation_tps': data.get('generation_tps', 0), - 'commit': data.get('commit', ''), - 'run_id': data.get('run_id', ''), - 'model': data.get('model', ''), - 'branch': data.get('branch', ''), - 'configuration': data.get('configuration', {}), - 'prompt_len': data.get('prompt_len', 0), - 'ttft': data.get('ttft', 0), - 'response_len': data.get('response_len', 0), - 'total_time': data.get('total_time', 0) - }) - - for config in config_data: - config_data[config].sort(key=lambda x: x['timestamp']) - - return config_data - -def get_best_benchmarks(): - config_data = load_data_from_s3() + async with session.client('s3') as s3: + paginator = s3.get_paginator('list_objects_v2') + objects_to_fetch = [] + + # First, get all object keys + async for page in paginator.paginate(Bucket=BUCKET_NAME): + for obj in page.get('Contents', []): + key = obj['Key'] + key_parts = key.split('/') + if len(key_parts) < 2: + continue + 
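+                # Benchmark objects are grouped by their first two key segments, i.e. "<config>/<model>".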
objects_to_fetch.append((key, obj['LastModified'], f"{key_parts[0]}/{key_parts[1]}")) + + # Then fetch all objects in parallel + async def fetch_object(key, last_modified, config_name): + response = await s3.get_object(Bucket=BUCKET_NAME, Key=key) + body = await response['Body'].read() + data = json.loads(body.decode('utf-8')) + print(f"Processing object: {key}: {data}") + return { + 'config_name': config_name, + 'data': { + 'timestamp': data.get('timestamp', last_modified.strftime('%Y-%m-%dT%H:%M:%S')), + 'prompt_tps': data.get('prompt_tps', 0), + 'generation_tps': data.get('generation_tps', 0), + 'commit': data.get('commit', ''), + 'run_id': data.get('run_id', ''), + 'model': data.get('model', ''), + 'branch': data.get('branch', ''), + 'configuration': data.get('configuration', {}), + 'prompt_len': data.get('prompt_len', 0), + 'ttft': data.get('ttft', 0), + 'response_len': data.get('response_len', 0), + 'total_time': data.get('total_time', 0) + } + } + + # Create tasks for all objects + tasks = [fetch_object(key, last_modified, config_name) + for key, last_modified, config_name in objects_to_fetch] + results = await asyncio.gather(*tasks) + + # Organize results into config_data + for result in results: + config_data[result['config_name']].append(result['data']) + + # Sort data by timestamp for each config + for config in config_data: + config_data[config].sort(key=lambda x: x['timestamp']) + + return config_data + +async def get_best_benchmarks(): + config_data = await load_data_from_s3() best_results = {} for config_name, data in config_data.items(): @@ -96,7 +121,7 @@ def get_best_benchmarks(): dcc.Store(id='previous-data', storage_type='memory'), dcc.Interval( id='interval-component', - interval=10000, # Update every 10 seconds + interval=15000, # Update every 15 seconds n_intervals=0 ) ]) @@ -109,7 +134,8 @@ def get_best_benchmarks(): [State('previous-data', 'data')] ) def update_graphs(n, previous_data): - config_data = load_data_from_s3() + # Run async operations synchronously + config_data = asyncio.run(load_data_from_s3()) graphs = [] trigger_sound = None @@ -225,6 +251,7 @@ def update_graphs(n, previous_data): prevent_initial_call=True ) def handle_click(clickData): + # If you add any async operations here, wrap them with asyncio.run() if clickData and clickData[0] and clickData[0]['points'][0].get('customdata'): run_id = clickData[0]['points'][0]['customdata'] url = f'https://github.com/exo-explore/exo/actions/runs/{run_id}' @@ -274,20 +301,21 @@ def handle_click(clickData): if __name__ == '__main__': import sys if '--generate' in sys.argv: - best_benchmarks = get_best_benchmarks() - print(json.dumps(best_benchmarks, indent=2)) - - # Upload best benchmarks to S3 - try: - s3.put_object( - Bucket=BUCKET_NAME, - Key='best.json', - Body=json.dumps(best_benchmarks, indent=2), - ContentType='application/json' - ) - print("Successfully uploaded best.json to S3") - print(f"Public URL: https://{BUCKET_NAME}.s3.amazonaws.com/best.json") - except Exception as e: - print(f"Error uploading to S3: {e}") + async def generate_best(): + async with session.client('s3') as s3: + best_benchmarks = await get_best_benchmarks() + try: + await s3.put_object( + Bucket=BUCKET_NAME, + Key='best.json', + Body=json.dumps(best_benchmarks, indent=2), + ContentType='application/json' + ) + print("Successfully uploaded best.json to S3") + print(f"Public URL: https://{BUCKET_NAME}.s3.amazonaws.com/best.json") + except Exception as e: + print(f"Error uploading to S3: {e}") + + asyncio.run(generate_best()) else: 
app.run_server(debug=True) diff --git a/extra/dashboard/requirements.txt b/extra/dashboard/requirements.txt index 5a49ed1a8..ab2d3a551 100644 --- a/extra/dashboard/requirements.txt +++ b/extra/dashboard/requirements.txt @@ -1,4 +1,5 @@ -boto3==1.35.76 +aioboto3==13.2.0 dash==2.18.2 numpy pandas +aiohttp \ No newline at end of file From 0fa8f1f5bb4ac19621dd254e957c326fa4790dc1 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 13 Dec 2024 22:27:11 +0000 Subject: [PATCH 40/42] discord notifications on new benchmark runs --- extra/dashboard/dashboard.py | 115 ++++++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 15 deletions(-) diff --git a/extra/dashboard/dashboard.py b/extra/dashboard/dashboard.py index fdab4980d..d618ce4e3 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -12,11 +12,15 @@ import numpy as np from plotly.subplots import make_subplots import plotly.express as px +import aiohttp +from datetime import datetime # Replace boto3 client with aioboto3 session session = aioboto3.Session() BUCKET_NAME = 'exo-benchmarks' +DISCORD_WEBHOOK_URL = os.getenv('DISCORD_WEBHOOK_URL') +CURSOR_KEY = 'last_processed_timestamp.txt' def load_mock_data(): current_dir = os.path.dirname(os.path.abspath(__file__)) @@ -108,6 +112,102 @@ async def get_best_benchmarks(): return best_results +async def send_discord_notification(benchmark_data): + if not DISCORD_WEBHOOK_URL: + print("Discord webhook URL not configured, skipping notification") + return + + # Create a formatted message + config_name = f"{benchmark_data['config']}/{benchmark_data['model']}" + + # Create a simple JSON string of the topology + topology = benchmark_data.get('configuration', {}) + topology_str = "```json\n" + json.dumps(topology, indent=2) + "\n```" + + message = ( + f"🚀 New Benchmark Result for **{config_name}**\n\n" + f"📊 Performance Metrics:\n" + f"• Generation TPS: **{benchmark_data['generation_tps']:.2f}**\n" + f"• Prompt TPS: **{benchmark_data['prompt_tps']:.2f}**\n" + f"• TTFT: **{benchmark_data['ttft'] * 1000:.2f}ms**\n" + f"• Prompt Length: {benchmark_data['prompt_len']}\n" + f"• Response Length: {benchmark_data['response_len']}\n\n" + f"🔍 Run Details:\n" + f"• Commit: {benchmark_data['commit'][:7]}\n" + f"• Branch: {benchmark_data['branch']}\n" + f"• Run ID: [{benchmark_data['run_id']}](https://github.com/exo-explore/exo/actions/runs/{benchmark_data['run_id']})\n\n" + f"{topology_str}" + ) + + async with aiohttp.ClientSession() as session: + await session.post(DISCORD_WEBHOOK_URL, json={'content': message}) + +async def get_cursor(): + try: + async with session.client('s3') as s3: + response = await s3.get_object(Bucket=BUCKET_NAME, Key=CURSOR_KEY) + body = await response['Body'].read() + return body.decode('utf-8').strip() + except: + return "1970-01-01T00:00:00" # Default to epoch if no cursor exists + +async def update_cursor(timestamp): + async with session.client('s3') as s3: + await s3.put_object( + Bucket=BUCKET_NAME, + Key=CURSOR_KEY, + Body=timestamp.encode('utf-8') + ) + +async def generate_best(): + # Get the last processed timestamp + last_processed = await get_cursor() + print(f"Last processed timestamp: {last_processed}") + + async with session.client('s3') as s3: + # Load all benchmark data + config_data = await load_data_from_s3() + best_benchmarks = await get_best_benchmarks() + + # Check for new benchmarks in all data + new_latest = last_processed + for config_name, data_list in config_data.items(): + for benchmark in data_list: + timestamp = 
benchmark['timestamp'] + + # If this benchmark is newer than our last processed timestamp + if timestamp > last_processed: + print(f"Found new benchmark for {config_name} at {timestamp}") + # Add config and model info to the benchmark data + config, model = config_name.split('/') + benchmark_with_info = dict(benchmark) + benchmark_with_info.update({ + 'config': config, + 'model': model, + }) + await send_discord_notification(benchmark_with_info) + + # Update the latest timestamp if this is the newest we've seen + if timestamp > new_latest: + new_latest = timestamp + + # Update the cursor if we found any new benchmarks + if new_latest > last_processed: + await update_cursor(new_latest) + + # Upload the best benchmarks as before + try: + await s3.put_object( + Bucket=BUCKET_NAME, + Key='best.json', + Body=json.dumps(best_benchmarks, indent=2), + ContentType='application/json' + ) + print("Successfully uploaded best.json to S3") + print(f"Public URL: https://{BUCKET_NAME}.s3.amazonaws.com/best.json") + except Exception as e: + print(f"Error uploading to S3: {e}") + app = dash.Dash(__name__) app.layout = html.Div([ @@ -301,21 +401,6 @@ def handle_click(clickData): if __name__ == '__main__': import sys if '--generate' in sys.argv: - async def generate_best(): - async with session.client('s3') as s3: - best_benchmarks = await get_best_benchmarks() - try: - await s3.put_object( - Bucket=BUCKET_NAME, - Key='best.json', - Body=json.dumps(best_benchmarks, indent=2), - ContentType='application/json' - ) - print("Successfully uploaded best.json to S3") - print(f"Public URL: https://{BUCKET_NAME}.s3.amazonaws.com/best.json") - except Exception as e: - print(f"Error uploading to S3: {e}") - asyncio.run(generate_best()) else: app.run_server(debug=True) From 149849f94ece22d9bcf7ebfa85c5a73483f14eef Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Fri, 13 Dec 2024 22:36:55 +0000 Subject: [PATCH 41/42] format metric changes nicely in discord --- extra/dashboard/dashboard.py | 68 +++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/extra/dashboard/dashboard.py b/extra/dashboard/dashboard.py index d618ce4e3..dbcaa85bf 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -112,7 +112,39 @@ async def get_best_benchmarks(): return best_results -async def send_discord_notification(benchmark_data): +async def get_previous_benchmark(config_data, config_name, current_timestamp): + """Get the previous benchmark for a given configuration.""" + benchmarks = config_data.get(config_name, []) + # Sort by timestamp and find the most recent benchmark before current_timestamp + previous = None + for b in sorted(benchmarks, key=lambda x: x['timestamp']): + if b['timestamp'] < current_timestamp: + previous = b + else: + break + return previous + +async def format_metric_comparison(current, previous, metric, format_str=".2f", lower_is_better=False): + """Format a metric with trend indicator.""" + current_val = current.get(metric, 0) + if not previous: + return f"**{current_val:{format_str}}**" + + prev_val = previous.get(metric, 0) + diff = current_val - prev_val + + # Invert the comparison logic if lower values are better + if lower_is_better: + diff = -diff # This makes negative diffs good and positive diffs bad + + if diff > 0: + return f"**{current_val:{format_str}}** 🟢↑ ({'-' if lower_is_better else '+'}{abs(current_val - prev_val):{format_str}})" + elif diff < 0: + return f"**{current_val:{format_str}}** 🔴↓ ({'+' if lower_is_better else 
'-'}{abs(current_val - prev_val):{format_str}})" + else: + return f"**{current_val:{format_str}}** ⚪" + +async def send_discord_notification(benchmark_data, config_data): if not DISCORD_WEBHOOK_URL: print("Discord webhook URL not configured, skipping notification") return @@ -120,6 +152,25 @@ async def send_discord_notification(benchmark_data): # Create a formatted message config_name = f"{benchmark_data['config']}/{benchmark_data['model']}" + # Use the passed config_data instead of fetching again + previous_benchmark = await get_previous_benchmark( + config_data, + f"{benchmark_data['config']}/{benchmark_data['model']}", + benchmark_data['timestamp'] + ) + + # Format metrics with comparisons + gen_tps = await format_metric_comparison(benchmark_data, previous_benchmark, 'generation_tps') + prompt_tps = await format_metric_comparison(benchmark_data, previous_benchmark, 'prompt_tps') + ttft = await format_metric_comparison( + {'ttft': benchmark_data['ttft'] * 1000}, + {'ttft': previous_benchmark['ttft'] * 1000} if previous_benchmark else None, + 'ttft', + lower_is_better=True + ) + prompt_len = await format_metric_comparison(benchmark_data, previous_benchmark, 'prompt_len', "d") + response_len = await format_metric_comparison(benchmark_data, previous_benchmark, 'response_len', "d") + # Create a simple JSON string of the topology topology = benchmark_data.get('configuration', {}) topology_str = "```json\n" + json.dumps(topology, indent=2) + "\n```" @@ -127,11 +178,11 @@ async def send_discord_notification(benchmark_data): message = ( f"🚀 New Benchmark Result for **{config_name}**\n\n" f"📊 Performance Metrics:\n" - f"• Generation TPS: **{benchmark_data['generation_tps']:.2f}**\n" - f"• Prompt TPS: **{benchmark_data['prompt_tps']:.2f}**\n" - f"• TTFT: **{benchmark_data['ttft'] * 1000:.2f}ms**\n" - f"• Prompt Length: {benchmark_data['prompt_len']}\n" - f"• Response Length: {benchmark_data['response_len']}\n\n" + f"• Generation TPS: {gen_tps}\n" + f"• Prompt TPS: {prompt_tps}\n" + f"• TTFT: {ttft}ms\n" + f"• Prompt Length: {prompt_len}\n" + f"• Response Length: {response_len}\n\n" f"🔍 Run Details:\n" f"• Commit: {benchmark_data['commit'][:7]}\n" f"• Branch: {benchmark_data['branch']}\n" @@ -165,7 +216,7 @@ async def generate_best(): print(f"Last processed timestamp: {last_processed}") async with session.client('s3') as s3: - # Load all benchmark data + # Load all benchmark data once config_data = await load_data_from_s3() best_benchmarks = await get_best_benchmarks() @@ -185,7 +236,8 @@ async def generate_best(): 'config': config, 'model': model, }) - await send_discord_notification(benchmark_with_info) + # Pass the already loaded config_data to avoid refetching + await send_discord_notification(benchmark_with_info, config_data) # Update the latest timestamp if this is the newest we've seen if timestamp > new_latest: From 4d9d4ad05ad4fe03c530114b6f0504879357fbf4 Mon Sep 17 00:00:00 2001 From: Gary Date: Sun, 15 Dec 2024 14:54:06 +0000 Subject: [PATCH 42/42] separate line by branch name --- extra/dashboard/dashboard.py | 72 ++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/extra/dashboard/dashboard.py b/extra/dashboard/dashboard.py index dbcaa85bf..808f35d87 100644 --- a/extra/dashboard/dashboard.py +++ b/extra/dashboard/dashboard.py @@ -321,42 +321,60 @@ def update_graphs(n, previous_data): commits = [d['commit'] for d in data] run_ids = [d['run_id'] for d in data] + # Create a list of unique branches for this config + branches = 
list(set(d['branch'] for d in data)) + # Create subplot with 2 columns fig = make_subplots(rows=1, cols=2, subplot_titles=('Performance Over Time', 'Generation TPS Distribution'), column_widths=[0.7, 0.3]) - # Time series plot (left) - fig.add_trace(go.Scatter( - x=timestamps, - y=generation_tps, - name='Generation TPS', - mode='lines+markers', - hovertemplate='Commit: %{text}
TPS: %{y}', - text=commits, - customdata=run_ids, - line=dict(color='#2196F3', width=2), - marker=dict(color='#2196F3') - ), row=1, col=1) - - # Calculate statistics + # Generate a color for each branch + colors = px.colors.qualitative.Set1[:len(branches)] + branch_colors = dict(zip(branches, colors)) + + # Time series plot (left) - separate line for each branch + for branch in branches: + branch_data = [d for d in data if d['branch'] == branch] + branch_timestamps = [d['timestamp'] for d in branch_data] + branch_generation_tps = [d['generation_tps'] for d in branch_data] + branch_commits = [d['commit'] for d in branch_data] + branch_run_ids = [d['run_id'] for d in branch_data] + + fig.add_trace(go.Scatter( + x=branch_timestamps, + y=branch_generation_tps, + name=f'{branch}', + mode='lines+markers', + hovertemplate='Branch: %{text}
<br>Commit: %{customdata}<br>
TPS: %{y}', + text=[branch] * len(branch_timestamps), + customdata=branch_commits, + line=dict(color=branch_colors[branch], width=2), + marker=dict(color=branch_colors[branch]) + ), row=1, col=1) + + # Histogram plot (right) - stacked histogram by branch + for branch in branches: + branch_data = [d for d in data if d['branch'] == branch] + branch_generation_tps = [d['generation_tps'] for d in branch_data] + + fig.add_trace(go.Histogram( + x=branch_generation_tps, + name=f'{branch}', + nbinsx=10, + marker=dict(color=branch_colors[branch]), + opacity=0.75 + ), row=1, col=2) + + # Calculate statistics for all data gen_tps_array = np.array(generation_tps) stats = { - 'Mean': np.mean(gen_tps_array), - 'Std Dev': np.std(gen_tps_array), - 'Min': np.min(gen_tps_array), - 'Max': np.max(gen_tps_array) + 'Mean': np.mean(gen_tps_array), + 'Std Dev': np.std(gen_tps_array), + 'Min': np.min(gen_tps_array), + 'Max': np.max(gen_tps_array) } - # Histogram plot (right) - fig.add_trace(go.Histogram( - x=generation_tps, - name='Generation TPS Distribution', - nbinsx=10, - showlegend=False, - marker=dict(color='#2196F3') - ), row=1, col=2) - # Add statistics as annotations stats_text = '
'.join([f'{k}: {v:.2f}' for k, v in stats.items()]) fig.add_annotation(