
Commit

Add pytest replay example (#29)
Nicole White authored Oct 30, 2023
1 parent cb05a8a commit d980b3f
Showing 13 changed files with 1,217 additions and 85 deletions.
40 changes: 36 additions & 4 deletions .github/workflows/autoblocks-replays.yml
@@ -13,7 +13,7 @@ on:
- cron: '27 12 * * *'

jobs:
autoblocks-replays:
autoblocks-replays-js:
runs-on: ubuntu-latest

defaults:
@@ -34,10 +34,42 @@ jobs:
- name: Setup Node
uses: actions/setup-node@v3
with:
node-version: 20
node-version: '20'

- name: Install dependencies
run: npm ci

- name: Run script
run: npm run start
- name: Run tests
run: npm run test

  autoblocks-replays-py:
    runs-on: ubuntu-latest

    defaults:
      run:
        shell: bash
        working-directory: Python/pytest-replays

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Create .env file
        run: |
          touch .env
          echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> .env
          echo "AUTOBLOCKS_INGESTION_KEY=${{ secrets.AUTOBLOCKS_REPLAY_INGESTION_KEY }}" >> .env

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install poetry
        run: curl -sSL https://install.python-poetry.org | python3 -

      - name: Install dependencies
        run: poetry install

      - name: Run tests
        run: poetry run pytest --autoblocks
7 changes: 4 additions & 3 deletions .github/workflows/ci.yml
@@ -30,7 +30,7 @@ jobs:
- name: Setup Node
uses: actions/setup-node@v3
with:
node-version: 20
node-version: '20'

- name: Make READMEs
run: node tools/make-readmes.js
@@ -53,6 +53,7 @@ jobs:
- Python/langchain
- Python/openai-manual
- Python/openai-automated
- Python/pytest-replays

defaults:
run:
@@ -72,7 +73,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v3
with:
python-version: 3.11
python-version: '3.11'

- name: Install poetry
run: curl -sSL https://install.python-poetry.org | python3 -
@@ -117,7 +118,7 @@ jobs:
- name: Setup Node
uses: actions/setup-node@v3
with:
node-version: 20
node-version: '20'

- name: Install dependencies
run: npm ci
8 changes: 4 additions & 4 deletions JavaScript/jest-replays/README.md
@@ -48,11 +48,11 @@ ingestion key: https://app.autoblocks.ai/settings/api-keys
### 2. Set an `AUTOBLOCKS_REPLAY_ID`

This is already set up in this example via the `start` script in [`package.json`](./package.json):
This is already set up in this example via the `test` script in [`package.json`](./package.json):

```json
"scripts": {
"start": "AUTOBLOCKS_REPLAY_ID=$(date +%Y%m%d-%H%M%S) dotenv -e .env -- jest"
"test": "AUTOBLOCKS_REPLAY_ID=$(date +%Y%m%d-%H%M%S) dotenv -e .env -- jest"
},
```

@@ -64,10 +64,10 @@ First install the dependencies:
npm install
```

Then run `npm start` (which runs the Jest test suite):
Then run the tests:

```
npm start
npm test
```

Within the test suite, you should see a link printed to the console that will take you to the replay in the Autoblocks UI:
6 changes: 3 additions & 3 deletions JavaScript/jest-replays/package.json
@@ -1,10 +1,10 @@
{
"name": "jest-replays",
"version": "0.0.0",
"description": "Run Autoblocks replays from within a jest test suite",
"type": "module",
"description": "Run Autoblocks Replays from within a jest test suite",
"scripts": {
"start": "AUTOBLOCKS_REPLAY_ID=$(date +%Y%m%d-%H%M%S) dotenv -e .env -- jest"
"start": "dotenv -e .env -- node ./src/index.js",
"test": "AUTOBLOCKS_REPLAY_ID=$(date +%Y%m%d-%H%M%S) dotenv -e .env -- jest"
},
"license": "MIT",
"dependencies": {
72 changes: 8 additions & 64 deletions JavaScript/jest-replays/src/index.js
@@ -1,66 +1,10 @@
const crypto = require('crypto');
const OpenAI = require('openai');
const { AutoblocksTracer } = require('@autoblocks/client');
const { run } = require('./run');

const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
// Simulate running the `run` function in a production setting,
// i.e. without passing in a traceId from a test case (one will
// be auto-generated).
async function main() {
await run({ input: 'How do I sign up?' });
}

const tracer = new AutoblocksTracer(process.env.AUTOBLOCKS_INGESTION_KEY, {
properties: {
provider: 'openai',
},
});

const run = async ({ input, traceId }) => {
// Set the traceId to the one given, or fall back to a random UUID.
// When we call this function from the test suite we will pass in a
// traceId so that it is stable across replay runs, but in production
// we'll only pass in an input, like run({ input }), so that we generate
// a random traceId while in production.
tracer.setTraceId(traceId || crypto.randomUUID());

const request = {
model: 'gpt-3.5-turbo',
messages: [
{
role: 'system',
content:
'You are a helpful assistant. ' +
'You answer questions about a software product named Acme. ' +
'Your answers should be in a friendly tone and include a bulleted or numbered list where appropriate. ' +
'You should also include a link to the relevant page in the Acme documentation.',
},
{
role: 'user',
content: input,
},
],
temperature: 0.3,
};

await tracer.sendEvent('ai.request', {
properties: request,
});

try {
const now = Date.now();
const response = await openai.chat.completions.create(request);
await tracer.sendEvent('ai.response', {
properties: {
response,
latencyMs: Date.now() - now,
},
});
return response.choices[0].message.content;
} catch (error) {
await tracer.sendEvent('ai.error', {
properties: {
error,
},
});
throw error;
}
};

module.exports = { run };
main();
66 changes: 66 additions & 0 deletions JavaScript/jest-replays/src/run.js
@@ -0,0 +1,66 @@
const crypto = require('crypto');
const OpenAI = require('openai');
const { AutoblocksTracer } = require('@autoblocks/client');

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

const tracer = new AutoblocksTracer(process.env.AUTOBLOCKS_INGESTION_KEY, {
  properties: {
    provider: 'openai',
  },
});

const run = async ({ input, traceId }) => {
  // Set the traceId to the one given, or fall back to a random UUID.
  // When we call this function from the test suite we will pass in a
  // traceId so that it is stable across replay runs, but in production
  // we'll only pass in an input, like run({ input }), so that we generate
  // a random traceId while in production.
  tracer.setTraceId(traceId || crypto.randomUUID());

  const request = {
    model: 'gpt-3.5-turbo',
    messages: [
      {
        role: 'system',
        content:
          'You are a helpful assistant. ' +
          'You answer questions about a software product named Acme. ' +
          'Your answers should be in a friendly tone and include a bulleted or numbered list where appropriate. ' +
          'You should also include a link to the relevant page in the Acme documentation.',
      },
      {
        role: 'user',
        content: input,
      },
    ],
    temperature: 0.3,
  };

  await tracer.sendEvent('ai.request', {
    properties: request,
  });

  try {
    const now = Date.now();
    const response = await openai.chat.completions.create(request);
    await tracer.sendEvent('ai.response', {
      properties: {
        response,
        latencyMs: Date.now() - now,
      },
    });
    return response.choices[0].message.content;
  } catch (error) {
    await tracer.sendEvent('ai.error', {
      properties: {
        error,
      },
    });
    throw error;
  }
};

module.exports = { run };
2 changes: 1 addition & 1 deletion JavaScript/jest-replays/test/index.spec.js
@@ -1,4 +1,4 @@
const { run } = require('../src/index');
const { run } = require('../src/run');

jest.setTimeout(60000);

108 changes: 108 additions & 0 deletions Python/pytest-replays/README.md
@@ -0,0 +1,108 @@
<!-- banner start -->
<p align="center">
<img src="https://app.autoblocks.ai/images/logo.png" width="300px">
</p>

<p align="center">
📚
<a href="https://docs.autoblocks.ai/">Documentation</a>
&nbsp;
&nbsp;
🖥️
<a href="https://app.autoblocks.ai/">Application</a>
&nbsp;
&nbsp;
🏠
<a href="https://www.autoblocks.ai/">Home</a>
</p>
<!-- banner end -->

<!-- getting started start -->

## Getting started

- Sign up for an Autoblocks account at https://app.autoblocks.ai
- Grab your Autoblocks ingestion key from https://app.autoblocks.ai/settings/api-keys
- Grab your OpenAI API key from https://platform.openai.com/account/api-keys
- Create a file named `.env` in this folder and include the following environment variables:

```
OPENAI_API_KEY=<your-api-key>
AUTOBLOCKS_INGESTION_KEY=<your-ingestion-key>
```

<!-- getting started end -->

## Replays

This project shows how you can run Autoblocks Replays via your [pytest](https://docs.pytest.org/en/7.4.x/) test suite. Follow the steps below to get started.
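
The test file itself isn't shown in this commit view, but a minimal pytest test following this pattern might look like the sketch below. The `run` helper, the module it lives in, and the trace IDs are illustrative assumptions rather than the exact code from this example:

```python
# test_main.py -- illustrative sketch only, not the exact file from this example
import pytest

from my_app import run  # hypothetical entry point into the LLM application

# Stable trace IDs let Autoblocks line up the same test case across replay runs.
TEST_CASES = [
    ("how-do-i-sign-up", "How do I sign up?"),
    ("refund-policy", "What is your refund policy?"),
]


@pytest.mark.parametrize("trace_id,question", TEST_CASES)
def test_product_questions(trace_id, question):
    answer = run(input=question, trace_id=trace_id)
    # A real test might assert on specific content or lean on Autoblocks Evaluators.
    assert answer
```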

### 1. Use your replay key

Replace the value for `AUTOBLOCKS_INGESTION_KEY` in the `.env` file with your replay key. Your replay key is in the same place as your
ingestion key: https://app.autoblocks.ai/settings/api-keys

> **_NOTE:_** This means you need to make very few code changes to your production code to get started with Autoblocks Replays. You simply need to swap out an environment variable.
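
For example, the `.env` file from the getting-started section keeps the same shape; only the ingestion key value is swapped for your replay key:

```
OPENAI_API_KEY=<your-api-key>
AUTOBLOCKS_INGESTION_KEY=<your-replay-key>
```
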
### 2. Run the tests

First install the dependencies:

```
poetry install
```

Then run the test suite with Autoblocks Replays enabled:

```
poetry run pytest --autoblocks
```

Within the test suite, you should see a link printed to the console that will take you to the replay in the Autoblocks UI:

```
➜ poetry run pytest --autoblocks
=========================================== test session starts ============================================
platform darwin -- Python 3.11.4, pytest-7.4.3, pluggy-1.3.0
rootdir: /Users/nicole/autoblocks/autoblocks-examples/Python/pytest-replays
plugins: anyio-4.0.0, autoblocksai-0.0.11
collected 3 items
test_main.py ... [100%]
======================================== Autoblocks Replay Results =========================================
View your replay: https://app.autoblocks.ai/replays/local/run/nicole-pytest-20231030-122752
============================================================================================================
============================================ 3 passed in 16.65s ============================================
```

Run the tests a few times so that you generate multiple replays (your first replay won't have any baseline to compare against!).

### 3. View the replays in the Autoblocks UI

The link will take you to the replay UI where you can see at-a-glance differences between the replay runs over the three test cases. There are four main columns:

- **Message**: The name of the Autoblocks event sent
  - a gray icon indicates no changes
  - a yellow icon indicates changes
  - a red icon indicates the event was there before but not now
  - a green icon indicates the event was not there before but is now
- **Changes**: The number of word changes between the event properties of the replay run and the baseline run
- **Difference Scores**: For properties that we've detected to be LLM outputs, this column will show you a difference score between the value from the baseline run and the current run
- **Evals**: The results of your [Autoblocks Evaluators](https://docs.autoblocks.ai/features/evaluators)

In one of my runs, I could see that the difference score was pretty high for the `"What is your refund policy?"` test case:

![replay-summary](https://github.com/autoblocksai/autoblocks-examples/assets/7498009/cb99858a-8f94-4bd9-b8b4-893e32097642)

Clicking into **View Differences**, I could see that the response now included an apology about not being able to answer questions about refunds, even though it had answered that question in the baseline run:

![replay-differences](https://github.com/autoblocksai/autoblocks-examples/assets/7498009/53b33ed5-fe8e-44cf-ac07-c2f315ecb4b9)

This kind of snapshot / stability testing is important to run over LLM outputs on every pull request so that you can catch regressions before they go to production.

### 4. Run the replays in GitHub Actions

See the [Autoblocks Replays GitHub Action](/.github/workflows/autoblocks-replays.yml) workflow; this workflow runs replays on every pull request and also on a schedule. The results of these replays will be under the GitHub tab on the [replays](https://app.autoblocks.ai/replays) page.