[Python] Release W/ LLM Expectations (#596)

- [x] change default dataset creation (should be unique per file)
- [x] change env vars to use `LANGSMITH_` prefix

Want feedback on:

- [x] ergonomics: is this a general UX we want to support?
- [x] imports: should I re-implement string, embedding distance, etc.?
- [x] Do we want default implementations?
- [x] Any others we ought to include at the outset?
- [x] I could also do a general `expect(value).is(...)` or something: want anything super generic like that?

Example:

```python
@unit(inputs=x, outputs=y)
def test_output_semantically_close():
    response = oai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello!"},
        ],
    )
    # The embedding_distance call logs the embedding distance to LangSmith
    expect.embedding_distance(
        prediction=response.choices[0].message.content,
        reference="Hello!",
        # The following optional assertion logs a
        # pass/fail score to LangSmith
        # and raises an AssertionError if the assertion fails.
    ).to_be_less_than(0.5)
    # Compute damerau_levenshtein distance
    expect.string_distance(
        prediction=response.choices[0].message.content,
        reference="Hello!",
        # And then log a pass/fail score to LangSmith
    ).to_be_less_than(0.5)
```

The idea is that it's still an easy on-ramp for developers to quickly write some scoring functions and get them running regularly in CI.
langchain-ai · Apr 12, 2024 · 0308d11 · 0308d11
1 parent e1297d5
commit 0308d11
Show file tree

Hide file tree

Showing 22 changed files with 3,099 additions and 3,515 deletions.
diff --git a/.github/actions/python-integration-tests/action.yml b/.github/actions/python-integration-tests/action.yml
@@ -33,7 +33,7 @@ runs:
     - name: Install dependencies
       run: |
         poetry install --with dev
-        poetry run pip install -U langchain langchain_anthropic tiktoken rapidfuzz vcrpy
+        poetry run pip install -U langchain langchain_anthropic tiktoken rapidfuzz vcrpy numpy
       shell: bash
       working-directory: python
 
@@ -52,7 +52,19 @@ runs:
         LANGCHAIN_API_KEY: ${{ inputs.langchain-api-key }}
         OPENAI_API_KEY: ${{ inputs.openai-api-key }}
         ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }}
-        LANGCHAIN_TEST_CACHE: "tests/cassettes"
       run: make doctest
       shell: bash
       working-directory: python
+
+
+    - name: Run Evaluation
+      env:
+          LANGCHAIN_TRACING_V2: "true"
+          LANGCHAIN_API_KEY: ${{ inputs.langchain-api-key }}
+          OPENAI_API_KEY: ${{ inputs.openai-api-key }}
+          ANTHROPIC_API_KEY: ${{ inputs.anthropic-api-key }}
+          LANGCHAIN_TEST_CACHE: "tests/cassettes"
+      run: make evals
+      shell: bash
+      working-directory: python
+
diff --git a/python/Makefile b/python/Makefile
@@ -1,4 +1,4 @@
-.PHONY: tests lint format
+.PHONY: tests lint format build publish doctest integration_tests integration_tests_fast evals
 
 tests:
 	poetry run pytest -n auto --durations=10 tests/unit_tests
@@ -15,6 +15,9 @@ integration_tests_fast:
 doctest:
 	poetry run pytest -n auto --durations=10 --doctest-modules langsmith
 
+evals:
+	poetry run pytest tests/evaluation
+
 lint:
 	poetry run ruff check .
 	poetry run mypy .

diff --git a/python/README.md b/python/README.md
@@ -8,8 +8,8 @@ To install:
 
 ```bash
 pip install -U langsmith
-export LANGCHAIN_TRACING_V2=true
-export LANGCHAIN_API_KEY=ls_...
+export LANGSMITH_TRACING=true
+export LANGSMITH_API_KEY=ls_...
 ```
 
 Then trace:
@@ -68,10 +68,10 @@ Tracing can be activated by setting the following environment variables or by ma
 
 ```python
 import os
-os.environ["LANGCHAIN_TRACING_V2"] = "true"
-os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGCHAIN_API_KEY"] = "<YOUR-LANGSMITH-API-KEY>"
-# os.environ["LANGCHAIN_PROJECT"] = "My Project Name" # Optional: "default" is used if not set
+os.environ["LANGSMITH_TRACING_V2"] = "true"
+os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
+os.environ["LANGSMITH_API_KEY"] = "<YOUR-LANGSMITH-API-KEY>"
+# os.environ["LANGSMITH_PROJECT"] = "My Project Name" # Optional: "default" is used if not set
 ```
 
 > **Tip:** Projects are groups of traces. All runs are logged to a project. If not specified, the project is set to `default`.

diff --git a/python/langsmith/__init__.py b/python/langsmith/__init__.py
@@ -3,8 +3,10 @@
 from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
+    from langsmith._expect import expect
     from langsmith._testing import unit
     from langsmith.client import Client
+    from langsmith.evaluation import aevaluate, evaluate
     from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator
     from langsmith.run_helpers import trace, traceable
     from langsmith.run_trees import RunTree
@@ -47,6 +49,19 @@ def __getattr__(name: str) -> Any:
 
         return unit
 
+    elif name == "expect":
+        from langsmith._expect import expect
+
+        return expect
+    elif name == "evaluate":
+        from langsmith.evaluation import evaluate
+
+        return evaluate
+    elif name == "aevaluate":
+        from langsmith.evaluation import aevaluate
+
+        return aevaluate
+
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
@@ -59,4 +74,7 @@ def __getattr__(name: str) -> Any:
     "traceable",
     "trace",
     "unit",
+    "expect",
+    "evaluate",
+    "aevaluate",
 ]