From 596f294b01447774986501d8937def4310e0e0b4 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Wed, 13 Sep 2023 17:13:18 -0700
Subject: [PATCH] Update LangSmith Walkthrough (#10564)

---
 .../extras/guides/langsmith/walkthrough.ipynb | 310 ++++++++++++++----
 .../langchain/callbacks/tracers/evaluation.py |  10 +-
 .../smith/evaluation/runner_utils.py          |   9 +-
 3 files changed, 256 insertions(+), 73 deletions(-)

diff --git a/docs/extras/guides/langsmith/walkthrough.ipynb b/docs/extras/guides/langsmith/walkthrough.ipynb
index 9e1b8f3fcfc24..3615d8f187ae3 100644
--- a/docs/extras/guides/langsmith/walkthrough.ipynb
+++ b/docs/extras/guides/langsmith/walkthrough.ipynb
@@ -48,7 +48,7 @@
     "First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n",
     "You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable (if this isn't set, runs will be logged to the `default` project). This will automatically create the project for you if it doesn't exist. You must also set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables.\n",
     "\n",
-    "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/)\n",
+    "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/).\n",
     "\n",
     "**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n",
     "\n",
@@ -65,6 +65,17 @@
     "However, in this example, we will use environment variables."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "e4780363-f05a-4649-8b1a-9b449f960ce4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip install -U langchain langsmith --quiet\n",
+    "# %pip install google-search-results pandas --quiet"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -81,7 +92,7 @@
     "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
     "os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
     "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
-    "os.environ[\"LANGCHAIN_API_KEY\"] = \"\"  # Update to your API key\n",
+    "# os.environ[\"LANGCHAIN_API_KEY\"] = \"\"  # Update to your API key\n",
     "\n",
     "# Used by the agent in this tutorial\n",
     "# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
@@ -156,8 +167,6 @@
    },
    "outputs": [],
    "source": [
-    "import asyncio\n",
-    "\n",
     "inputs = [\n",
     "    \"How many people live in canada as of 2023?\",\n",
     "    \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n",
@@ -170,20 +179,8 @@
     "    \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n",
     "    \"what is 1213 divided by 4345?\",\n",
     "]\n",
-    "results = []\n",
-    "\n",
     "\n",
-    "async def arun(agent, input_example):\n",
-    "    try:\n",
-    "        return await agent.arun(input_example)\n",
-    "    except Exception as e:\n",
-    "        # The agent sometimes makes mistakes! These will be captured by the tracing.\n",
-    "        return e\n",
-    "\n",
-    "\n",
-    "for input_example in inputs:\n",
-    "    results.append(arun(agent, input_example))\n",
-    "results = await asyncio.gather(*results)"
+    "results = agent.batch(inputs, return_exceptions=True)"
    ]
   },
   {
@@ -389,53 +386,30 @@
     "tags": []
    },
    "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "View the evaluation results for project '2023-07-17-11-25-20-AgentExecutor' at:\n",
-      "https://dev.smith.langchain.com/projects/p/1c9baec3-ae86-4fac-9e99-e1b9f8e7818c?eval=true\n",
-      "Processed examples: 1\r"
-     ]
-    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Chain failed for example 5a2ac8da-8c2b-4d12-acb9-5c4b0f47fe8a. Error: LLMMathChain._evaluate(\"\n",
+      "Chain failed for example f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 with inputs {'input': \"what is dua lipa's boyfriend age raised to the .43 power?\"}\n",
+      "Error Type: ValueError, Message: LLMMathChain._evaluate(\"\n",
       "age_of_Dua_Lipa_boyfriend ** 0.43\n",
-      "\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processed examples: 4\r"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 91439261-1c86-4198-868b-a6c1cc8a051b. Error: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 68}]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processed examples: 9\r"
+      "\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n",
+      "Chain failed for example 78c959a4-467d-4469-8bd7-c5f0b059bc4a with inputs {'input': \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\"}\n",
+      "Error Type: ValueError, Message: LLMMathChain._evaluate(\"\n",
+      "age ** 0.43\n",
+      "\") raised error: 'age'. Please try again with a valid numerical expression\n",
+      "Chain failed for example 6de48a56-3f30-4aac-b6cf-eee4b05ad43f with inputs {'input': \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\"}\n",
+      "Error Type: ToolException, Message: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 72}]\n"
      ]
     }
    ],
    "source": [
     "from langchain.smith import (\n",
     "    arun_on_dataset,\n",
-    "    run_on_dataset,  # Available if your chain doesn't support async calls.\n",
+    "    run_on_dataset, \n",
     ")\n",
     "\n",
-    "chain_results = await arun_on_dataset(\n",
+    "chain_results = run_on_dataset(\n",
     "    client=client,\n",
     "    dataset_name=dataset_name,\n",
     "    llm_or_chain_factory=agent_factory,\n",
@@ -448,6 +422,218 @@
     "# These are logged as warnings here and captured as errors in the tracing UI."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "9da60638-5be8-4b5f-a721-2c6627aeaf0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>input</th>\n",
+       "      <th>output</th>\n",
+       "      <th>reference</th>\n",
+       "      <th>embedding_cosine_distance</th>\n",
+       "      <th>correctness</th>\n",
+       "      <th>helpfulness</th>\n",
+       "      <th>fifth-grader-score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>78c959a4-467d-4469-8bd7-c5f0b059bc4a</th>\n",
+       "      <td>{'input': 'who is dua lipa's boyfriend? what i...</td>\n",
+       "      <td>{'Error': 'ValueError('LLMMathChain._evaluate(...</td>\n",
+       "      <td>{'output': 'Romain Gavras' age raised to the 0...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>f8dfff24-d288-4d8e-ba94-c3cc33dd10d0</th>\n",
+       "      <td>{'input': 'what is dua lipa's boyfriend age ra...</td>\n",
+       "      <td>{'Error': 'ValueError('LLMMathChain._evaluate(...</td>\n",
+       "      <td>{'output': 'Approximately 4.9888126515157.'}</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>c78d5e84-3fbd-442f-affb-4b0e5806c439</th>\n",
+       "      <td>{'input': 'how far is it from paris to boston ...</td>\n",
+       "      <td>{'input': 'how far is it from paris to boston ...</td>\n",
+       "      <td>{'output': 'The distance from Paris to Boston ...</td>\n",
+       "      <td>0.007577</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>02cadef9-5794-49a9-8e43-acca977cab60</th>\n",
+       "      <td>{'input': 'How many people live in canada as o...</td>\n",
+       "      <td>{'input': 'How many people live in canada as o...</td>\n",
+       "      <td>{'output': 'The current population of Canada a...</td>\n",
+       "      <td>0.016324</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>e888a340-0486-4552-bb4b-911756e6bed7</th>\n",
+       "      <td>{'input': 'what was the total number of points...</td>\n",
+       "      <td>{'input': 'what was the total number of points...</td>\n",
+       "      <td>{'output': '3'}</td>\n",
+       "      <td>0.225076</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1b1f655b-754c-474d-8832-e6ec6bad3943</th>\n",
+       "      <td>{'input': 'what was the total number of points...</td>\n",
+       "      <td>{'input': 'what was the total number of points...</td>\n",
+       "      <td>{'output': 'The total number of points scored ...</td>\n",
+       "      <td>0.011580</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>51f1b1f1-3b51-400f-b871-65f8a3a3c2d4</th>\n",
+       "      <td>{'input': 'how many more points were scored in...</td>\n",
+       "      <td>{'input': 'how many more points were scored in...</td>\n",
+       "      <td>{'output': '15'}</td>\n",
+       "      <td>0.251002</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>83339364-0135-4efd-a24a-f3bd2a85e33a</th>\n",
+       "      <td>{'input': 'what is 153 raised to .1312 power?'}</td>\n",
+       "      <td>{'input': 'what is 153 raised to .1312 power?'...</td>\n",
+       "      <td>{'output': '1.9347796717823205'}</td>\n",
+       "      <td>0.127441</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6de48a56-3f30-4aac-b6cf-eee4b05ad43f</th>\n",
+       "      <td>{'input': 'who is kendall jenner's boyfriend? ...</td>\n",
+       "      <td>{'Error': 'ToolException(\"Too many arguments t...</td>\n",
+       "      <td>{'output': 'Bad Bunny's height raised to the p...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0c41cc28-9c07-4550-8940-68b58cbc045e</th>\n",
+       "      <td>{'input': 'what is 1213 divided by 4345?'}</td>\n",
+       "      <td>{'input': 'what is 1213 divided by 4345?', 'ou...</td>\n",
+       "      <td>{'output': '0.2791714614499425'}</td>\n",
+       "      <td>0.144522</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                                  input  \\\n",
+       "78c959a4-467d-4469-8bd7-c5f0b059bc4a  {'input': 'who is dua lipa's boyfriend? what i...   \n",
+       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0  {'input': 'what is dua lipa's boyfriend age ra...   \n",
+       "c78d5e84-3fbd-442f-affb-4b0e5806c439  {'input': 'how far is it from paris to boston ...   \n",
+       "02cadef9-5794-49a9-8e43-acca977cab60  {'input': 'How many people live in canada as o...   \n",
+       "e888a340-0486-4552-bb4b-911756e6bed7  {'input': 'what was the total number of points...   \n",
+       "1b1f655b-754c-474d-8832-e6ec6bad3943  {'input': 'what was the total number of points...   \n",
+       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4  {'input': 'how many more points were scored in...   \n",
+       "83339364-0135-4efd-a24a-f3bd2a85e33a    {'input': 'what is 153 raised to .1312 power?'}   \n",
+       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f  {'input': 'who is kendall jenner's boyfriend? ...   \n",
+       "0c41cc28-9c07-4550-8940-68b58cbc045e         {'input': 'what is 1213 divided by 4345?'}   \n",
+       "\n",
+       "                                                                                 output  \\\n",
+       "78c959a4-467d-4469-8bd7-c5f0b059bc4a  {'Error': 'ValueError('LLMMathChain._evaluate(...   \n",
+       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0  {'Error': 'ValueError('LLMMathChain._evaluate(...   \n",
+       "c78d5e84-3fbd-442f-affb-4b0e5806c439  {'input': 'how far is it from paris to boston ...   \n",
+       "02cadef9-5794-49a9-8e43-acca977cab60  {'input': 'How many people live in canada as o...   \n",
+       "e888a340-0486-4552-bb4b-911756e6bed7  {'input': 'what was the total number of points...   \n",
+       "1b1f655b-754c-474d-8832-e6ec6bad3943  {'input': 'what was the total number of points...   \n",
+       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4  {'input': 'how many more points were scored in...   \n",
+       "83339364-0135-4efd-a24a-f3bd2a85e33a  {'input': 'what is 153 raised to .1312 power?'...   \n",
+       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f  {'Error': 'ToolException(\"Too many arguments t...   \n",
+       "0c41cc28-9c07-4550-8940-68b58cbc045e  {'input': 'what is 1213 divided by 4345?', 'ou...   \n",
+       "\n",
+       "                                                                              reference  \\\n",
+       "78c959a4-467d-4469-8bd7-c5f0b059bc4a  {'output': 'Romain Gavras' age raised to the 0...   \n",
+       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0       {'output': 'Approximately 4.9888126515157.'}   \n",
+       "c78d5e84-3fbd-442f-affb-4b0e5806c439  {'output': 'The distance from Paris to Boston ...   \n",
+       "02cadef9-5794-49a9-8e43-acca977cab60  {'output': 'The current population of Canada a...   \n",
+       "e888a340-0486-4552-bb4b-911756e6bed7                                    {'output': '3'}   \n",
+       "1b1f655b-754c-474d-8832-e6ec6bad3943  {'output': 'The total number of points scored ...   \n",
+       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4                                   {'output': '15'}   \n",
+       "83339364-0135-4efd-a24a-f3bd2a85e33a                   {'output': '1.9347796717823205'}   \n",
+       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f  {'output': 'Bad Bunny's height raised to the p...   \n",
+       "0c41cc28-9c07-4550-8940-68b58cbc045e                   {'output': '0.2791714614499425'}   \n",
+       "\n",
+       "                                      embedding_cosine_distance  correctness  \\\n",
+       "78c959a4-467d-4469-8bd7-c5f0b059bc4a                        NaN          NaN   \n",
+       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0                        NaN          NaN   \n",
+       "c78d5e84-3fbd-442f-affb-4b0e5806c439                   0.007577          1.0   \n",
+       "02cadef9-5794-49a9-8e43-acca977cab60                   0.016324          1.0   \n",
+       "e888a340-0486-4552-bb4b-911756e6bed7                   0.225076          0.0   \n",
+       "1b1f655b-754c-474d-8832-e6ec6bad3943                   0.011580          0.0   \n",
+       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4                   0.251002          1.0   \n",
+       "83339364-0135-4efd-a24a-f3bd2a85e33a                   0.127441          1.0   \n",
+       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f                        NaN          NaN   \n",
+       "0c41cc28-9c07-4550-8940-68b58cbc045e                   0.144522          1.0   \n",
+       "\n",
+       "                                      helpfulness  fifth-grader-score  \n",
+       "78c959a4-467d-4469-8bd7-c5f0b059bc4a          NaN                 NaN  \n",
+       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0          NaN                 NaN  \n",
+       "c78d5e84-3fbd-442f-affb-4b0e5806c439          1.0                 1.0  \n",
+       "02cadef9-5794-49a9-8e43-acca977cab60          1.0                 1.0  \n",
+       "e888a340-0486-4552-bb4b-911756e6bed7          0.0                 0.0  \n",
+       "1b1f655b-754c-474d-8832-e6ec6bad3943          0.0                 0.0  \n",
+       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4          1.0                 1.0  \n",
+       "83339364-0135-4efd-a24a-f3bd2a85e33a          1.0                 1.0  \n",
+       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f          NaN                 NaN  \n",
+       "0c41cc28-9c07-4550-8940-68b58cbc045e          1.0                 1.0  "
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chain_results.to_dataframe()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
@@ -474,7 +660,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 18,
    "id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820",
    "metadata": {
     "tags": []
@@ -483,22 +669,22 @@
     {
      "data": {
       "text/plain": [
-       "Run(id=UUID('e39f310b-c5a8-4192-8a59-6a9498e1cb85'), name='AgentExecutor', start_time=datetime.datetime(2023, 7, 17, 18, 25, 30, 653872), run_type=<RunTypeEnum.chain: 'chain'>, end_time=datetime.datetime(2023, 7, 17, 18, 25, 35, 359642), extra={'runtime': {'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.8', 'library_version': '0.0.231', 'runtime_version': '3.11.2'}, 'total_tokens': 512, 'prompt_tokens': 451, 'completion_tokens': 61}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-07-17T18:25:30.653872'}, {'name': 'end', 'time': '2023-07-17T18:25:35.359642'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('a75cf754-4f73-46fd-b126-9bcd0695e463'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('1c9baec3-ae86-4fac-9e99-e1b9f8e7818c'), child_run_ids=[UUID('40d0fdca-0b2b-47f4-a9da-f2b229aa4ed5'), UUID('cfa5130f-264c-4126-8950-ec1c4c31b800'), UUID('ba638a2f-2a57-45db-91e8-9a7a66a42c5a'), UUID('fcc29b5a-cdb7-4bcc-8194-47729bbdf5fb'), UUID('a6f92bf5-cfba-4747-9336-370cb00c928a'), UUID('65312576-5a39-4250-b820-4dfae7d73945')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'fifth-grader-score': {'n': 1, 'avg': 1.0, 'mode': 1}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361}})"
+       "Run(id=UUID('a6893e95-a9cc-43e0-b9fa-f471b0cfee83'), name='AgentExecutor', start_time=datetime.datetime(2023, 9, 13, 22, 34, 32, 177406), run_type='chain', end_time=datetime.datetime(2023, 9, 13, 22, 34, 37, 77740), extra={'runtime': {'cpu': {'time': {'sys': 3.153218304, 'user': 5.045262336}, 'percent': 0.0, 'ctx_switches': {'voluntary': 42164.0, 'involuntary': 0.0}}, 'mem': {'rss': 184205312.0}, 'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.26', 'thread_count': 58.0, 'library_version': '0.0.286', 'runtime_version': '3.11.2', 'langchain_version': '0.0.286', 'py_implementation': 'CPython'}}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-09-13T22:34:32.177406'}, {'name': 'end', 'time': '2023-09-13T22:34:37.077740'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('0c41cc28-9c07-4550-8940-68b58cbc045e'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('7865a050-467e-4c58-9322-58a26f182ecb'), child_run_ids=[UUID('37faef05-b6b3-4cb7-a6db-471425e69b46'), UUID('2d6a895f-de2c-4f7f-b5f1-ca876d38e530'), UUID('e7d145e3-74b0-4f32-9240-3e370becdf8f'), UUID('10db62c9-fe4f-4aba-959a-ad02cfadfa20'), UUID('8dc46a27-8ab9-4f33-9ec1-660ca73ebb4f'), UUID('eccd042e-dde0-4425-b62f-e855e25d6b64')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'fifth-grader-score': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361, 'is_all_model': True}}, app_path='/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/7865a050-467e-4c58-9322-58a26f182ecb/r/a6893e95-a9cc-43e0-b9fa-f471b0cfee83', manifest_id=None, status='success', prompt_tokens=None, completion_tokens=None, total_tokens=None, first_token_time=None, parent_run_ids=None)"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "runs = list(client.list_runs(dataset_name=dataset_name))\n",
+    "runs = list(client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1))\n",
     "runs[0]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 22,
    "id": "6595c888-1f5c-4ae3-9390-0a559f5575d1",
    "metadata": {
     "tags": []
@@ -507,21 +693,17 @@
     {
      "data": {
       "text/plain": [
-       "{'correctness': {'n': 7, 'avg': 0.5714285714285714, 'mode': 1},\n",
-       " 'helpfulness': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n",
-       " 'fifth-grader-score': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n",
-       " 'embedding_cosine_distance': {'n': 7,\n",
-       "  'avg': 0.11462010799473926,\n",
-       "  'mode': 0.0130477459560272}}"
+       "TracerSessionResult(id=UUID('7865a050-467e-4c58-9322-58a26f182ecb'), start_time=datetime.datetime(2023, 9, 13, 22, 34, 10, 611846), name='test-dependable-stop-67', extra=None, tenant_id=UUID('ebbaf2eb-769b-4505-aca2-d11de10372a4'), run_count=None, latency_p50=None, latency_p99=None, total_tokens=None, prompt_tokens=None, completion_tokens=None, last_run_start_time=None, feedback_stats=None, reference_dataset_ids=None, run_facets=None)"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "client.read_project(project_id=runs[0].session_id).feedback_stats"
+    "# The project's feedback stats may not be available immediately; after some time, they will be populated.\n",
+    "client.read_project(project_name=chain_results[\"project_name\"]).feedback_stats"
    ]
   },
   {
diff --git a/libs/langchain/langchain/callbacks/tracers/evaluation.py b/libs/langchain/langchain/callbacks/tracers/evaluation.py
index 1cf205e3d2843..0d333f9f0458a 100644
--- a/libs/langchain/langchain/callbacks/tracers/evaluation.py
+++ b/libs/langchain/langchain/callbacks/tracers/evaluation.py
@@ -7,7 +7,7 @@
 from uuid import UUID
 
 import langsmith
-from langsmith import schemas as langsmith_schemas
+from langsmith.evaluation.evaluator import EvaluationResult
 
 from langchain.callbacks import manager
 from langchain.callbacks.tracers import langchain as langchain_tracer
@@ -76,7 +76,7 @@ def __init__(
         self.futures: Set[Future] = set()
         self.skip_unfinished = skip_unfinished
         self.project_name = project_name
-        self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {}
+        self.logged_eval_results: Dict[str, List[EvaluationResult]] = {}
 
     def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None:
         """Evaluate the run in the project.
@@ -91,11 +91,11 @@ def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> N
         """
         try:
             if self.project_name is None:
-                feedback = self.client.evaluate_run(run, evaluator)
+                eval_result = self.client.evaluate_run(run, evaluator)
             with manager.tracing_v2_enabled(
                 project_name=self.project_name, tags=["eval"], client=self.client
             ):
-                feedback = self.client.evaluate_run(run, evaluator)
+                eval_result = self.client.evaluate_run(run, evaluator)
         except Exception as e:
             logger.error(
                 f"Error evaluating run {run.id} with "
@@ -104,7 +104,7 @@ def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> N
             )
             raise e
         example_id = str(run.reference_example_id)
-        self.logged_feedback.setdefault(example_id, []).append(feedback)
+        self.logged_eval_results.setdefault(example_id, []).append(eval_result)
 
     def _persist_run(self, run: Run) -> None:
         """Run the evaluator on the run.
diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py
index 0e2262875f637..8eea9ef26b199 100644
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -866,7 +866,8 @@ def _prepare_eval_run(
             f"Project {project_name} already exists. Please use a different name."
         )
     print(
-        f"View the evaluation results for project '{project_name}' at:\n{project.url}"
+        f"View the evaluation results for project '{project_name}' at:\n{project.url}",
+        flush=True,
     )
     dataset = client.read_dataset(dataset_name=dataset_name)
     examples = list(client.list_examples(dataset_id=dataset.id))
@@ -927,14 +928,14 @@ def _collect_test_results(
     project_name: str,
 ) -> TestResult:
     wait_for_all_tracers()
-    all_feedback = {}
+    all_eval_results = {}
     for c in configs:
         for callback in cast(list, c["callbacks"]):
             if isinstance(callback, EvaluatorCallbackHandler):
-                all_feedback.update(callback.logged_feedback)
+                all_eval_results.update(callback.logged_eval_results)
     results = {}
     for example, output in zip(examples, batch_results):
-        feedback = all_feedback.get(str(example.id), [])
+        feedback = all_eval_results.get(str(example.id), [])
         results[str(example.id)] = {
             "output": output,
             "input": example.inputs,