From 596f294b01447774986501d8937def4310e0e0b4 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Wed, 13 Sep 2023 17:13:18 -0700 Subject: [PATCH] Update LangSmith Walkthrough (#10564) --- .../extras/guides/langsmith/walkthrough.ipynb | 310 ++++++++++++++---- .../langchain/callbacks/tracers/evaluation.py | 10 +- .../smith/evaluation/runner_utils.py | 9 +- 3 files changed, 256 insertions(+), 73 deletions(-) diff --git a/docs/extras/guides/langsmith/walkthrough.ipynb b/docs/extras/guides/langsmith/walkthrough.ipynb index 9e1b8f3fcfc24..3615d8f187ae3 100644 --- a/docs/extras/guides/langsmith/walkthrough.ipynb +++ b/docs/extras/guides/langsmith/walkthrough.ipynb @@ -48,7 +48,7 @@ "First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n", "You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable (if this isn't set, runs will be logged to the `default` project). This will automatically create the project for you if it doesn't exist. You must also set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables.\n", "\n", - "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/)\n", + "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/).\n", "\n", "**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n", "\n", @@ -65,6 +65,17 @@ "However, in this example, we will use environment variables." ] }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e4780363-f05a-4649-8b1a-9b449f960ce4", + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install -U langchain langsmith --quiet\n", + "# %pip install google-search-results pandas --quiet" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -81,7 +92,7 @@ "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", "os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n", "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n", - "os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Update to your API key\n", + "# os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Update to your API key\n", "\n", "# Used by the agent in this tutorial\n", "# os.environ[\"OPENAI_API_KEY\"] = \"\"\n", @@ -156,8 +167,6 @@ }, "outputs": [], "source": [ - "import asyncio\n", - "\n", "inputs = [\n", " \"How many people live in canada as of 2023?\",\n", " \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n", @@ -170,20 +179,8 @@ " \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n", " \"what is 1213 divided by 4345?\",\n", "]\n", - "results = []\n", - "\n", "\n", - "async def arun(agent, input_example):\n", - " try:\n", - " return await agent.arun(input_example)\n", - " except Exception as e:\n", - " # The agent sometimes makes mistakes! These will be captured by the tracing.\n", - " return e\n", - "\n", - "\n", - "for input_example in inputs:\n", - " results.append(arun(agent, input_example))\n", - "results = await asyncio.gather(*results)" + "results = agent.batch(inputs, return_exceptions=True)" ] }, { @@ -389,53 +386,30 @@ "tags": [] }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "View the evaluation results for project '2023-07-17-11-25-20-AgentExecutor' at:\n", - "https://dev.smith.langchain.com/projects/p/1c9baec3-ae86-4fac-9e99-e1b9f8e7818c?eval=true\n", - "Processed examples: 1\r" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "Chain failed for example 5a2ac8da-8c2b-4d12-acb9-5c4b0f47fe8a. Error: LLMMathChain._evaluate(\"\n", + "Chain failed for example f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 with inputs {'input': \"what is dua lipa's boyfriend age raised to the .43 power?\"}\n", + "Error Type: ValueError, Message: LLMMathChain._evaluate(\"\n", "age_of_Dua_Lipa_boyfriend ** 0.43\n", - "\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processed examples: 4\r" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Chain failed for example 91439261-1c86-4198-868b-a6c1cc8a051b. Error: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 68}]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processed examples: 9\r" + "\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n", + "Chain failed for example 78c959a4-467d-4469-8bd7-c5f0b059bc4a with inputs {'input': \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\"}\n", + "Error Type: ValueError, Message: LLMMathChain._evaluate(\"\n", + "age ** 0.43\n", + "\") raised error: 'age'. Please try again with a valid numerical expression\n", + "Chain failed for example 6de48a56-3f30-4aac-b6cf-eee4b05ad43f with inputs {'input': \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\"}\n", + "Error Type: ToolException, Message: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 72}]\n" ] } ], "source": [ "from langchain.smith import (\n", " arun_on_dataset,\n", - " run_on_dataset, # Available if your chain doesn't support async calls.\n", + " run_on_dataset, \n", ")\n", "\n", - "chain_results = await arun_on_dataset(\n", + "chain_results = run_on_dataset(\n", " client=client,\n", " dataset_name=dataset_name,\n", " llm_or_chain_factory=agent_factory,\n", @@ -448,6 +422,218 @@ "# These are logged as warnings here and captured as errors in the tracing UI." ] }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9da60638-5be8-4b5f-a721-2c6627aeaf0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
inputoutputreferenceembedding_cosine_distancecorrectnesshelpfulnessfifth-grader-score
78c959a4-467d-4469-8bd7-c5f0b059bc4a{'input': 'who is dua lipa's boyfriend? what i...{'Error': 'ValueError('LLMMathChain._evaluate(...{'output': 'Romain Gavras' age raised to the 0...NaNNaNNaNNaN
f8dfff24-d288-4d8e-ba94-c3cc33dd10d0{'input': 'what is dua lipa's boyfriend age ra...{'Error': 'ValueError('LLMMathChain._evaluate(...{'output': 'Approximately 4.9888126515157.'}NaNNaNNaNNaN
c78d5e84-3fbd-442f-affb-4b0e5806c439{'input': 'how far is it from paris to boston ...{'input': 'how far is it from paris to boston ...{'output': 'The distance from Paris to Boston ...0.0075771.01.01.0
02cadef9-5794-49a9-8e43-acca977cab60{'input': 'How many people live in canada as o...{'input': 'How many people live in canada as o...{'output': 'The current population of Canada a...0.0163241.01.01.0
e888a340-0486-4552-bb4b-911756e6bed7{'input': 'what was the total number of points...{'input': 'what was the total number of points...{'output': '3'}0.2250760.00.00.0
1b1f655b-754c-474d-8832-e6ec6bad3943{'input': 'what was the total number of points...{'input': 'what was the total number of points...{'output': 'The total number of points scored ...0.0115800.00.00.0
51f1b1f1-3b51-400f-b871-65f8a3a3c2d4{'input': 'how many more points were scored in...{'input': 'how many more points were scored in...{'output': '15'}0.2510021.01.01.0
83339364-0135-4efd-a24a-f3bd2a85e33a{'input': 'what is 153 raised to .1312 power?'}{'input': 'what is 153 raised to .1312 power?'...{'output': '1.9347796717823205'}0.1274411.01.01.0
6de48a56-3f30-4aac-b6cf-eee4b05ad43f{'input': 'who is kendall jenner's boyfriend? ...{'Error': 'ToolException(\"Too many arguments t...{'output': 'Bad Bunny's height raised to the p...NaNNaNNaNNaN
0c41cc28-9c07-4550-8940-68b58cbc045e{'input': 'what is 1213 divided by 4345?'}{'input': 'what is 1213 divided by 4345?', 'ou...{'output': '0.2791714614499425'}0.1445221.01.01.0
\n", + "
" + ], + "text/plain": [ + " input \\\n", + "78c959a4-467d-4469-8bd7-c5f0b059bc4a {'input': 'who is dua lipa's boyfriend? what i... \n", + "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 {'input': 'what is dua lipa's boyfriend age ra... \n", + "c78d5e84-3fbd-442f-affb-4b0e5806c439 {'input': 'how far is it from paris to boston ... \n", + "02cadef9-5794-49a9-8e43-acca977cab60 {'input': 'How many people live in canada as o... \n", + "e888a340-0486-4552-bb4b-911756e6bed7 {'input': 'what was the total number of points... \n", + "1b1f655b-754c-474d-8832-e6ec6bad3943 {'input': 'what was the total number of points... \n", + "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 {'input': 'how many more points were scored in... \n", + "83339364-0135-4efd-a24a-f3bd2a85e33a {'input': 'what is 153 raised to .1312 power?'} \n", + "6de48a56-3f30-4aac-b6cf-eee4b05ad43f {'input': 'who is kendall jenner's boyfriend? ... \n", + "0c41cc28-9c07-4550-8940-68b58cbc045e {'input': 'what is 1213 divided by 4345?'} \n", + "\n", + " output \\\n", + "78c959a4-467d-4469-8bd7-c5f0b059bc4a {'Error': 'ValueError('LLMMathChain._evaluate(... \n", + "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 {'Error': 'ValueError('LLMMathChain._evaluate(... \n", + "c78d5e84-3fbd-442f-affb-4b0e5806c439 {'input': 'how far is it from paris to boston ... \n", + "02cadef9-5794-49a9-8e43-acca977cab60 {'input': 'How many people live in canada as o... \n", + "e888a340-0486-4552-bb4b-911756e6bed7 {'input': 'what was the total number of points... \n", + "1b1f655b-754c-474d-8832-e6ec6bad3943 {'input': 'what was the total number of points... \n", + "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 {'input': 'how many more points were scored in... \n", + "83339364-0135-4efd-a24a-f3bd2a85e33a {'input': 'what is 153 raised to .1312 power?'... \n", + "6de48a56-3f30-4aac-b6cf-eee4b05ad43f {'Error': 'ToolException(\"Too many arguments t... \n", + "0c41cc28-9c07-4550-8940-68b58cbc045e {'input': 'what is 1213 divided by 4345?', 'ou... \n", + "\n", + " reference \\\n", + "78c959a4-467d-4469-8bd7-c5f0b059bc4a {'output': 'Romain Gavras' age raised to the 0... \n", + "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 {'output': 'Approximately 4.9888126515157.'} \n", + "c78d5e84-3fbd-442f-affb-4b0e5806c439 {'output': 'The distance from Paris to Boston ... \n", + "02cadef9-5794-49a9-8e43-acca977cab60 {'output': 'The current population of Canada a... \n", + "e888a340-0486-4552-bb4b-911756e6bed7 {'output': '3'} \n", + "1b1f655b-754c-474d-8832-e6ec6bad3943 {'output': 'The total number of points scored ... \n", + "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 {'output': '15'} \n", + "83339364-0135-4efd-a24a-f3bd2a85e33a {'output': '1.9347796717823205'} \n", + "6de48a56-3f30-4aac-b6cf-eee4b05ad43f {'output': 'Bad Bunny's height raised to the p... \n", + "0c41cc28-9c07-4550-8940-68b58cbc045e {'output': '0.2791714614499425'} \n", + "\n", + " embedding_cosine_distance correctness \\\n", + "78c959a4-467d-4469-8bd7-c5f0b059bc4a NaN NaN \n", + "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 NaN NaN \n", + "c78d5e84-3fbd-442f-affb-4b0e5806c439 0.007577 1.0 \n", + "02cadef9-5794-49a9-8e43-acca977cab60 0.016324 1.0 \n", + "e888a340-0486-4552-bb4b-911756e6bed7 0.225076 0.0 \n", + "1b1f655b-754c-474d-8832-e6ec6bad3943 0.011580 0.0 \n", + "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 0.251002 1.0 \n", + "83339364-0135-4efd-a24a-f3bd2a85e33a 0.127441 1.0 \n", + "6de48a56-3f30-4aac-b6cf-eee4b05ad43f NaN NaN \n", + "0c41cc28-9c07-4550-8940-68b58cbc045e 0.144522 1.0 \n", + "\n", + " helpfulness fifth-grader-score \n", + "78c959a4-467d-4469-8bd7-c5f0b059bc4a NaN NaN \n", + "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 NaN NaN \n", + "c78d5e84-3fbd-442f-affb-4b0e5806c439 1.0 1.0 \n", + "02cadef9-5794-49a9-8e43-acca977cab60 1.0 1.0 \n", + "e888a340-0486-4552-bb4b-911756e6bed7 0.0 0.0 \n", + "1b1f655b-754c-474d-8832-e6ec6bad3943 0.0 0.0 \n", + "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 1.0 1.0 \n", + "83339364-0135-4efd-a24a-f3bd2a85e33a 1.0 1.0 \n", + "6de48a56-3f30-4aac-b6cf-eee4b05ad43f NaN NaN \n", + "0c41cc28-9c07-4550-8940-68b58cbc045e 1.0 1.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain_results.to_dataframe()" + ] + }, { "cell_type": "markdown", "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4", @@ -474,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 18, "id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820", "metadata": { "tags": [] @@ -483,22 +669,22 @@ { "data": { "text/plain": [ - "Run(id=UUID('e39f310b-c5a8-4192-8a59-6a9498e1cb85'), name='AgentExecutor', start_time=datetime.datetime(2023, 7, 17, 18, 25, 30, 653872), run_type=, end_time=datetime.datetime(2023, 7, 17, 18, 25, 35, 359642), extra={'runtime': {'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.8', 'library_version': '0.0.231', 'runtime_version': '3.11.2'}, 'total_tokens': 512, 'prompt_tokens': 451, 'completion_tokens': 61}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-07-17T18:25:30.653872'}, {'name': 'end', 'time': '2023-07-17T18:25:35.359642'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('a75cf754-4f73-46fd-b126-9bcd0695e463'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('1c9baec3-ae86-4fac-9e99-e1b9f8e7818c'), child_run_ids=[UUID('40d0fdca-0b2b-47f4-a9da-f2b229aa4ed5'), UUID('cfa5130f-264c-4126-8950-ec1c4c31b800'), UUID('ba638a2f-2a57-45db-91e8-9a7a66a42c5a'), UUID('fcc29b5a-cdb7-4bcc-8194-47729bbdf5fb'), UUID('a6f92bf5-cfba-4747-9336-370cb00c928a'), UUID('65312576-5a39-4250-b820-4dfae7d73945')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'fifth-grader-score': {'n': 1, 'avg': 1.0, 'mode': 1}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361}})" + "Run(id=UUID('a6893e95-a9cc-43e0-b9fa-f471b0cfee83'), name='AgentExecutor', start_time=datetime.datetime(2023, 9, 13, 22, 34, 32, 177406), run_type='chain', end_time=datetime.datetime(2023, 9, 13, 22, 34, 37, 77740), extra={'runtime': {'cpu': {'time': {'sys': 3.153218304, 'user': 5.045262336}, 'percent': 0.0, 'ctx_switches': {'voluntary': 42164.0, 'involuntary': 0.0}}, 'mem': {'rss': 184205312.0}, 'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.26', 'thread_count': 58.0, 'library_version': '0.0.286', 'runtime_version': '3.11.2', 'langchain_version': '0.0.286', 'py_implementation': 'CPython'}}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-09-13T22:34:32.177406'}, {'name': 'end', 'time': '2023-09-13T22:34:37.077740'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('0c41cc28-9c07-4550-8940-68b58cbc045e'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('7865a050-467e-4c58-9322-58a26f182ecb'), child_run_ids=[UUID('37faef05-b6b3-4cb7-a6db-471425e69b46'), UUID('2d6a895f-de2c-4f7f-b5f1-ca876d38e530'), UUID('e7d145e3-74b0-4f32-9240-3e370becdf8f'), UUID('10db62c9-fe4f-4aba-959a-ad02cfadfa20'), UUID('8dc46a27-8ab9-4f33-9ec1-660ca73ebb4f'), UUID('eccd042e-dde0-4425-b62f-e855e25d6b64')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'fifth-grader-score': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361, 'is_all_model': True}}, app_path='/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/7865a050-467e-4c58-9322-58a26f182ecb/r/a6893e95-a9cc-43e0-b9fa-f471b0cfee83', manifest_id=None, status='success', prompt_tokens=None, completion_tokens=None, total_tokens=None, first_token_time=None, parent_run_ids=None)" ] }, - "execution_count": 10, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "runs = list(client.list_runs(dataset_name=dataset_name))\n", + "runs = list(client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1))\n", "runs[0]" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 22, "id": "6595c888-1f5c-4ae3-9390-0a559f5575d1", "metadata": { "tags": [] @@ -507,21 +693,17 @@ { "data": { "text/plain": [ - "{'correctness': {'n': 7, 'avg': 0.5714285714285714, 'mode': 1},\n", - " 'helpfulness': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n", - " 'fifth-grader-score': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n", - " 'embedding_cosine_distance': {'n': 7,\n", - " 'avg': 0.11462010799473926,\n", - " 'mode': 0.0130477459560272}}" + "TracerSessionResult(id=UUID('7865a050-467e-4c58-9322-58a26f182ecb'), start_time=datetime.datetime(2023, 9, 13, 22, 34, 10, 611846), name='test-dependable-stop-67', extra=None, tenant_id=UUID('ebbaf2eb-769b-4505-aca2-d11de10372a4'), run_count=None, latency_p50=None, latency_p99=None, total_tokens=None, prompt_tokens=None, completion_tokens=None, last_run_start_time=None, feedback_stats=None, reference_dataset_ids=None, run_facets=None)" ] }, - "execution_count": 11, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "client.read_project(project_id=runs[0].session_id).feedback_stats" + "# After some time, these will be populated.\n", + "client.read_project(project_name=chain_results[\"project_name\"]).feedback_stats" ] }, { diff --git a/libs/langchain/langchain/callbacks/tracers/evaluation.py b/libs/langchain/langchain/callbacks/tracers/evaluation.py index 1cf205e3d2843..0d333f9f0458a 100644 --- a/libs/langchain/langchain/callbacks/tracers/evaluation.py +++ b/libs/langchain/langchain/callbacks/tracers/evaluation.py @@ -7,7 +7,7 @@ from uuid import UUID import langsmith -from langsmith import schemas as langsmith_schemas +from langsmith.evaluation.evaluator import EvaluationResult from langchain.callbacks import manager from langchain.callbacks.tracers import langchain as langchain_tracer @@ -76,7 +76,7 @@ def __init__( self.futures: Set[Future] = set() self.skip_unfinished = skip_unfinished self.project_name = project_name - self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {} + self.logged_eval_results: Dict[str, List[EvaluationResult]] = {} def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None: """Evaluate the run in the project. @@ -91,11 +91,11 @@ def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> N """ try: if self.project_name is None: - feedback = self.client.evaluate_run(run, evaluator) + eval_result = self.client.evaluate_run(run, evaluator) with manager.tracing_v2_enabled( project_name=self.project_name, tags=["eval"], client=self.client ): - feedback = self.client.evaluate_run(run, evaluator) + eval_result = self.client.evaluate_run(run, evaluator) except Exception as e: logger.error( f"Error evaluating run {run.id} with " @@ -104,7 +104,7 @@ def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> N ) raise e example_id = str(run.reference_example_id) - self.logged_feedback.setdefault(example_id, []).append(feedback) + self.logged_eval_results.setdefault(example_id, []).append(eval_result) def _persist_run(self, run: Run) -> None: """Run the evaluator on the run. diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py index 0e2262875f637..8eea9ef26b199 100644 --- a/libs/langchain/langchain/smith/evaluation/runner_utils.py +++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py @@ -866,7 +866,8 @@ def _prepare_eval_run( f"Project {project_name} already exists. Please use a different name." ) print( - f"View the evaluation results for project '{project_name}' at:\n{project.url}" + f"View the evaluation results for project '{project_name}' at:\n{project.url}", + flush=True, ) dataset = client.read_dataset(dataset_name=dataset_name) examples = list(client.list_examples(dataset_id=dataset.id)) @@ -927,14 +928,14 @@ def _collect_test_results( project_name: str, ) -> TestResult: wait_for_all_tracers() - all_feedback = {} + all_eval_results = {} for c in configs: for callback in cast(list, c["callbacks"]): if isinstance(callback, EvaluatorCallbackHandler): - all_feedback.update(callback.logged_feedback) + all_eval_results.update(callback.logged_eval_results) results = {} for example, output in zip(examples, batch_results): - feedback = all_feedback.get(str(example.id), []) + feedback = all_eval_results.get(str(example.id), []) results[str(example.id)] = { "output": output, "input": example.inputs,