From ab025507bcbd4e51f494ce3e509cd77a943f4652 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Tue, 13 Feb 2024 04:19:50 +0100 Subject: [PATCH 01/25] community[patch]: Add async methods to VectorStoreQATool (#16949) --- .../tools/vectorstore/tool.py | 53 +++++++++++++++++-- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/libs/community/langchain_community/tools/vectorstore/tool.py b/libs/community/langchain_community/tools/vectorstore/tool.py index e51deaeaa1dec..c0cbdb670627f 100644 --- a/libs/community/langchain_community/tools/vectorstore/tool.py +++ b/libs/community/langchain_community/tools/vectorstore/tool.py @@ -3,7 +3,10 @@ import json from typing import Any, Dict, Optional -from langchain_core.callbacks import CallbackManagerForToolRun +from langchain_core.callbacks import ( + AsyncCallbackManagerForToolRun, + CallbackManagerForToolRun, +) from langchain_core.language_models import BaseLanguageModel from langchain_core.pydantic_v1 import BaseModel, Field from langchain_core.tools import BaseTool @@ -51,9 +54,30 @@ def _run( chain = RetrievalQA.from_chain_type( self.llm, retriever=self.vectorstore.as_retriever() ) - return chain.run( - query, callbacks=run_manager.get_child() if run_manager else None + return chain.invoke( + {chain.input_key: query}, + config={"callbacks": [run_manager.get_child() if run_manager else None]}, + )[chain.output_key] + + async def _arun( + self, + query: str, + run_manager: Optional[AsyncCallbackManagerForToolRun] = None, + ) -> str: + """Use the tool asynchronously.""" + from langchain.chains.retrieval_qa.base import RetrievalQA + + chain = RetrievalQA.from_chain_type( + self.llm, retriever=self.vectorstore.as_retriever() ) + return ( + await chain.ainvoke( + {chain.input_key: query}, + config={ + "callbacks": [run_manager.get_child() if run_manager else None] + }, + ) + )[chain.output_key] class VectorStoreQAWithSourcesTool(BaseVectorStoreTool, BaseTool): @@ -87,7 +111,28 @@ def _run( self.llm, retriever=self.vectorstore.as_retriever() ) return json.dumps( - chain( + chain.invoke( + {chain.question_key: query}, + return_only_outputs=True, + callbacks=run_manager.get_child() if run_manager else None, + ) + ) + + async def _arun( + self, + query: str, + run_manager: Optional[AsyncCallbackManagerForToolRun] = None, + ) -> str: + """Use the tool asynchronously.""" + from langchain.chains.qa_with_sources.retrieval import ( + RetrievalQAWithSourcesChain, + ) + + chain = RetrievalQAWithSourcesChain.from_chain_type( + self.llm, retriever=self.vectorstore.as_retriever() + ) + return json.dumps( + await chain.ainvoke( {chain.question_key: query}, return_only_outputs=True, callbacks=run_manager.get_child() if run_manager else None, From e135dc70c360c779a6d5d0ca7ae48e3dc5f51083 Mon Sep 17 00:00:00 2001 From: Robby <45851384+h0rv@users.noreply.github.com> Date: Mon, 12 Feb 2024 22:22:55 -0500 Subject: [PATCH 02/25] community[patch]: Invoke callback prior to yielding token (#17348) **Description:** Invoke callback prior to yielding token in stream method for Ollama. 
**Issue:** [Callback for on_llm_new_token should be invoked before the token is yielded by the model #16913](https://github.com/langchain-ai/langchain/issues/16913) Co-authored-by: Robby --- libs/community/langchain_community/chat_models/ollama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/chat_models/ollama.py b/libs/community/langchain_community/chat_models/ollama.py index 92b5afb52ba3f..73a194a90f9b7 100644 --- a/libs/community/langchain_community/chat_models/ollama.py +++ b/libs/community/langchain_community/chat_models/ollama.py @@ -313,12 +313,12 @@ def _stream( for stream_resp in self._create_chat_stream(messages, stop, **kwargs): if stream_resp: chunk = _chat_stream_response_to_chat_generation_chunk(stream_resp) - yield chunk if run_manager: run_manager.on_llm_new_token( chunk.text, verbose=self.verbose, ) + yield chunk except OllamaEndpointNotFoundError: yield from self._legacy_stream(messages, stop, **kwargs) @@ -332,12 +332,12 @@ async def _astream( async for stream_resp in self._acreate_chat_stream(messages, stop, **kwargs): if stream_resp: chunk = _chat_stream_response_to_chat_generation_chunk(stream_resp) - yield chunk if run_manager: await run_manager.on_llm_new_token( chunk.text, verbose=self.verbose, ) + yield chunk @deprecated("0.0.3", alternative="_stream") def _legacy_stream( @@ -351,9 +351,9 @@ def _legacy_stream( for stream_resp in self._create_generate_stream(prompt, stop, **kwargs): if stream_resp: chunk = _stream_response_to_chat_generation_chunk(stream_resp) - yield chunk if run_manager: run_manager.on_llm_new_token( chunk.text, verbose=self.verbose, ) + yield chunk From 30af711c3467d33ae38918005211baa33d4cb0b5 Mon Sep 17 00:00:00 2001 From: Lingzhen Chen <38908263+lz-chen@users.noreply.github.com> Date: Tue, 13 Feb 2024 04:23:35 +0100 Subject: [PATCH 03/25] community[patch]: update AzureSearch class to work with azure-search-documents=11.4.0 (#15659) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - **Description:** Updates `libs/community/langchain_community/vectorstores/azuresearch.py` to support the stable version `azure-search-documents=11.4.0` - **Issue:** https://github.com/langchain-ai/langchain/issues/14534, https://github.com/langchain-ai/langchain/issues/15039, https://github.com/langchain-ai/langchain/issues/15355 - **Dependencies:** azure-search-documents>=11.4.0 --------- Co-authored-by: Clément Tamines Co-authored-by: Bagatur --- .../vectorstores/azuresearch.py | 228 +++++++----------- 1 file changed, 81 insertions(+), 147 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/azuresearch.py b/libs/community/langchain_community/vectorstores/azuresearch.py index 9006985fa2db2..6e4aed8d22c19 100644 --- a/libs/community/langchain_community/vectorstores/azuresearch.py +++ b/libs/community/langchain_community/vectorstores/azuresearch.py @@ -37,14 +37,10 @@ CorsOptions, ScoringProfile, SearchField, + SemanticConfiguration, VectorSearch, ) - try: - from azure.search.documents.indexes.models import SemanticSearch - except ImportError: - from azure.search.documents.indexes.models import SemanticSettings # <11.4.0 - # Allow overriding field names for Azure Search FIELDS_ID = get_from_env( key="AZURESEARCH_FIELDS_ID", env_key="AZURESEARCH_FIELDS_ID", default="id" @@ -73,7 +69,7 @@ def _get_search_client( semantic_configuration_name: Optional[str] = None, fields: Optional[List[SearchField]] = None, vector_search: Optional[VectorSearch] = 
None, - semantic_settings: Optional[Union[SemanticSearch, SemanticSettings]] = None, + semantic_configurations: Optional[SemanticConfiguration] = None, scoring_profiles: Optional[List[ScoringProfile]] = None, default_scoring_profile: Optional[str] = None, default_fields: Optional[List[SearchField]] = None, @@ -86,30 +82,20 @@ def _get_search_client( from azure.search.documents import SearchClient from azure.search.documents.indexes import SearchIndexClient from azure.search.documents.indexes.models import ( + ExhaustiveKnnAlgorithmConfiguration, + ExhaustiveKnnParameters, + HnswAlgorithmConfiguration, + HnswParameters, SearchIndex, SemanticConfiguration, SemanticField, - VectorSearch, + SemanticPrioritizedFields, + SemanticSearch, + VectorSearchAlgorithmKind, + VectorSearchAlgorithmMetric, + VectorSearchProfile, ) - # class names changed for versions >= 11.4.0 - try: - from azure.search.documents.indexes.models import ( - HnswAlgorithmConfiguration, # HnswVectorSearchAlgorithmConfiguration is old - SemanticPrioritizedFields, # PrioritizedFields outdated - SemanticSearch, # SemanticSettings outdated - ) - - NEW_VERSION = True - except ImportError: - from azure.search.documents.indexes.models import ( - HnswVectorSearchAlgorithmConfiguration, - PrioritizedFields, - SemanticSettings, - ) - - NEW_VERSION = False - default_fields = default_fields or [] if key is None: credential = DefaultAzureCredential() @@ -155,77 +141,55 @@ def fmt_err(x: str) -> str: fields = default_fields # Vector search configuration if vector_search is None: - if NEW_VERSION: - # >= 11.4.0: - # VectorSearch(algorithm_configuration) --> VectorSearch(algorithms) - # HnswVectorSearchAlgorithmConfiguration --> HnswAlgorithmConfiguration - vector_search = VectorSearch( - algorithms=[ - HnswAlgorithmConfiguration( - name="default", - kind="hnsw", - parameters={ # type: ignore - "m": 4, - "efConstruction": 400, - "efSearch": 500, - "metric": "cosine", - }, - ) - ] - ) - else: # < 11.4.0 - vector_search = VectorSearch( - algorithm_configurations=[ - HnswVectorSearchAlgorithmConfiguration( - name="default", - kind="hnsw", - parameters={ # type: ignore - "m": 4, - "efConstruction": 400, - "efSearch": 500, - "metric": "cosine", - }, - ) - ] - ) + vector_search = VectorSearch( + algorithms=[ + HnswAlgorithmConfiguration( + name="default", + kind=VectorSearchAlgorithmKind.HNSW, + parameters=HnswParameters( + m=4, + ef_construction=400, + ef_search=500, + metric=VectorSearchAlgorithmMetric.COSINE, + ), + ), + ExhaustiveKnnAlgorithmConfiguration( + name="default_exhaustive_knn", + kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN, + parameters=ExhaustiveKnnParameters( + metric=VectorSearchAlgorithmMetric.COSINE + ), + ), + ], + profiles=[ + VectorSearchProfile( + name="myHnswProfile", + algorithm_configuration_name="default", + ), + VectorSearchProfile( + name="myExhaustiveKnnProfile", + algorithm_configuration_name="default_exhaustive_knn", + ), + ], + ) # Create the semantic settings with the configuration - if semantic_settings is None and semantic_configuration_name is not None: - if NEW_VERSION: - # <=11.4.0: SemanticSettings --> SemanticSearch - # PrioritizedFields(prioritized_content_fields) - # --> SemanticPrioritizedFields(content_fields) - semantic_settings = SemanticSearch( - configurations=[ - SemanticConfiguration( - name=semantic_configuration_name, - prioritized_fields=SemanticPrioritizedFields( - content_fields=[ - SemanticField(field_name=FIELDS_CONTENT) - ], - ), - ) - ] - ) - else: # < 11.4.0 - semantic_settings = 
SemanticSettings( - configurations=[ - SemanticConfiguration( - name=semantic_configuration_name, - prioritized_fields=PrioritizedFields( - prioritized_content_fields=[ - SemanticField(field_name=FIELDS_CONTENT) - ], - ), - ) - ] - ) + semantic_search = None + if semantic_configurations is None and semantic_configuration_name is not None: + semantic_configuration = SemanticConfiguration( + name=semantic_configuration_name, + prioritized_fields=SemanticPrioritizedFields( + content_fields=[SemanticField(field_name=FIELDS_CONTENT)], + ), + ) + semantic_search = SemanticSearch(configurations=[semantic_configuration]) + # Create the search index with the semantic settings and vector search index = SearchIndex( name=index_name, fields=fields, vector_search=vector_search, - semantic_settings=semantic_settings, + semantic_search=semantic_search, scoring_profiles=scoring_profiles, default_scoring_profile=default_scoring_profile, cors_options=cors_options, @@ -251,10 +215,9 @@ def __init__( embedding_function: Union[Callable, Embeddings], search_type: str = "hybrid", semantic_configuration_name: Optional[str] = None, - semantic_query_language: str = "en-us", fields: Optional[List[SearchField]] = None, vector_search: Optional[VectorSearch] = None, - semantic_settings: Optional[Union[SemanticSearch, SemanticSettings]] = None, + semantic_configurations: Optional[SemanticConfiguration] = None, scoring_profiles: Optional[List[ScoringProfile]] = None, default_scoring_profile: Optional[str] = None, cors_options: Optional[CorsOptions] = None, @@ -309,7 +272,7 @@ def __init__( semantic_configuration_name=semantic_configuration_name, fields=fields, vector_search=vector_search, - semantic_settings=semantic_settings, + semantic_configurations=semantic_configurations, scoring_profiles=scoring_profiles, default_scoring_profile=default_scoring_profile, default_fields=default_fields, @@ -318,7 +281,6 @@ def __init__( ) self.search_type = search_type self.semantic_configuration_name = semantic_configuration_name - self.semantic_query_language = semantic_query_language self.fields = fields if fields else default_fields @property @@ -451,39 +413,30 @@ def vector_search_with_score( Returns: List of Documents most similar to the query and score for each """ - from azure.search.documents.models import Vector + + from azure.search.documents.models import VectorizedQuery results = self.client.search( search_text="", - vectors=[ - Vector( - value=np.array(self.embed_query(query), dtype=np.float32).tolist(), - k=k, + vector_queries=[ + VectorizedQuery( + vector=np.array(self.embed_query(query), dtype=np.float32).tolist(), + k_nearest_neighbors=k, fields=FIELDS_CONTENT_VECTOR, ) ], filter=filters, + top=k, ) # Convert results to Document objects docs = [ ( Document( page_content=result.pop(FIELDS_CONTENT), - metadata={ - **( - {FIELDS_ID: result.pop(FIELDS_ID)} - if FIELDS_ID in result - else {} - ), - **( - json.loads(result[FIELDS_METADATA]) - if FIELDS_METADATA in result - else { - k: v - for k, v in result.items() - if k != FIELDS_CONTENT_VECTOR - } - ), + metadata=json.loads(result[FIELDS_METADATA]) + if FIELDS_METADATA in result + else { + k: v for k, v in result.items() if k != FIELDS_CONTENT_VECTOR }, ), float(result["@search.score"]), @@ -520,14 +473,14 @@ def hybrid_search_with_score( Returns: List of Documents most similar to the query and score for each """ - from azure.search.documents.models import Vector + from azure.search.documents.models import VectorizedQuery results = self.client.search( search_text=query, 
- vectors=[ - Vector( - value=np.array(self.embed_query(query), dtype=np.float32).tolist(), - k=k, + vector_queries=[ + VectorizedQuery( + vector=np.array(self.embed_query(query), dtype=np.float32).tolist(), + k_nearest_neighbors=k, fields=FIELDS_CONTENT_VECTOR, ) ], @@ -539,21 +492,10 @@ def hybrid_search_with_score( ( Document( page_content=result.pop(FIELDS_CONTENT), - metadata={ - **( - {FIELDS_ID: result.pop(FIELDS_ID)} - if FIELDS_ID in result - else {} - ), - **( - json.loads(result[FIELDS_METADATA]) - if FIELDS_METADATA in result - else { - k: v - for k, v in result.items() - if k != FIELDS_CONTENT_VECTOR - } - ), + metadata=json.loads(result[FIELDS_METADATA]) + if FIELDS_METADATA in result + else { + k: v for k, v in result.items() if k != FIELDS_CONTENT_VECTOR }, ), float(result["@search.score"]), @@ -610,20 +552,19 @@ def semantic_hybrid_search_with_score_and_rerank( Returns: List of Documents most similar to the query and score for each """ - from azure.search.documents.models import Vector + from azure.search.documents.models import VectorizedQuery results = self.client.search( search_text=query, - vectors=[ - Vector( - value=np.array(self.embed_query(query), dtype=np.float32).tolist(), - k=50, + vector_queries=[ + VectorizedQuery( + vector=np.array(self.embed_query(query), dtype=np.float32).tolist(), + k_nearest_neighbors=k, fields=FIELDS_CONTENT_VECTOR, ) ], filter=filters, query_type="semantic", - query_language=self.semantic_query_language, semantic_configuration_name=self.semantic_configuration_name, query_caption="extractive", query_answer="extractive", @@ -643,11 +584,6 @@ def semantic_hybrid_search_with_score_and_rerank( Document( page_content=result.pop(FIELDS_CONTENT), metadata={ - **( - {FIELDS_ID: result.pop(FIELDS_ID)} - if FIELDS_ID in result - else {} - ), **( json.loads(result[FIELDS_METADATA]) if FIELDS_METADATA in result @@ -667,9 +603,7 @@ def semantic_hybrid_search_with_score_and_rerank( if result.get("@search.captions") else {}, "answers": semantic_answers_dict.get( - json.loads(result[FIELDS_METADATA]).get("key") - if FIELDS_METADATA in result - else "", + json.loads(result["metadata"]).get("key"), "", ), }, From 4986e7227e8831df8a90c49ed4c6a3d53f9041d0 Mon Sep 17 00:00:00 2001 From: Zeeland Date: Tue, 13 Feb 2024 11:25:54 +0800 Subject: [PATCH 04/25] docs: rm unnecessary imports (#16876) - **Description:** optimize the document of memory usage - **Issue:** it lose some install guide --- docs/docs/integrations/memory/aws_dynamodb.ipynb | 2 -- .../memory/mongodb_chat_message_history.ipynb | 2 -- docs/docs/integrations/memory/remembrall.md | 6 ++++++ .../memory/sql_chat_message_history.ipynb | 14 +++----------- docs/docs/integrations/memory/sqlite.ipynb | 2 -- 5 files changed, 9 insertions(+), 17 deletions(-) diff --git a/docs/docs/integrations/memory/aws_dynamodb.ipynb b/docs/docs/integrations/memory/aws_dynamodb.ipynb index 32001587bfbbc..9410bbe024c54 100644 --- a/docs/docs/integrations/memory/aws_dynamodb.ipynb +++ b/docs/docs/integrations/memory/aws_dynamodb.ipynb @@ -274,8 +274,6 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Optional\n", - "\n", "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n", "from langchain_core.runnables.history import RunnableWithMessageHistory\n", "from langchain_openai import ChatOpenAI" diff --git a/docs/docs/integrations/memory/mongodb_chat_message_history.ipynb b/docs/docs/integrations/memory/mongodb_chat_message_history.ipynb index 8356154fcda4c..88771c7680fce 100644 --- 
a/docs/docs/integrations/memory/mongodb_chat_message_history.ipynb +++ b/docs/docs/integrations/memory/mongodb_chat_message_history.ipynb @@ -133,8 +133,6 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Optional\n", - "\n", "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n", "from langchain_core.runnables.history import RunnableWithMessageHistory\n", "from langchain_openai import ChatOpenAI" diff --git a/docs/docs/integrations/memory/remembrall.md b/docs/docs/integrations/memory/remembrall.md index 9cc03fd75e788..8f8b818785947 100644 --- a/docs/docs/integrations/memory/remembrall.md +++ b/docs/docs/integrations/memory/remembrall.md @@ -16,6 +16,12 @@ To get started, [sign in with Github on the Remembrall platform](https://remembr Any request that you send with the modified `openai_api_base` (see below) and Remembrall API key will automatically be tracked in the Remembrall dashboard. You **never** have to share your OpenAI key with our platform and this information is **never** stored by the Remembrall systems. +To do this, we need to install the following dependencies: + +```bash +pip install -U langchain-openai +``` + ### Enable Long Term Memory In addition to setting the `openai_api_base` and Remembrall API key via `x-gp-api-key`, you should specify a UID to maintain memory for. This will usually be a unique user identifier (like email). diff --git a/docs/docs/integrations/memory/sql_chat_message_history.ipynb b/docs/docs/integrations/memory/sql_chat_message_history.ipynb index 859399156724e..e8cca612624a6 100644 --- a/docs/docs/integrations/memory/sql_chat_message_history.ipynb +++ b/docs/docs/integrations/memory/sql_chat_message_history.ipynb @@ -26,7 +26,7 @@ "The integration lives in the `langchain-community` package, so we need to install that. 
We also need to install the `SQLAlchemy` package.\n", "\n", "```bash\n", - "pip install -U langchain-community SQLAlchemy\n", + "pip install -U langchain-community SQLAlchemy langchain-openai\n", "```" ] }, @@ -71,10 +71,7 @@ "end_time": "2023-08-28T10:04:38.077748Z", "start_time": "2023-08-28T10:04:36.105894Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [], "source": [ @@ -97,10 +94,7 @@ "end_time": "2023-08-28T10:04:38.929396Z", "start_time": "2023-08-28T10:04:38.915727Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -137,8 +131,6 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Optional\n", - "\n", "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n", "from langchain_core.runnables.history import RunnableWithMessageHistory\n", "from langchain_openai import ChatOpenAI" diff --git a/docs/docs/integrations/memory/sqlite.ipynb b/docs/docs/integrations/memory/sqlite.ipynb index 200335cc4df4e..21dde4e114bf4 100644 --- a/docs/docs/integrations/memory/sqlite.ipynb +++ b/docs/docs/integrations/memory/sqlite.ipynb @@ -119,8 +119,6 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Optional\n", - "\n", "from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder\n", "from langchain_core.runnables.history import RunnableWithMessageHistory\n", "from langchain_openai import ChatOpenAI" From 604e11741164bb6f6553387a6f14cf737aee607a Mon Sep 17 00:00:00 2001 From: Max Jakob Date: Tue, 13 Feb 2024 04:29:54 +0100 Subject: [PATCH 05/25] docs: another auth method for ElasticsearchStore (#16831) Users can also use their own Elasticsearch client object to configure the connection. --- .../vectorstores/elasticsearch.ipynb | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/docs/integrations/vectorstores/elasticsearch.ipynb b/docs/docs/integrations/vectorstores/elasticsearch.ipynb index 9032e698c2a1c..c36c6f65ee632 100644 --- a/docs/docs/integrations/vectorstores/elasticsearch.ipynb +++ b/docs/docs/integrations/vectorstores/elasticsearch.ipynb @@ -92,6 +92,28 @@ " )\n", "```\n", "\n", + "You can also use an `Elasticsearch` client object that gives you more flexibility, for example to configure the maximum number of retries.\n", + "\n", + "Example:\n", + "```python\n", + " import elasticsearch\n", + " from langchain_community.vectorstores import ElasticsearchStore\n", + "\n", + " es_client= elasticsearch.Elasticsearch(\n", + " hosts=[\"http://localhost:9200\"],\n", + " es_user=\"elastic\",\n", + " es_password=\"changeme\"\n", + " max_retries=10,\n", + " )\n", + "\n", + " embedding = OpenAIEmbeddings()\n", + " elastic_vector_search = ElasticsearchStore(\n", + " index_name=\"test_index\",\n", + " es_connection=es_client,\n", + " embedding=embedding,\n", + " )\n", + "```\n", + "\n", "#### How to obtain a password for the default \"elastic\" user?\n", "\n", "To obtain your Elastic Cloud password for the default \"elastic\" user:\n", @@ -134,7 +156,7 @@ "id": "ea167a29", "metadata": {}, "source": [ - "We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key." + "To use the `OpenAIEmbeddings` we have to configure the OpenAI API Key in the environment." 
] }, { From 64938ae6f213fa1dac7813968fe922bdcae795df Mon Sep 17 00:00:00 2001 From: James Braza Date: Mon, 12 Feb 2024 19:39:58 -0800 Subject: [PATCH 06/25] infra: unit testing `check_package_version` (#16825) Wrote a unit test for `check_package_version` in the core package. Note that this is a revival of https://github.com/langchain-ai/langchain/pull/16387 after GitHub incident (see https://github.com/langchain-ai/langchain/discussions/16796). --- .../core/tests/unit_tests/utils/test_utils.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 libs/core/tests/unit_tests/utils/test_utils.py diff --git a/libs/core/tests/unit_tests/utils/test_utils.py b/libs/core/tests/unit_tests/utils/test_utils.py new file mode 100644 index 0000000000000..e6dedb2a5a86b --- /dev/null +++ b/libs/core/tests/unit_tests/utils/test_utils.py @@ -0,0 +1,30 @@ +from typing import Dict, Optional, Tuple, Type +from unittest.mock import patch + +import pytest + +from langchain_core.utils import check_package_version + + +@pytest.mark.parametrize( + ("package", "check_kwargs", "actual_version", "expected"), + [ + ("stub", {"gt_version": "0.1"}, "0.1.2", None), + ("stub", {"gt_version": "0.1.2"}, "0.1.12", None), + ("stub", {"gt_version": "0.1.2"}, "0.1.2", (ValueError, "> 0.1.2")), + ("stub", {"gte_version": "0.1"}, "0.1.2", None), + ("stub", {"gte_version": "0.1.2"}, "0.1.2", None), + ], +) +def test_check_package_version( + package: str, + check_kwargs: Dict[str, Optional[str]], + actual_version: str, + expected: Optional[Tuple[Type[Exception], str]], +) -> None: + with patch("langchain_core.utils.utils.version", return_value=actual_version): + if expected is None: + check_package_version(package, **check_kwargs) + else: + with pytest.raises(expected[0], match=expected[1]): + check_package_version(package, **check_kwargs) From e3b775e0357d835475e932bdf83c4417c1e7ca98 Mon Sep 17 00:00:00 2001 From: Bhupesh Varshney Date: Tue, 13 Feb 2024 09:13:41 +0530 Subject: [PATCH 07/25] infra: make `.gitignore` consistent with standard python gitignore (#16828) - The new .gitignore version is inherited from the one maintained by the github community over at https://github.com/github/gitignore/blob/main/Python.gitignore - This should cover all the cases of how a langchain app can be used. --- libs/cli/.gitignore | 161 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 2 deletions(-) diff --git a/libs/cli/.gitignore b/libs/cli/.gitignore index 1694d2bc1e914..68bc17f9ff210 100644 --- a/libs/cli/.gitignore +++ b/libs/cli/.gitignore @@ -1,3 +1,160 @@ -dist -__pycache__ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env .venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ From c454dc36fce1e44a8c5ea7fcc17d7b030fd4a1de Mon Sep 17 00:00:00 2001 From: yin1991 <84140478+xiaokuili@users.noreply.github.com> Date: Tue, 13 Feb 2024 11:48:29 +0800 Subject: [PATCH 08/25] community[proxy]: Enhancement/add proxy support playwrighturlloader 16751 (#16822) - **Description:** Enhancement/add proxy support playwrighturlloader 16751 - **Issue:** [Enhancement: Add Proxy Support to PlaywrightURLLoader Class](https://github.com/langchain-ai/langchain/issues/16751) - **Dependencies:** - **Twitter handle:** @ootR77013489 --------- Co-authored-by: root Co-authored-by: Bagatur --- .../document_loaders/url_playwright.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/url_playwright.py b/libs/community/langchain_community/document_loaders/url_playwright.py index 8071d3717f726..106f15cee6f61 100644 --- a/libs/community/langchain_community/document_loaders/url_playwright.py +++ b/libs/community/langchain_community/document_loaders/url_playwright.py @@ -2,7 +2,7 @@ """ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional from langchain_core.documents import Document @@ -111,6 +111,22 @@ class PlaywrightURLLoader(BaseLoader): urls (List[str]): List of URLs to load. continue_on_failure (bool): If True, continue loading other URLs on failure. headless (bool): If True, the browser will run in headless mode. + proxy (Optional[Dict[str, str]]): If set, the browser will access URLs + through the specified proxy. + + Example: + .. code-block:: python + + from langchain_community.document_loaders import PlaywrightURLLoader + + urls = ["https://api.ipify.org/?format=json",] + proxy={ + "server": "https://xx.xx.xx:15818", # https://: + "username": "username", + "password": "password" + } + loader = PlaywrightURLLoader(urls, proxy=proxy) + data = loader.load() """ def __init__( @@ -120,6 +136,7 @@ def __init__( headless: bool = True, remove_selectors: Optional[List[str]] = None, evaluator: Optional[PlaywrightEvaluator] = None, + proxy: Optional[Dict[str, str]] = None, ): """Load a list of URLs using Playwright.""" try: @@ -133,6 +150,7 @@ def __init__( self.urls = urls self.continue_on_failure = continue_on_failure self.headless = headless + self.proxy = proxy if remove_selectors and evaluator: raise ValueError( @@ -153,7 +171,7 @@ def load(self) -> List[Document]: docs: List[Document] = list() with sync_playwright() as p: - browser = p.chromium.launch(headless=self.headless) + browser = p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: try: page = browser.new_page() @@ -186,7 +204,7 @@ async def aload(self) -> List[Document]: docs: List[Document] = list() async with async_playwright() as p: - browser = await p.chromium.launch(headless=self.headless) + browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: try: page = await browser.new_page() From 722aae4fd1fb2a23647d71e6c43b84ec36d77683 Mon Sep 17 00:00:00 2001 From: morgana Date: Mon, 12 Feb 2024 19:50:20 -0800 Subject: [PATCH 09/25] community: add delete method to rocksetdb vectorstore to support recordmanager (#17030) - **Description:** This adds a delete method so that rocksetdb can be used with `RecordManager`. 
- **Issue:** N/A - **Dependencies:** N/A - **Twitter handle:** `@_morgan_adams_` --------- Co-authored-by: Rockset API Bot --- .../docs/modules/data_connection/indexing.ipynb | 2 +- .../vectorstores/rocksetdb.py | 17 +++++++++++++++++ .../vectorstores/test_rocksetdb.py | 16 ++++++++++++++++ .../vectorstores/test_indexing_docs.py | 1 + 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/docs/docs/modules/data_connection/indexing.ipynb b/docs/docs/modules/data_connection/indexing.ipynb index b888a77958b87..45a5d92a42bc9 100644 --- a/docs/docs/modules/data_connection/indexing.ipynb +++ b/docs/docs/modules/data_connection/indexing.ipynb @@ -60,7 +60,7 @@ " * document addition by id (`add_documents` method with `ids` argument)\n", " * delete by id (`delete` method with `ids` argument)\n", "\n", - "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n", + "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n", " \n", "## Caution\n", "\n", diff --git a/libs/community/langchain_community/vectorstores/rocksetdb.py b/libs/community/langchain_community/vectorstores/rocksetdb.py index 992f53db081dd..0e94089a0cddf 100644 --- a/libs/community/langchain_community/vectorstores/rocksetdb.py +++ b/libs/community/langchain_community/vectorstores/rocksetdb.py @@ -6,6 +6,7 @@ from langchain_core.documents import Document from langchain_core.embeddings import Embeddings +from langchain_core.runnables import run_in_executor from langchain_core.vectorstores import VectorStore logger = logging.getLogger(__name__) @@ -332,3 +333,19 @@ def delete_texts(self, ids: List[str]) -> None: data=[DeleteDocumentsRequestData(id=i) for i in ids], workspace=self._workspace, ) + + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: + try: + if ids is None: + ids = [] + self.delete_texts(ids) + except Exception as e: + logger.error("Exception when deleting docs from Rockset: %s\n", e) + return False + + return True + + async def adelete( + self, ids: Optional[List[str]] = None, **kwargs: Any + ) -> Optional[bool]: + return await run_in_executor(None, self.delete, ids, **kwargs) diff --git a/libs/community/tests/integration_tests/vectorstores/test_rocksetdb.py b/libs/community/tests/integration_tests/vectorstores/test_rocksetdb.py index 25f56d522464b..b4c79b610718f 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_rocksetdb.py +++ b/libs/community/tests/integration_tests/vectorstores/test_rocksetdb.py @@ -155,3 +155,19 @@ def test_build_query_sql_with_where(self) -> None: LIMIT 4 """ assert q_str == expected + + def test_add_documents_and_delete(self) -> None: + """ "add_documents" and "delete" are requirements to support use + with RecordManager""" + + texts = ["foo", "bar", "baz"] + metadatas = [{"metadata_index": i} for i in range(len(texts))] + + 
_docs = zip(texts, metadatas) + docs = [Document(page_content=pc, metadata=i) for pc, i in _docs] + + ids = self.rockset_vectorstore.add_documents(docs) + assert len(ids) == len(texts) + + deleted = self.rockset_vectorstore.delete(ids) + assert deleted diff --git a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py index 1232d6bb9a681..85c5312d1f924 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py +++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py @@ -68,6 +68,7 @@ def check_compatibility(vector_store: VectorStore) -> bool: "Pinecone", "Qdrant", "Redis", + "Rockset", "ScaNN", "SemaDB", "SupabaseVectorStore", From f9f5626ca48d56d875444504e4745d2b059965fc Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 13 Feb 2024 16:50:59 +1300 Subject: [PATCH 10/25] community[patch]: Fix github search issues and PRs PaginatedList has no len() error (#16806) **Description:** Bugfix: Langchain_community's GitHub Api wrapper throws a TypeError when searching for issues and/or PRs (the `search_issues_and_prs` method). This is because PyGithub's PageinatedList type does not support the len() method. See https://github.com/PyGithub/PyGithub/issues/1476 ![image](https://github.com/langchain-ai/langchain/assets/8849021/57390b11-ed41-4f48-ba50-f3028610789c) **Dependencies:** None **Twitter handle**: @ChrisKeoghNZ I haven't registered an issue as it would take me longer to fill the template out than to make the fix, but I'm happy to if that's deemed essential. I've added a simple integration test to cover this as there were no existing unit tests and it was going to be tricky to set them up. Co-authored-by: Chris Keogh --- libs/community/langchain_community/utilities/github.py | 2 +- .../tests/integration_tests/utilities/test_github.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/utilities/github.py b/libs/community/langchain_community/utilities/github.py index 6c2950dcc99b7..410fdb494b7c3 100644 --- a/libs/community/langchain_community/utilities/github.py +++ b/libs/community/langchain_community/utilities/github.py @@ -735,7 +735,7 @@ def search_issues_and_prs(self, query: str) -> str: str: A string containing the first 5 issues and pull requests """ search_result = self.github.search_issues(query, repo=self.github_repository) - max_items = min(5, len(search_result)) + max_items = min(5, search_result.totalCount) results = [f"Top {max_items} results:"] for issue in search_result[:max_items]: results.append( diff --git a/libs/community/tests/integration_tests/utilities/test_github.py b/libs/community/tests/integration_tests/utilities/test_github.py index 6e21a8a9c1469..77b87ec64f786 100644 --- a/libs/community/tests/integration_tests/utilities/test_github.py +++ b/libs/community/tests/integration_tests/utilities/test_github.py @@ -19,3 +19,9 @@ def test_get_open_issues(api_client: GitHubAPIWrapper) -> None: """Basic test to fetch issues""" issues = api_client.get_issues() assert len(issues) != 0 + + +def test_search_issues_and_prs(api_client: GitHubAPIWrapper) -> None: + """Basic test to search issues and PRs""" + results = api_client.search_issues_and_prs("is:pr is:merged") + assert len(results) != 0 From cd00a87db732687d0d6a9ffdc1eb9f6f5ebc49b6 Mon Sep 17 00:00:00 2001 From: Kapil Sachdeva Date: Mon, 12 Feb 2024 21:51:55 -0600 Subject: [PATCH 11/25] community[patch] - in FAISS vector store, support passing custom 
DocStore implementation when using from_xxx methods (#16801) - **Description:** The from__xx methods of FAISS class have hardcoded InMemoryStore implementation and thereby not let users pass a custom DocStore implementation, - **Issue:** no referenced issue, - **Dependencies:** none, - **Twitter handle:** ksachdeva --- libs/community/langchain_community/vectorstores/faiss.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/faiss.py b/libs/community/langchain_community/vectorstores/faiss.py index 1617a8c36bb4f..4a7d65f9120a9 100644 --- a/libs/community/langchain_community/vectorstores/faiss.py +++ b/libs/community/langchain_community/vectorstores/faiss.py @@ -921,11 +921,13 @@ def __from( else: # Default to L2, currently other metric types not initialized. index = faiss.IndexFlatL2(len(embeddings[0])) + docstore = kwargs.pop("docstore", InMemoryDocstore()) + index_to_docstore_id = kwargs.pop("index_to_docstore_id", {}) vecstore = cls( embedding, index, - InMemoryDocstore(), - {}, + docstore, + index_to_docstore_id, normalize_L2=normalize_L2, distance_strategy=distance_strategy, **kwargs, From 1987f905ed72088e77884d954993d609aaf62823 Mon Sep 17 00:00:00 2001 From: Theo / Taeyoon Kang <52947381+TykanN@users.noreply.github.com> Date: Tue, 13 Feb 2024 12:57:20 +0900 Subject: [PATCH 12/25] core[patch]: Support .yml extension for YAML (#16783) - **Description:** [AS-IS] When dealing with a yaml file, the extension must be .yaml. [TO-BE] In the absence of extension length constraints in the OS, the extension of the YAML file is yaml, but control over the yml extension must still be made. It's as if it's an error because it's a .jpg extension in jpeg support. - **Issue:** - - **Dependencies:** no dependencies required for this change, --- libs/community/langchain_community/llms/loading.py | 2 +- libs/core/langchain_core/language_models/llms.py | 2 +- libs/core/langchain_core/prompts/base.py | 2 +- libs/core/langchain_core/prompts/loading.py | 2 +- .../langchain_experimental/data_anonymizer/presidio.py | 4 ++-- libs/experimental/langchain_experimental/prompts/load.py | 2 +- libs/langchain/langchain/agents/agent.py | 4 ++-- libs/langchain/langchain/chains/base.py | 2 +- libs/langchain/langchain/chains/loading.py | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libs/community/langchain_community/llms/loading.py b/libs/community/langchain_community/llms/loading.py index c9621b784df60..f410f7b7c6c2a 100644 --- a/libs/community/langchain_community/llms/loading.py +++ b/libs/community/langchain_community/llms/loading.py @@ -35,7 +35,7 @@ def load_llm(file: Union[str, Path]) -> BaseLLM: if file_path.suffix == ".json": with open(file_path) as f: config = json.load(f) - elif file_path.suffix == ".yaml": + elif file_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "r") as f: config = yaml.safe_load(f) else: diff --git a/libs/core/langchain_core/language_models/llms.py b/libs/core/langchain_core/language_models/llms.py index ec2af1b91e088..36d9435325146 100644 --- a/libs/core/langchain_core/language_models/llms.py +++ b/libs/core/langchain_core/language_models/llms.py @@ -1120,7 +1120,7 @@ def save(self, file_path: Union[Path, str]) -> None: if save_path.suffix == ".json": with open(file_path, "w") as f: json.dump(prompt_dict, f, indent=4) - elif save_path.suffix == ".yaml": + elif save_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "w") as f: yaml.dump(prompt_dict, f, 
default_flow_style=False) else: diff --git a/libs/core/langchain_core/prompts/base.py b/libs/core/langchain_core/prompts/base.py index 51eddf0f7718c..96cfbf63740e1 100644 --- a/libs/core/langchain_core/prompts/base.py +++ b/libs/core/langchain_core/prompts/base.py @@ -221,7 +221,7 @@ def save(self, file_path: Union[Path, str]) -> None: if save_path.suffix == ".json": with open(file_path, "w") as f: json.dump(prompt_dict, f, indent=4) - elif save_path.suffix == ".yaml": + elif save_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "w") as f: yaml.dump(prompt_dict, f, default_flow_style=False) else: diff --git a/libs/core/langchain_core/prompts/loading.py b/libs/core/langchain_core/prompts/loading.py index dd6e0c35478d3..baa56a5666d72 100644 --- a/libs/core/langchain_core/prompts/loading.py +++ b/libs/core/langchain_core/prompts/loading.py @@ -146,7 +146,7 @@ def _load_prompt_from_file(file: Union[str, Path]) -> BasePromptTemplate: if file_path.suffix == ".json": with open(file_path) as f: config = json.load(f) - elif file_path.suffix == ".yaml": + elif file_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "r") as f: config = yaml.safe_load(f) else: diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index f9b35788b08be..fbfad3703e731 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -428,7 +428,7 @@ def save_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None: if save_path.suffix == ".json": with open(save_path, "w") as f: json.dump(self.deanonymizer_mapping, f, indent=2) - elif save_path.suffix == ".yaml": + elif save_path.suffix.endswith((".yaml", ".yml")): with open(save_path, "w") as f: yaml.dump(self.deanonymizer_mapping, f, default_flow_style=False) @@ -452,7 +452,7 @@ def load_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None: if load_path.suffix == ".json": with open(load_path, "r") as f: loaded_mapping = json.load(f) - elif load_path.suffix == ".yaml": + elif load_path.suffix.endswith((".yaml", ".yml")): with open(load_path, "r") as f: loaded_mapping = yaml.load(f, Loader=yaml.FullLoader) diff --git a/libs/experimental/langchain_experimental/prompts/load.py b/libs/experimental/langchain_experimental/prompts/load.py index a7322565778b2..51efdc98db5c5 100644 --- a/libs/experimental/langchain_experimental/prompts/load.py +++ b/libs/experimental/langchain_experimental/prompts/load.py @@ -30,7 +30,7 @@ def _load_prompt_from_file(file: Union[str, Path]) -> BasePromptTemplate: if file_path.suffix == ".json": with open(file_path) as f: config = json.load(f) - elif file_path.suffix == ".yaml": + elif file_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "r") as f: config = yaml.safe_load(f) elif file_path.suffix == ".py": diff --git a/libs/langchain/langchain/agents/agent.py b/libs/langchain/langchain/agents/agent.py index 8bd7b6d478ab5..e6f9df4939853 100644 --- a/libs/langchain/langchain/agents/agent.py +++ b/libs/langchain/langchain/agents/agent.py @@ -185,7 +185,7 @@ def save(self, file_path: Union[Path, str]) -> None: if save_path.suffix == ".json": with open(file_path, "w") as f: json.dump(agent_dict, f, indent=4) - elif save_path.suffix == ".yaml": + elif save_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "w") as f: yaml.dump(agent_dict, f, default_flow_style=False) else: @@ -310,7 +310,7 @@ def 
save(self, file_path: Union[Path, str]) -> None: if save_path.suffix == ".json": with open(file_path, "w") as f: json.dump(agent_dict, f, indent=4) - elif save_path.suffix == ".yaml": + elif save_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "w") as f: yaml.dump(agent_dict, f, default_flow_style=False) else: diff --git a/libs/langchain/langchain/chains/base.py b/libs/langchain/langchain/chains/base.py index dc665ae9978c4..1bb6d1636b242 100644 --- a/libs/langchain/langchain/chains/base.py +++ b/libs/langchain/langchain/chains/base.py @@ -687,7 +687,7 @@ def save(self, file_path: Union[Path, str]) -> None: if save_path.suffix == ".json": with open(file_path, "w") as f: json.dump(chain_dict, f, indent=4) - elif save_path.suffix == ".yaml": + elif save_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "w") as f: yaml.dump(chain_dict, f, default_flow_style=False) else: diff --git a/libs/langchain/langchain/chains/loading.py b/libs/langchain/langchain/chains/loading.py index 0675b87141ac7..0fc3da3f499ae 100644 --- a/libs/langchain/langchain/chains/loading.py +++ b/libs/langchain/langchain/chains/loading.py @@ -607,7 +607,7 @@ def _load_chain_from_file(file: Union[str, Path], **kwargs: Any) -> Chain: if file_path.suffix == ".json": with open(file_path) as f: config = json.load(f) - elif file_path.suffix == ".yaml": + elif file_path.suffix.endswith((".yaml", ".yml")): with open(file_path, "r") as f: config = yaml.safe_load(f) else: From 1fdd9bd980e09d9f80a0ded6aacfbd7329f66ce0 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Tue, 13 Feb 2024 04:58:34 +0100 Subject: [PATCH 13/25] community/SQLDatabase: Generalize and trim software tests (#16659) - **Description:** Improve test cases for `SQLDatabase` adapter component, see [suggestion](https://github.com/langchain-ai/langchain/pull/16655#pullrequestreview-1846749474). - **Depends on:** GH-16655 - **Addressed to:** @baskaryan, @cbornet, @eyurtsev _Remark: This PR is stacked upon GH-16655, so that one will need to go in first._ Edit: Thank you for bringing in GH-17191, @eyurtsev. This is a little aftermath, improving/streamlining the corresponding test cases. 
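
For illustration, the refactor below centralizes the per-test `create_engine` / `metadata_obj.create_all` boilerplate into shared pytest fixtures. A trimmed-down sketch of that pattern follows; the single-table schema and the loose assertion are simplifications made here for brevity, not the exact contents of the test module:

```python
import pytest
import sqlalchemy as sa

from langchain_community.utilities.sql_database import SQLDatabase

# Simplified schema for illustration; the real tests define a richer
# metadata_obj (user/company tables) at module level.
metadata_obj = sa.MetaData()
user = sa.Table(
    "user",
    metadata_obj,
    sa.Column("user_id", sa.Integer, primary_key=True),
    sa.Column("user_name", sa.String(16)),
)


@pytest.fixture
def engine() -> sa.Engine:
    # Fresh in-memory SQLite engine for every test.
    return sa.create_engine("sqlite:///:memory:")


@pytest.fixture
def db(engine: sa.Engine) -> SQLDatabase:
    # Create the schema and hand each test a ready-to-use SQLDatabase.
    metadata_obj.create_all(engine)
    return SQLDatabase(engine)


def test_run_fetch_all(db: SQLDatabase) -> None:
    # Provision through the adapter itself, then query and verify.
    db._execute(sa.insert(user).values(user_id=13, user_name="Harrison"))
    output = db.run("select user_name from user where user_id = 13")
    assert "Harrison" in output
```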
--- .../tests/unit_tests/test_sql_database.py | 95 ++++++++++--------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/libs/community/tests/unit_tests/test_sql_database.py b/libs/community/tests/unit_tests/test_sql_database.py index 42aaef7134f53..4293265a1defd 100644 --- a/libs/community/tests/unit_tests/test_sql_database.py +++ b/libs/community/tests/unit_tests/test_sql_database.py @@ -10,7 +10,6 @@ String, Table, Text, - create_engine, insert, select, ) @@ -35,11 +34,19 @@ ) -def test_table_info() -> None: - """Test that table info is constructed properly.""" - engine = create_engine("sqlite:///:memory:") +@pytest.fixture +def engine() -> sa.Engine: + return sa.create_engine("sqlite:///:memory:") + + +@pytest.fixture +def db(engine: sa.Engine) -> SQLDatabase: metadata_obj.create_all(engine) - db = SQLDatabase(engine) + return SQLDatabase(engine) + + +def test_table_info(db: SQLDatabase) -> None: + """Test that table info is constructed properly.""" output = db.table_info expected_output = """ CREATE TABLE user ( @@ -68,20 +75,19 @@ def test_table_info() -> None: assert sorted(" ".join(output.split())) == sorted(" ".join(expected_output.split())) -def test_table_info_w_sample_rows() -> None: +def test_table_info_w_sample_rows(db: SQLDatabase) -> None: """Test that table info is constructed properly.""" - engine = create_engine("sqlite:///:memory:") - metadata_obj.create_all(engine) + + # Provision. values = [ {"user_id": 13, "user_name": "Harrison", "user_bio": "bio"}, {"user_id": 14, "user_name": "Chase", "user_bio": "bio"}, ] stmt = insert(user).values(values) - with engine.begin() as conn: - conn.execute(stmt) - - db = SQLDatabase(engine, sample_rows_in_table_info=2) + db._execute(stmt) + # Query and verify. + db = SQLDatabase(db._engine, sample_rows_in_table_info=2) output = db.table_info expected_output = """ @@ -112,16 +118,16 @@ def test_table_info_w_sample_rows() -> None: assert sorted(output.split()) == sorted(expected_output.split()) -def test_sql_database_run_fetch_all() -> None: +def test_sql_database_run_fetch_all(db: SQLDatabase) -> None: """Verify running SQL expressions returning results as strings.""" - engine = create_engine("sqlite:///:memory:") - metadata_obj.create_all(engine) + + # Provision. stmt = insert(user).values( user_id=13, user_name="Harrison", user_bio="That is my Bio " * 24 ) - with engine.begin() as conn: - conn.execute(stmt) - db = SQLDatabase(engine) + db._execute(stmt) + + # Query and verify. command = "select user_id, user_name, user_bio from user where user_id = 13" partial_output = db.run(command) user_bio = "That is my Bio " * 19 + "That is my..." @@ -135,60 +141,57 @@ def test_sql_database_run_fetch_all() -> None: assert full_output == expected_full_output -def test_sql_database_run_fetch_result() -> None: +def test_sql_database_run_fetch_result(db: SQLDatabase) -> None: """Verify running SQL expressions returning results as SQLAlchemy `Result` instances.""" - engine = create_engine("sqlite:///:memory:") - metadata_obj.create_all(engine) + + # Provision. stmt = insert(user).values(user_id=17, user_name="hwchase") - with engine.begin() as conn: - conn.execute(stmt) - db = SQLDatabase(engine) - command = "select user_id, user_name, user_bio from user where user_id = 17" + db._execute(stmt) + # Query and verify. 
+ command = "select user_id, user_name, user_bio from user where user_id = 17" result = db.run(command, fetch="cursor", include_columns=True) expected = [{"user_id": 17, "user_name": "hwchase", "user_bio": None}] assert isinstance(result, Result) assert result.mappings().fetchall() == expected -def test_sql_database_run_with_parameters() -> None: +def test_sql_database_run_with_parameters(db: SQLDatabase) -> None: """Verify running SQL expressions with query parameters.""" - engine = create_engine("sqlite:///:memory:") - metadata_obj.create_all(engine) + + # Provision. stmt = insert(user).values(user_id=17, user_name="hwchase") - with engine.begin() as conn: - conn.execute(stmt) - db = SQLDatabase(engine) - command = "select user_id, user_name, user_bio from user where user_id = :user_id" + db._execute(stmt) + # Query and verify. + command = "select user_id, user_name, user_bio from user where user_id = :user_id" full_output = db.run(command, parameters={"user_id": 17}, include_columns=True) expected_full_output = "[{'user_id': 17, 'user_name': 'hwchase', 'user_bio': None}]" assert full_output == expected_full_output -def test_sql_database_run_sqlalchemy_selectable() -> None: +def test_sql_database_run_sqlalchemy_selectable(db: SQLDatabase) -> None: """Verify running SQL expressions using SQLAlchemy selectable.""" - engine = create_engine("sqlite:///:memory:") - metadata_obj.create_all(engine) + + # Provision. stmt = insert(user).values(user_id=17, user_name="hwchase") - with engine.begin() as conn: - conn.execute(stmt) - db = SQLDatabase(engine) - command = select(user).where(user.c.user_id == 17) + db._execute(stmt) + # Query and verify. + command = select(user).where(user.c.user_id == 17) full_output = db.run(command, include_columns=True) expected_full_output = "[{'user_id': 17, 'user_name': 'hwchase', 'user_bio': None}]" assert full_output == expected_full_output -def test_sql_database_run_update() -> None: +def test_sql_database_run_update(db: SQLDatabase) -> None: """Test commands which return no rows return an empty string.""" - engine = create_engine("sqlite:///:memory:") - metadata_obj.create_all(engine) + + # Provision. stmt = insert(user).values(user_id=13, user_name="Harrison") - with engine.begin() as conn: - conn.execute(stmt) - db = SQLDatabase(engine) + db._execute(stmt) + + # Query and verify. command = "update user set user_name='Updated' where user_id = 13" output = db.run(command) expected_output = "" @@ -198,7 +201,7 @@ def test_sql_database_run_update() -> None: def test_sql_database_schema_translate_map() -> None: """Verify using statement-specific execution options.""" - engine = create_engine("sqlite:///:memory:") + engine = sa.create_engine("sqlite:///:memory:") db = SQLDatabase(engine) # Define query using SQLAlchemy selectable. From df7cbd6fbbb1f7dcd73705d4773425a692c43efd Mon Sep 17 00:00:00 2001 From: Massimiliano Pronesti Date: Tue, 13 Feb 2024 05:00:52 +0100 Subject: [PATCH 14/25] community[minor]: add FlashRank ranker (#16785) **Description:** This PR adds support for [flashrank](https://github.com/PrithivirajDamodaran/FlashRank) for reranking as alternative to Cohere. I'm not sure `libs/langchain` is the right place for this change. At first, I wanted to put it under `libs/community`. All the compressors were under `libs/langchain/retrievers/document_compressors` though. Hope this makes sense! 
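
For reference, a minimal usage sketch of the new compressor paired with the existing `ContextualCompressionRetriever`. It assumes `flashrank` is installed and that a base `retriever` has already been built (e.g. the FAISS retriever configured in the notebook added below); the `top_n` value is illustrative:

```python
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

# FlashrankRerank is the compressor added in this PR; it re-scores the
# retrieved documents with flashrank and keeps only the top results.
compressor = FlashrankRerank(top_n=3)  # top_n is an assumed, illustrative setting

# `retriever` is assumed to exist already, e.g. the FAISS retriever
# built earlier in the accompanying notebook.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.get_relevant_documents(
    "What did the president say about Ketanji Brown Jackson?"
)
for doc in compressed_docs:
    print(doc.page_content[:100])
```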
--- .../retrievers/flashrank-reranker.ipynb | 475 ++++++++++++++++++ .../document_compressors/__init__.py | 2 + .../document_compressors/flashrank_rerank.py | 74 +++ 3 files changed, 551 insertions(+) create mode 100644 docs/docs/integrations/retrievers/flashrank-reranker.ipynb create mode 100644 libs/langchain/langchain/retrievers/document_compressors/flashrank_rerank.py diff --git a/docs/docs/integrations/retrievers/flashrank-reranker.ipynb b/docs/docs/integrations/retrievers/flashrank-reranker.ipynb new file mode 100644 index 0000000000000..7f53ed00a687f --- /dev/null +++ b/docs/docs/integrations/retrievers/flashrank-reranker.ipynb @@ -0,0 +1,475 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "# Flashrank Reranker\n", + "\n", + "This notebook shows how to use [flashrank](https://github.com/PrithivirajDamodaran/FlashRank) for document compression and retrieval." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "% pip install --upgrade --quiet flashrank\n", + "% pip install --upgrade --quiet faiss\n", + "\n", + "# OR (depending on Python version)\n", + "\n", + "% pip install --upgrade --quiet faiss_cpu" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Helper function for printing docs\n", + "\n", + "\n", + "def pretty_print_docs(docs):\n", + " print(\n", + " f\"\\n{'-' * 100}\\n\".join(\n", + " [f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## Set up the base vector store retriever\n", + "Let's start by initializing a simple vector store retriever and storing the 2023 State of the Union speech (in chunks). We can set up the retriever to retrieve a high number (20) of docs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 2:\n", + "\n", + "As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n", + "\n", + "While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. 
From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 3:\n", + "\n", + "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "\n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 4:\n", + "\n", + "He met the Ukrainian people. \n", + "\n", + "From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n", + "\n", + "Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \n", + "\n", + "In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 5:\n", + "\n", + "But that trickle-down theory led to weaker economic growth, lower wages, bigger deficits, and the widest gap between those at the top and everyone else in nearly a century. \n", + "\n", + "Vice President Harris and I ran for office with a new economic vision for America. \n", + "\n", + "Invest in America. Educate Americans. Grow the workforce. Build the economy from the bottom up \n", + "and the middle out, not from the top down.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 6:\n", + "\n", + "And tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud. \n", + "\n", + "By the end of this year, the deficit will be down to less than half what it was before I took office. \n", + "\n", + "The only president ever to cut the deficit by more than one trillion dollars in a single year. \n", + "\n", + "Lowering your costs also means demanding more competition. \n", + "\n", + "I’m a capitalist, but capitalism without competition isn’t capitalism. \n", + "\n", + "It’s exploitation—and it drives up prices.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 7:\n", + "\n", + "I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n", + "\n", + "I’ve worked on these issues a long time. \n", + "\n", + "I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. \n", + "\n", + "So let’s not abandon our streets. Or choose between safety and equal justice.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 8:\n", + "\n", + "As I’ve told Xi Jinping, it is never a good bet to bet against the American people. 
\n", + "\n", + "We’ll create good jobs for millions of Americans, modernizing roads, airports, ports, and waterways all across America. \n", + "\n", + "And we’ll do it all to withstand the devastating effects of the climate crisis and promote environmental justice.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 9:\n", + "\n", + "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n", + "\n", + "Last year COVID-19 kept us apart. This year we are finally together again. \n", + "\n", + "Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n", + "\n", + "With a duty to one another to the American people to the Constitution. \n", + "\n", + "And with an unwavering resolve that freedom will always triumph over tyranny.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 10:\n", + "\n", + "As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” \n", + "\n", + "It’s time. \n", + "\n", + "But with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills. \n", + "\n", + "Inflation is robbing them of the gains they might otherwise feel. \n", + "\n", + "I get it. That’s why my top priority is getting prices under control.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 11:\n", + "\n", + "I’m also calling on Congress: pass a law to make sure veterans devastated by toxic exposures in Iraq and Afghanistan finally get the benefits and comprehensive health care they deserve. \n", + "\n", + "And fourth, let’s end cancer as we know it. \n", + "\n", + "This is personal to me and Jill, to Kamala, and to so many of you. \n", + "\n", + "Cancer is the #2 cause of death in America–second only to heart disease.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 12:\n", + "\n", + "Headaches. Numbness. Dizziness. \n", + "\n", + "A cancer that would put them in a flag-draped coffin. \n", + "\n", + "I know. \n", + "\n", + "One of those soldiers was my son Major Beau Biden. \n", + "\n", + "We don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. \n", + "\n", + "But I’m committed to finding out everything we can. \n", + "\n", + "Committed to military families like Danielle Robinson from Ohio. \n", + "\n", + "The widow of Sergeant First Class Heath Robinson.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 13:\n", + "\n", + "He will never extinguish their love of freedom. He will never weaken the resolve of the free world. \n", + "\n", + "We meet tonight in an America that has lived through two of the hardest years this nation has ever faced. \n", + "\n", + "The pandemic has been punishing. \n", + "\n", + "And so many families are living paycheck to paycheck, struggling to keep up with the rising cost of food, gas, housing, and so much more. 
\n", + "\n", + "I understand.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 14:\n", + "\n", + "When we invest in our workers, when we build the economy from the bottom up and the middle out together, we can do something we haven’t done in a long time: build a better America. \n", + "\n", + "For more than two years, COVID-19 has impacted every decision in our lives and the life of the nation. \n", + "\n", + "And I know you’re tired, frustrated, and exhausted. \n", + "\n", + "But I also know this.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 15:\n", + "\n", + "My plan to fight inflation will lower your costs and lower the deficit. \n", + "\n", + "17 Nobel laureates in economics say my plan will ease long-term inflationary pressures. Top business leaders and most Americans support my plan. And here’s the plan: \n", + "\n", + "First – cut the cost of prescription drugs. Just look at insulin. One in ten Americans has diabetes. In Virginia, I met a 13-year-old boy named Joshua Davis.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 16:\n", + "\n", + "And soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \n", + "\n", + "So tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \n", + "\n", + "First, beat the opioid epidemic. \n", + "\n", + "There is so much we can do. Increase funding for prevention, treatment, harm reduction, and recovery.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 17:\n", + "\n", + "So let’s not abandon our streets. Or choose between safety and equal justice. \n", + "\n", + "Let’s come together to protect our communities, restore trust, and hold law enforcement accountable. \n", + "\n", + "That’s why the Justice Department required body cameras, banned chokeholds, and restricted no-knock warrants for its officers.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 18:\n", + "\n", + "My plan will not only lower costs to give families a fair shot, it will lower the deficit. \n", + "\n", + "The previous Administration not only ballooned the deficit with tax cuts for the very wealthy and corporations, it undermined the watchdogs whose job was to keep pandemic relief funds from being wasted. \n", + "\n", + "But in my administration, the watchdogs have been welcomed back. \n", + "\n", + "We’re going after the criminals who stole billions in relief money meant for small businesses and millions of Americans.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 19:\n", + "\n", + "I understand. \n", + "\n", + "I remember when my Dad had to leave our home in Scranton, Pennsylvania to find work. I grew up in a family where if the price of food went up, you felt it. \n", + "\n", + "That’s why one of the first things I did as President was fight to pass the American Rescue Plan. \n", + "\n", + "Because people were hurting. We needed to act, and we did. 
\n", + "\n", + "Few pieces of legislation have done more in a critical moment in our history to lift us out of crisis.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 20:\n", + "\n", + "And we will, as one people. \n", + "\n", + "One America. \n", + "\n", + "The United States of America. \n", + "\n", + "May God bless you all. May God protect our troops.\n" + ] + } + ], + "source": [ + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain_community.document_loaders import TextLoader\n", + "from langchain_community.vectorstores import FAISS\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "documents = TextLoader(\n", + " \"../../modules/state_of_the_union.txt\",\n", + ").load()\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", + "texts = text_splitter.split_documents(documents)\n", + "\n", + "embedding = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\n", + "retriever = FAISS.from_documents(texts, embedding).as_retriever(search_kwargs={\"k\": 20})\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = retriever.get_relevant_documents(query)\n", + "pretty_print_docs(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## Doing reranking with FlashRank\n", + "Now let's wrap our base retriever with a `ContextualCompressionRetriever`, using `FlashrankRerank` as a compressor." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 3, 7]\n" + ] + } + ], + "source": [ + "from langchain.retrievers import ContextualCompressionRetriever\n", + "from langchain.retrievers.document_compressors import FlashrankRerank\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(temperature=0)\n", + "\n", + "compressor = FlashrankRerank()\n", + "compression_retriever = ContextualCompressionRetriever(\n", + " base_compressor=compressor, base_retriever=retriever\n", + ")\n", + "\n", + "compressed_docs = compression_retriever.get_relevant_documents(\n", + " \"What did the president say about Ketanji Jackson Brown\"\n", + ")\n", + "print([doc.metadata[\"id\"] for doc in compressed_docs])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "After reranking, the top 3 documents are different from the top 3 documents retrieved by the base retriever." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 2:\n", + "\n", + "And tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud. \n", + "\n", + "By the end of this year, the deficit will be down to less than half what it was before I took office. 
\n", + "\n", + "The only president ever to cut the deficit by more than one trillion dollars in a single year. \n", + "\n", + "Lowering your costs also means demanding more competition. \n", + "\n", + "I’m a capitalist, but capitalism without competition isn’t capitalism. \n", + "\n", + "It’s exploitation—and it drives up prices.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 3:\n", + "\n", + "As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” \n", + "\n", + "It’s time. \n", + "\n", + "But with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills. \n", + "\n", + "Inflation is robbing them of the gains they might otherwise feel. \n", + "\n", + "I get it. That’s why my top priority is getting prices under control.\n" + ] + } + ], + "source": [ + "pretty_print_docs(compressed_docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## QA reranking with FlashRank" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from langchain.chains import RetrievalQA\n", + "\n", + "chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": "{'query': 'What did the president say about Ketanji Brown Jackson',\n 'result': \"The President said that Ketanji Brown Jackson is one of our nation's top legal minds and will continue Justice Breyer's legacy of excellence.\"}" + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(query)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/libs/langchain/langchain/retrievers/document_compressors/__init__.py b/libs/langchain/langchain/retrievers/document_compressors/__init__.py index 410ad540d19e8..a4c1456cf24d0 100644 --- a/libs/langchain/langchain/retrievers/document_compressors/__init__.py +++ b/libs/langchain/langchain/retrievers/document_compressors/__init__.py @@ -9,6 +9,7 @@ from langchain.retrievers.document_compressors.embeddings_filter import ( EmbeddingsFilter, ) +from langchain.retrievers.document_compressors.flashrank_rerank import FlashrankRerank __all__ = [ "DocumentCompressorPipeline", @@ -16,4 +17,5 @@ "LLMChainExtractor", "LLMChainFilter", "CohereRerank", + "FlashrankRerank", ] diff --git a/libs/langchain/langchain/retrievers/document_compressors/flashrank_rerank.py b/libs/langchain/langchain/retrievers/document_compressors/flashrank_rerank.py new file mode 100644 index 0000000000000..273aff3d44ca3 --- /dev/null +++ b/libs/langchain/langchain/retrievers/document_compressors/flashrank_rerank.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Dict, Optional, Sequence + +from langchain_core.documents import Document +from langchain_core.pydantic_v1 import Extra, root_validator + +from 
langchain.callbacks.manager import Callbacks +from langchain.retrievers.document_compressors.base import BaseDocumentCompressor + +if TYPE_CHECKING: + from flashrank import Ranker, RerankRequest +else: + # Avoid pydantic annotation issues when actually instantiating + # while keeping this import optional + try: + from flashrank import Ranker, RerankRequest + except ImportError: + pass + +DEFAULT_MODEL_NAME = "ms-marco-MultiBERT-L-12" + + +class FlashrankRerank(BaseDocumentCompressor): + """Document compressor using Flashrank interface.""" + + client: Ranker + """Flashrank client to use for compressing documents""" + top_n: int = 3 + """Number of documents to return.""" + model: Optional[str] = None + """Model to use for reranking.""" + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + arbitrary_types_allowed = True + + @root_validator(pre=True) + def validate_environment(cls, values: Dict) -> Dict: + """Validate that api key and python package exists in environment.""" + try: + from flashrank import Ranker + except ImportError: + raise ImportError( + "Could not import flashrank python package. " + "Please install it with `pip install flashrank`." + ) + + values["model"] = values.get("model", DEFAULT_MODEL_NAME) + values["client"] = Ranker(model_name=values["model"]) + return values + + def compress_documents( + self, + documents: Sequence[Document], + query: str, + callbacks: Optional[Callbacks] = None, + ) -> Sequence[Document]: + passages = [ + {"id": i, "text": doc.page_content} for i, doc in enumerate(documents) + ] + + rerank_request = RerankRequest(query=query, passages=passages) + rerank_response = self.client.rerank(rerank_request)[: self.top_n] + final_results = [] + for r in rerank_response: + doc = Document( + page_content=r["text"], + metadata={"id": r["id"], "relevance_score": r["score"]}, + ) + final_results.append(doc) + return final_results From e1cfd0f3e73ee9126385073fa7d7c323b195d11a Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Mon, 12 Feb 2024 20:05:08 -0800 Subject: [PATCH 15/25] community[patch]: infinity embeddings update incorrect default url (#16759) The default url has always been incorrect (7797 instead 7997). Here is a update to the correct url. --- libs/community/langchain_community/embeddings/infinity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/embeddings/infinity.py b/libs/community/langchain_community/embeddings/infinity.py index 4cc5688e0e1e0..f0a01fc75e512 100644 --- a/libs/community/langchain_community/embeddings/infinity.py +++ b/libs/community/langchain_community/embeddings/infinity.py @@ -28,14 +28,14 @@ class InfinityEmbeddings(BaseModel, Embeddings): from langchain_community.embeddings import InfinityEmbeddings InfinityEmbeddings( model="BAAI/bge-small", - infinity_api_url="http://localhost:7797/v1", + infinity_api_url="http://localhost:7997", ) """ model: str "Underlying Infinity model id." - infinity_api_url: str = "http://localhost:7797/v1" + infinity_api_url: str = "http://localhost:7997" """Endpoint URL to use.""" client: Any = None #: :meta private: From 1bbb64d956210fe57d234be98af30436d518d4d9 Mon Sep 17 00:00:00 2001 From: mhavey Date: Tue, 13 Feb 2024 00:30:20 -0500 Subject: [PATCH 16/25] community[minor], langchian[minor]: Add Neptune Rdf graph and chain (#16650) **Description**: This PR adds a chain for Amazon Neptune graph database RDF format. 
It complements the existing Neptune Cypher chain. The PR also includes a Neptune RDF graph class to connect to, introspect, and query a Neptune RDF graph database from the chain. A sample notebook is provided under docs that demonstrates the overall effect: invoking the chain to make natural language queries against Neptune using an LLM. **Issue**: This is a new feature **Dependencies**: The RDF graph class depends on the AWS boto3 library if using IAM authentication to connect to the Neptune database. --------- Co-authored-by: Piyush Jain Co-authored-by: Bagatur --- .../use_cases/graph/neptune_sparql_qa.ipynb | 337 ++++++++++++++++++ .../langchain_community/graphs/__init__.py | 2 + .../graphs/neptune_rdf_graph.py | 256 +++++++++++++ .../tests/unit_tests/graphs/test_imports.py | 1 + .../unit_tests/graphs/test_neptune_graph.py | 5 +- libs/langchain/langchain/chains/__init__.py | 2 + .../chains/graph_qa/neptune_sparql.py | 196 ++++++++++ .../tests/unit_tests/chains/test_imports.py | 1 + 8 files changed, 799 insertions(+), 1 deletion(-) create mode 100644 docs/docs/use_cases/graph/neptune_sparql_qa.ipynb create mode 100644 libs/community/langchain_community/graphs/neptune_rdf_graph.py create mode 100644 libs/langchain/langchain/chains/graph_qa/neptune_sparql.py diff --git a/docs/docs/use_cases/graph/neptune_sparql_qa.ipynb b/docs/docs/use_cases/graph/neptune_sparql_qa.ipynb new file mode 100644 index 0000000000000..4c464e840d522 --- /dev/null +++ b/docs/docs/use_cases/graph/neptune_sparql_qa.ipynb @@ -0,0 +1,337 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Neptune SPARQL QA Chain\n", + "\n", + "This notebook shows use of LLM to query RDF graph in Amazon Neptune. This code uses a `NeptuneRdfGraph` class that connects with the Neptune database and loads it's schema. The `NeptuneSparqlQAChain` is used to connect the graph and LLM to ask natural language questions.\n", + "\n", + "Requirements for running this notebook:\n", + "- Neptune 1.2.x cluster accessible from this notebook\n", + "- Kernel with Python 3.9 or higher\n", + "- For Bedrock access, ensure IAM role has this policy\n", + "\n", + "```json\n", + "{\n", + " \"Action\": [\n", + " \"bedrock:ListFoundationModels\",\n", + " \"bedrock:InvokeModel\"\n", + " ],\n", + " \"Resource\": \"*\",\n", + " \"Effect\": \"Allow\"\n", + "}\n", + "```\n", + "\n", + "- S3 bucket for staging sample data, bucket should be in same account/region as Neptune." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Seed W3C organizational data\n", + "W3C org ontology plus some instances. \n", + "\n", + "You will need an S3 bucket in the same region and account. Set STAGE_BUCKET to name of that bucket." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "STAGE_BUCKET = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash -s \"$STAGE_BUCKET\"\n", + "\n", + "rm -rf data\n", + "mkdir -p data\n", + "cd data\n", + "echo getting org ontology and sample org instances\n", + "wget http://www.w3.org/ns/org.ttl \n", + "wget https://raw.githubusercontent.com/aws-samples/amazon-neptune-ontology-example-blog/main/data/example_org.ttl \n", + "\n", + "echo Copying org ttl to S3\n", + "aws s3 cp org.ttl s3://$1/org.ttl\n", + "aws s3 cp example_org.ttl s3://$1/example_org.ttl\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bulk-load the org ttl - both ontology and instances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -s s3://{STAGE_BUCKET} -f turtle --store-to loadres --run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_status {loadres['payload']['loadId']} --errors --details" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup Chain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EXAMPLES = \"\"\"\n", + "\n", + "\n", + "Find organizations.\n", + "\n", + "\n", + "\n", + "PREFIX rdf: \n", + "PREFIX rdfs: \n", + "PREFIX org: \n", + "\n", + "select ?org ?orgName where {{\n", + " ?org rdfs:label ?orgName .\n", + "}} \n", + "\n", + "\n", + "\n", + "Find sites of an organization\n", + "\n", + "\n", + "\n", + "PREFIX rdf: \n", + "PREFIX rdfs: \n", + "PREFIX org: \n", + "\n", + "select ?org ?orgName ?siteName where {{\n", + " ?org rdfs:label ?orgName .\n", + " ?org org:hasSite/rdfs:label ?siteName . \n", + "}} \n", + "\n", + "\n", + "\n", + "Find suborganizations of an organization\n", + "\n", + "\n", + "\n", + "PREFIX rdf: \n", + "PREFIX rdfs: \n", + "PREFIX org: \n", + "\n", + "select ?org ?orgName ?subName where {{\n", + " ?org rdfs:label ?orgName .\n", + " ?org org:hasSubOrganization/rdfs:label ?subName .\n", + "}} \n", + "\n", + "\n", + "\n", + "Find organizational units of an organization\n", + "\n", + "\n", + "\n", + "PREFIX rdf: \n", + "PREFIX rdfs: \n", + "PREFIX org: \n", + "\n", + "select ?org ?orgName ?unitName where {{\n", + " ?org rdfs:label ?orgName .\n", + " ?org org:hasUnit/rdfs:label ?unitName . \n", + "}} \n", + "\n", + "\n", + "\n", + "Find members of an organization. Also find their manager, or the member they report to.\n", + "\n", + "\n", + "\n", + "PREFIX org: \n", + "PREFIX foaf: \n", + "\n", + "select * where {{\n", + " ?person rdf:type foaf:Person .\n", + " ?person org:memberOf ?org .\n", + " OPTIONAL {{ ?person foaf:firstName ?firstName . }}\n", + " OPTIONAL {{ ?person foaf:family_name ?lastName . 
}}\n", + " OPTIONAL {{ ?person org:reportsTo ??manager }} .\n", + "}}\n", + "\n", + "\n", + "\n", + "\n", + "Find change events, such as mergers and acquisitions, of an organization\n", + "\n", + "\n", + "\n", + "PREFIX org: \n", + "\n", + "select ?event ?prop ?obj where {{\n", + " ?org rdfs:label ?orgName .\n", + " ?event rdf:type org:ChangeEvent .\n", + " ?event org:originalOrganization ?origOrg .\n", + " ?event org:resultingOrganization ?resultingOrg .\n", + "}}\n", + "\n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "from langchain.chains.graph_qa.neptune_sparql import NeptuneSparqlQAChain\n", + "from langchain.chat_models import BedrockChat\n", + "from langchain_community.graphs import NeptuneRdfGraph\n", + "\n", + "host = \"\"\n", + "port = \"\"\n", + "region = \"us-east-1\" # specify region\n", + "\n", + "graph = NeptuneRdfGraph(\n", + " host=host, port=port, use_iam_auth=True, region_name=region, hide_comments=True\n", + ")\n", + "\n", + "schema_elements = graph.get_schema_elements\n", + "# Optionally, you can update the schema_elements, and\n", + "# load the schema from the pruned elements.\n", + "graph.load_from_schema_elements(schema_elements)\n", + "\n", + "bedrock_client = boto3.client(\"bedrock-runtime\")\n", + "llm = BedrockChat(model_id=\"anthropic.claude-v2\", client=bedrock_client)\n", + "\n", + "chain = NeptuneSparqlQAChain.from_llm(\n", + " llm=llm,\n", + " graph=graph,\n", + " examples=EXAMPLES,\n", + " verbose=True,\n", + " top_K=10,\n", + " return_intermediate_steps=True,\n", + " return_direct=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ask questions\n", + "Depends on the data we ingested above" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\"\"\"How many organizations are in the graph\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\"\"\"Are there any mergers or acquisitions\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\"\"\"Find organizations\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\"\"\"Find sites of MegaSystems or MegaFinancial\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\"\"\"Find a member who is manager of one or more members.\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\"\"\"Find five members and who their manager is.\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\n", + " \"\"\"Find org units or suborganizations of The Mega Group. 
What are the sites of those units?\"\"\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/graphs/__init__.py b/libs/community/langchain_community/graphs/__init__.py index bd15f6465d1b4..c1fc640c4b1e7 100644 --- a/libs/community/langchain_community/graphs/__init__.py +++ b/libs/community/langchain_community/graphs/__init__.py @@ -8,6 +8,7 @@ from langchain_community.graphs.nebula_graph import NebulaGraph from langchain_community.graphs.neo4j_graph import Neo4jGraph from langchain_community.graphs.neptune_graph import NeptuneGraph +from langchain_community.graphs.neptune_rdf_graph import NeptuneRdfGraph from langchain_community.graphs.networkx_graph import NetworkxEntityGraph from langchain_community.graphs.ontotext_graphdb_graph import OntotextGraphDBGraph from langchain_community.graphs.rdf_graph import RdfGraph @@ -19,6 +20,7 @@ "Neo4jGraph", "NebulaGraph", "NeptuneGraph", + "NeptuneRdfGraph", "KuzuGraph", "HugeGraph", "RdfGraph", diff --git a/libs/community/langchain_community/graphs/neptune_rdf_graph.py b/libs/community/langchain_community/graphs/neptune_rdf_graph.py new file mode 100644 index 0000000000000..b9e0074a366ee --- /dev/null +++ b/libs/community/langchain_community/graphs/neptune_rdf_graph.py @@ -0,0 +1,256 @@ +import json +from types import SimpleNamespace +from typing import Any, Dict, Optional, Sequence + +import requests + +CLASS_QUERY = """ +SELECT DISTINCT ?elem ?com +WHERE { + ?instance a ?elem . + OPTIONAL { ?instance rdf:type/rdfs:subClassOf* ?elem } . + #FILTER (isIRI(?elem)) . + OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")} +} +""" + +REL_QUERY = """ +SELECT DISTINCT ?elem ?com +WHERE { + ?subj ?elem ?obj . + OPTIONAL { + ?elem rdf:type/rdfs:subPropertyOf* ?proptype . + VALUES ?proptype { rdf:Property owl:DatatypeProperty owl:ObjectProperty } . + } . + OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")} +} +""" + +DTPROP_QUERY = """ +SELECT DISTINCT ?elem ?com +WHERE { + ?subj ?elem ?obj . + OPTIONAL { + ?elem rdf:type/rdfs:subPropertyOf* ?proptype . + ?proptype a owl:DatatypeProperty . + } . + OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")} +} +""" + +OPROP_QUERY = """ +SELECT DISTINCT ?elem ?com +WHERE { + ?subj ?elem ?obj . + OPTIONAL { + ?elem rdf:type/rdfs:subPropertyOf* ?proptype . + ?proptype a owl:ObjectProperty . + } . + OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")} +} +""" + +ELEM_TYPES = { + "classes": CLASS_QUERY, + "rels": REL_QUERY, + "dtprops": DTPROP_QUERY, + "oprops": OPROP_QUERY, +} + + +class NeptuneRdfGraph: + """Neptune wrapper for RDF graph operations. + + Args: + host: SPARQL endpoint host for Neptune + port: SPARQL endpoint port for Neptune. Defaults 8182. + use_iam_auth: boolean indicating IAM auth is enabled in Neptune cluster + region_name: AWS region required if use_iam_auth is True, e.g., us-west-2 + hide_comments: whether to include ontology comments in schema for prompt + + Example: + .. 
code-block:: python + + graph = NeptuneRdfGraph( + host=', + port=, + use_iam_auth=False + ) + schema = graph.get_schema() + + OR + graph = NeptuneRdfGraph( + host=', + port=, + use_iam_auth=False + ) + schema_elem = graph.get_schema_elements() + ... change schema_elements ... + graph.load_schema(schema_elem) + schema = graph.get_schema() + + *Security note*: Make sure that the database connection uses credentials + that are narrowly-scoped to only include necessary permissions. + Failure to do so may result in data corruption or loss, since the calling + code may attempt commands that would result in deletion, mutation + of data if appropriately prompted or reading sensitive data if such + data is present in the database. + The best way to guard against such negative outcomes is to (as appropriate) + limit the permissions granted to the credentials used with this tool. + + See https://python.langchain.com/docs/security for more information. + """ + + def __init__( + self, + host: str, + port: int = 8182, + use_iam_auth: bool = False, + region_name: Optional[str] = None, + hide_comments: bool = False, + ) -> None: + self.use_iam_auth = use_iam_auth + self.region_name = region_name + self.hide_comments = hide_comments + self.query_endpoint = f"https://{host}:{port}/sparql" + + if self.use_iam_auth: + try: + import boto3 + + self.session = boto3.Session() + except ImportError: + raise ImportError( + "Could not import boto3 python package. " + "Please install it with `pip install boto3`." + ) + else: + self.session = None + + # Set schema + self.schema = "" + self.schema_elements: Dict[str, Any] = {} + self._refresh_schema() + + @property + def get_schema(self) -> str: + """ + Returns the schema of the graph database. + """ + return self.schema + + @property + def get_schema_elements(self) -> Dict[str, Any]: + return self.schema_elements + + def query( + self, + query: str, + ) -> Dict[str, Any]: + """ + Run Neptune query. + """ + request_data = {"query": query} + data = request_data + request_hdr = None + + if self.use_iam_auth: + credentials = self.session.get_credentials() + credentials = credentials.get_frozen_credentials() + access_key = credentials.access_key + secret_key = credentials.secret_key + service = "neptune-db" + session_token = credentials.token + params = None + creds = SimpleNamespace( + access_key=access_key, + secret_key=secret_key, + token=session_token, + region=self.region_name, + ) + from botocore.awsrequest import AWSRequest + + request = AWSRequest( + method="POST", url=self.query_endpoint, data=data, params=params + ) + from botocore.auth import SigV4Auth + + SigV4Auth(creds, service, self.region_name).add_auth(request) + request.headers["Content-Type"] = "application/x-www-form-urlencoded" + request_hdr = request.headers + else: + request_hdr = {} + request_hdr["Content-Type"] = "application/x-www-form-urlencoded" + + queryres = requests.request( + method="POST", url=self.query_endpoint, headers=request_hdr, data=data + ) + json_resp = json.loads(queryres.text) + return json_resp + + def load_schema(self, schema_elements: Dict[str, Any]) -> None: + """ + Generates and sets schema from schema_elements. Helpful in + cases where introspected schema needs pruning. 
+ """ + + elem_str = {} + for elem in ELEM_TYPES: + res_list = [] + for elem_rec in self.schema_elements[elem]: + uri = elem_rec["uri"] + local = elem_rec["local"] + res_str = f"<{uri}> ({local})" + if self.hide_comments is False: + res_str = res_str + f", {elem_rec['comment']}" + res_list.append(res_str) + elem_str[elem] = ", ".join(res_list) + + self.schema = ( + "In the following, each IRI is followed by the local name and " + "optionally its description in parentheses. \n" + "The graph supports the following node types:\n" + f"{elem_str['classes']}" + "The graph supports the following relationships:\n" + f"{elem_str['rels']}" + "The graph supports the following OWL object properties, " + f"{elem_str['dtprops']}" + "The graph supports the following OWL data properties, " + f"{elem_str['oprops']}" + ) + + def _get_local_name(self, iri: str) -> Sequence[str]: + """ + Split IRI into prefix and local + """ + if "#" in iri: + tokens = iri.split("#") + return [f"{tokens[0]}#", tokens[-1]] + elif "/" in iri: + tokens = iri.split("/") + return [f"{'/'.join(tokens[0:len(tokens)-1])}/", tokens[-1]] + else: + raise ValueError(f"Unexpected IRI '{iri}', contains neither '#' nor '/'.") + + def _refresh_schema(self) -> None: + """ + Query Neptune to introspect schema. + """ + self.schema_elements["distinct_prefixes"] = {} + + for elem in ELEM_TYPES: + items = self.query(ELEM_TYPES[elem]) + reslist = [] + for r in items["results"]["bindings"]: + uri = r["elem"]["value"] + tokens = self._get_local_name(uri) + elem_record = {"uri": uri, "local": tokens[1]} + if not self.hide_comments: + elem_record["comment"] = r["com"]["value"] if "com" in r else "" + reslist.append(elem_record) + if tokens[0] not in self.schema_elements["distinct_prefixes"]: + self.schema_elements["distinct_prefixes"][tokens[0]] = "y" + + self.schema_elements[elem] = reslist + + self.load_schema(self.schema_elements) diff --git a/libs/community/tests/unit_tests/graphs/test_imports.py b/libs/community/tests/unit_tests/graphs/test_imports.py index 653d7d540ba5f..202ecefa24997 100644 --- a/libs/community/tests/unit_tests/graphs/test_imports.py +++ b/libs/community/tests/unit_tests/graphs/test_imports.py @@ -6,6 +6,7 @@ "Neo4jGraph", "NebulaGraph", "NeptuneGraph", + "NeptuneRdfGraph", "KuzuGraph", "HugeGraph", "RdfGraph", diff --git a/libs/community/tests/unit_tests/graphs/test_neptune_graph.py b/libs/community/tests/unit_tests/graphs/test_neptune_graph.py index e3d986f2eb04b..6e714a41665a7 100644 --- a/libs/community/tests/unit_tests/graphs/test_neptune_graph.py +++ b/libs/community/tests/unit_tests/graphs/test_neptune_graph.py @@ -1,2 +1,5 @@ def test_import() -> None: - from langchain_community.graphs import NeptuneGraph # noqa: F401 + from langchain_community.graphs import ( + NeptuneGraph, # noqa: F401 + NeptuneRdfGraph, # noqa: F401 + ) diff --git a/libs/langchain/langchain/chains/__init__.py b/libs/langchain/langchain/chains/__init__.py index 2b7ba6ac256bb..b20d3fecb444d 100644 --- a/libs/langchain/langchain/chains/__init__.py +++ b/libs/langchain/langchain/chains/__init__.py @@ -41,6 +41,7 @@ from langchain.chains.graph_qa.kuzu import KuzuQAChain from langchain.chains.graph_qa.nebulagraph import NebulaGraphQAChain from langchain.chains.graph_qa.neptune_cypher import NeptuneOpenCypherQAChain +from langchain.chains.graph_qa.neptune_sparql import NeptuneSparqlQAChain from langchain.chains.graph_qa.ontotext_graphdb import OntotextGraphDBQAChain from langchain.chains.graph_qa.sparql import GraphSparqlQAChain from 
langchain.chains.history_aware_retriever import create_history_aware_retriever @@ -116,6 +117,7 @@ "NatBotChain", "NebulaGraphQAChain", "NeptuneOpenCypherQAChain", + "NeptuneSparqlQAChain", "OpenAIModerationChain", "OpenAPIEndpointChain", "QAGenerationChain", diff --git a/libs/langchain/langchain/chains/graph_qa/neptune_sparql.py b/libs/langchain/langchain/chains/graph_qa/neptune_sparql.py new file mode 100644 index 0000000000000..08a1cc249beed --- /dev/null +++ b/libs/langchain/langchain/chains/graph_qa/neptune_sparql.py @@ -0,0 +1,196 @@ +""" +Question answering over an RDF or OWL graph using SPARQL. +""" +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +from langchain_community.graphs import NeptuneRdfGraph +from langchain_core.language_models import BaseLanguageModel +from langchain_core.prompts.base import BasePromptTemplate +from langchain_core.prompts.prompt import PromptTemplate +from langchain_core.pydantic_v1 import Field + +from langchain.callbacks.manager import CallbackManagerForChainRun +from langchain.chains.base import Chain +from langchain.chains.graph_qa.prompts import SPARQL_QA_PROMPT +from langchain.chains.llm import LLMChain + +INTERMEDIATE_STEPS_KEY = "intermediate_steps" + +SPARQL_GENERATION_TEMPLATE = """ +Task: Generate a SPARQL SELECT statement for querying a graph database. +For instance, to find all email addresses of John Doe, the following +query in backticks would be suitable: +``` +PREFIX foaf: +SELECT ?email +WHERE {{ + ?person foaf:name "John Doe" . + ?person foaf:mbox ?email . +}} +``` +Instructions: +Use only the node types and properties provided in the schema. +Do not use any node types and properties that are not explicitly provided. +Include all necessary prefixes. + +Examples: + +Schema: +{schema} +Note: Be as concise as possible. +Do not include any explanations or apologies in your responses. +Do not respond to any questions that ask for anything else than +for you to construct a SPARQL query. +Do not include any text except the SPARQL query generated. + +The question is: +{prompt}""" + +SPARQL_GENERATION_PROMPT = PromptTemplate( + input_variables=["schema", "prompt"], template=SPARQL_GENERATION_TEMPLATE +) + + +def extract_sparql(query: str) -> str: + query = query.strip() + querytoks = query.split("```") + if len(querytoks) == 3: + query = querytoks[1] + + if query.startswith("sparql"): + query = query[6:] + elif query.startswith("") and query.endswith(""): + query = query[8:-9] + return query + + +class NeptuneSparqlQAChain(Chain): + """Chain for question-answering against a Neptune graph + by generating SPARQL statements. + + *Security note*: Make sure that the database connection uses credentials + that are narrowly-scoped to only include necessary permissions. + Failure to do so may result in data corruption or loss, since the calling + code may attempt commands that would result in deletion, mutation + of data if appropriately prompted or reading sensitive data if such + data is present in the database. + The best way to guard against such negative outcomes is to (as appropriate) + limit the permissions granted to the credentials used with this tool. + + See https://python.langchain.com/docs/security for more information. + + Example: + .. 
code-block:: python + + chain = NeptuneSparqlQAChain.from_llm( + llm=llm, + graph=graph + ) + response = chain.invoke(query) + """ + + graph: NeptuneRdfGraph = Field(exclude=True) + sparql_generation_chain: LLMChain + qa_chain: LLMChain + input_key: str = "query" #: :meta private: + output_key: str = "result" #: :meta private: + top_k: int = 10 + return_intermediate_steps: bool = False + """Whether or not to return the intermediate steps along with the final answer.""" + return_direct: bool = False + """Whether or not to return the result of querying the graph directly.""" + extra_instructions: Optional[str] = None + """Extra instructions by the appended to the query generation prompt.""" + + @property + def input_keys(self) -> List[str]: + return [self.input_key] + + @property + def output_keys(self) -> List[str]: + _output_keys = [self.output_key] + return _output_keys + + @classmethod + def from_llm( + cls, + llm: BaseLanguageModel, + *, + qa_prompt: BasePromptTemplate = SPARQL_QA_PROMPT, + sparql_prompt: BasePromptTemplate = SPARQL_GENERATION_PROMPT, + examples: Optional[str] = None, + **kwargs: Any, + ) -> NeptuneSparqlQAChain: + """Initialize from LLM.""" + qa_chain = LLMChain(llm=llm, prompt=qa_prompt) + template_to_use = SPARQL_GENERATION_TEMPLATE + if examples: + template_to_use = template_to_use.replace( + "Examples:", "Examples: " + examples + ) + sparql_prompt = PromptTemplate( + input_variables=["schema", "prompt"], template=template_to_use + ) + sparql_generation_chain = LLMChain(llm=llm, prompt=sparql_prompt) + + return cls( + qa_chain=qa_chain, + sparql_generation_chain=sparql_generation_chain, + examples=examples, + **kwargs, + ) + + def _call( + self, + inputs: Dict[str, Any], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> Dict[str, str]: + """ + Generate SPARQL query, use it to retrieve a response from the gdb and answer + the question. 
+ """ + _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager() + callbacks = _run_manager.get_child() + prompt = inputs[self.input_key] + + intermediate_steps: List = [] + + generated_sparql = self.sparql_generation_chain.run( + {"prompt": prompt, "schema": self.graph.get_schema}, callbacks=callbacks + ) + + # Extract SPARQL + generated_sparql = extract_sparql(generated_sparql) + + _run_manager.on_text("Generated SPARQL:", end="\n", verbose=self.verbose) + _run_manager.on_text( + generated_sparql, color="green", end="\n", verbose=self.verbose + ) + + intermediate_steps.append({"query": generated_sparql}) + + context = self.graph.query(generated_sparql) + + if self.return_direct: + final_result = context + else: + _run_manager.on_text("Full Context:", end="\n", verbose=self.verbose) + _run_manager.on_text( + str(context), color="green", end="\n", verbose=self.verbose + ) + + intermediate_steps.append({"context": context}) + + result = self.qa_chain( + {"prompt": prompt, "context": context}, + callbacks=callbacks, + ) + final_result = result[self.qa_chain.output_key] + + chain_result: Dict[str, Any] = {self.output_key: final_result} + if self.return_intermediate_steps: + chain_result[INTERMEDIATE_STEPS_KEY] = intermediate_steps + + return chain_result diff --git a/libs/langchain/tests/unit_tests/chains/test_imports.py b/libs/langchain/tests/unit_tests/chains/test_imports.py index 8317dd62ea983..cf76a851b8c79 100644 --- a/libs/langchain/tests/unit_tests/chains/test_imports.py +++ b/libs/langchain/tests/unit_tests/chains/test_imports.py @@ -33,6 +33,7 @@ "NatBotChain", "NebulaGraphQAChain", "NeptuneOpenCypherQAChain", + "NeptuneSparqlQAChain", "OpenAIModerationChain", "OpenAPIEndpointChain", "QAGenerationChain", From d70a5bbf15b07973ed6906f93f3c4aaf4fc26906 Mon Sep 17 00:00:00 2001 From: Sheil Naik Date: Tue, 13 Feb 2024 00:39:56 -0500 Subject: [PATCH 17/25] docs: Fix broken link in LLMs index.mdx (#16557) - **Description:** The [LLMs](https://python.langchain.com/docs/modules/model_io/llms/) page has a broken link. This fixes the link. 
- **Issue:** N/A - **Dependencies:** N/A - **Twitter handle:** @sheilnaik --- docs/docs/modules/model_io/llms/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/modules/model_io/llms/index.mdx b/docs/docs/modules/model_io/llms/index.mdx index 8e7a1e95dabb4..ead4a3e1b9f03 100644 --- a/docs/docs/modules/model_io/llms/index.mdx +++ b/docs/docs/modules/model_io/llms/index.mdx @@ -26,4 +26,4 @@ This includes: - [How to write a custom LLM class](./custom_llm) - [How to cache LLM responses](./llm_caching) - [How to stream responses from an LLM](./streaming_llm) -- [How to track token usage in an LLM call)(./token_usage_tracking) +- [How to track token usage in an LLM call](./token_usage_tracking) From 0834457f28bb684099aa07324fd94db2896632ce Mon Sep 17 00:00:00 2001 From: Preetam D'Souza Date: Tue, 13 Feb 2024 14:40:57 +0900 Subject: [PATCH 18/25] docs: Fix broken link in summarization use-case (#16554) - **Description:** Fix broken link to `StuffDocumentsChain` - **Issue:** N/A - **Dependencies:** None - **Twitter handle:** [@preetamdsouza](https://twitter.com/preetamdsouza) --- docs/docs/use_cases/summarization.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/use_cases/summarization.ipynb b/docs/docs/use_cases/summarization.ipynb index 21dc63dde8ef4..35917b747213d 100644 --- a/docs/docs/use_cases/summarization.ipynb +++ b/docs/docs/use_cases/summarization.ipynb @@ -224,7 +224,7 @@ "source": [ "## Option 1. Stuff\n", "\n", - "When we use `load_summarize_chain` with `chain_type=\"stuff\"`, we will use the [StuffDocumentsChain](/docs/modules/chains/document/stuff).\n", + "When we use `load_summarize_chain` with `chain_type=\"stuff\"`, we will use the [StuffDocumentsChain](https://api.python.langchain.com/en/latest/chains/langchain.chains.combine_documents.stuff.StuffDocumentsChain.html#langchain.chains.combine_documents.stuff.StuffDocumentsChain).\n", "\n", "The chain will take a list of documents, inserts them all into a prompt, and passes that prompt to an LLM:" ] From 9f1cbbc6ed3060219153403064fd8b0a5a8f3e81 Mon Sep 17 00:00:00 2001 From: Sridhar Ramaswamy Date: Mon, 12 Feb 2024 21:56:12 -0800 Subject: [PATCH 19/25] community[minor]: Add pebblo safe document loader (#16862) - **Description:** Pebblo opensource project enables developers to safely load data to their Gen AI apps. It identifies semantic topics and entities found in the loaded data and summarizes them in a developer-friendly report. 
- **Dependencies:** none - **Twitter handle:** srics @hwchase17 --- .../document_loaders/pebblo.ipynb | 88 ++++++ .../document_loaders/__init__.py | 2 + .../document_loaders/pebblo.py | 291 ++++++++++++++++++ .../langchain_community/utilities/pebblo.py | 249 +++++++++++++++ libs/community/tests/examples/test_empty.csv | 0 .../community/tests/examples/test_nominal.csv | 3 + .../document_loaders/test_imports.py | 1 + .../document_loaders/test_pebblo.py | 114 +++++++ 8 files changed, 748 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/pebblo.ipynb create mode 100644 libs/community/langchain_community/document_loaders/pebblo.py create mode 100644 libs/community/langchain_community/utilities/pebblo.py create mode 100644 libs/community/tests/examples/test_empty.csv create mode 100644 libs/community/tests/examples/test_nominal.csv create mode 100644 libs/community/tests/unit_tests/document_loaders/test_pebblo.py diff --git a/docs/docs/integrations/document_loaders/pebblo.ipynb b/docs/docs/integrations/document_loaders/pebblo.ipynb new file mode 100644 index 0000000000000..40aa7ee6b0c93 --- /dev/null +++ b/docs/docs/integrations/document_loaders/pebblo.ipynb @@ -0,0 +1,88 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pebblo Safe DocumentLoader\n", + "\n", + "> [Pebblo](https://github.com/daxa-ai/pebblo) enables developers to safely load data and promote their Gen AI app to deployment without worrying about the organization’s compliance and security requirements. The project identifies semantic topics and entities found in the loaded data and summarizes them on the UI or a PDF report.\n", + "\n", + "Pebblo has two components.\n", + "\n", + "1. Pebblo Safe DocumentLoader for Langchain\n", + "1. Pebblo Daemon\n", + "\n", + "This document describes how to augment your existing Langchain DocumentLoader with Pebblo Safe DocumentLoader to get deep data visibility on the types of Topics and Entities ingested into the Gen-AI Langchain application. For details on `Pebblo Daemon` see this [pebblo daemon](https://daxa-ai.github.io/pebblo-docs/daemon.html) document.\n", + "\n", + "Pebblo Safeloader enables safe data ingestion for Langchain `DocumentLoader`. This is done by wrapping the document loader call with `Pebblo Safe DocumentLoader`.\n", + "\n", + "#### How to Pebblo enable Document Loading?\n", + "\n", + "Assume a Langchain RAG application snippet using `CSVLoader` to read a CSV document for inference.\n", + "\n", + "Here is the snippet of Document loading using `CSVLoader`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.csv_loader import CSVLoader\n", + "\n", + "loader = CSVLoader(\"data/corp_sens_data.csv\")\n", + "documents = loader.load()\n", + "print(documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Pebblo SafeLoader can be enabled with few lines of code change to the above snippet." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.csv_loader import CSVLoader\n", + "from langchain_community.document_loaders import PebbloSafeLoader\n", + "\n", + "loader = PebbloSafeLoader(\n", + " CSVLoader(\"data/corp_sens_data.csv\"),\n", + " name=\"acme-corp-rag-1\", # App name (Mandatory)\n", + " owner=\"Joe Smith\", # Owner (Optional)\n", + " description=\"Support productivity RAG application\", # Description (Optional)\n", + ")\n", + "documents = loader.load()\n", + "print(documents)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index ac1a197b3d259..8f2d6baf9b535 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -160,6 +160,7 @@ PyPDFLoader, UnstructuredPDFLoader, ) +from langchain_community.document_loaders.pebblo import PebbloSafeLoader from langchain_community.document_loaders.polars_dataframe import PolarsDataFrameLoader from langchain_community.document_loaders.powerpoint import UnstructuredPowerPointLoader from langchain_community.document_loaders.psychic import PsychicLoader @@ -284,6 +285,7 @@ "CubeSemanticLoader", "DataFrameLoader", "DatadogLogsLoader", + "PebbloSafeLoader", "DiffbotLoader", "DirectoryLoader", "DiscordChatLoader", diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py new file mode 100644 index 0000000000000..e66e5ddc30af3 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -0,0 +1,291 @@ +"""Pebblo's safe dataloader is a wrapper for document loaders""" + +import logging +import os +import pwd +import uuid +from http import HTTPStatus +from typing import Any, Dict, Iterator, List + +import requests +from langchain_core.documents import Document + +from langchain_community.document_loaders.base import BaseLoader +from langchain_community.utilities.pebblo import ( + CLASSIFIER_URL, + PLUGIN_VERSION, + App, + Doc, + get_full_path, + get_loader_full_path, + get_loader_type, + get_runtime, +) + +logger = logging.getLogger(__name__) + + +class PebbloSafeLoader(BaseLoader): + """Pebblo Safe Loader class is a wrapper around document loaders enabling the data + to be scrutinized. 
+ """ + + _discover_sent: bool = False + _loader_sent: bool = False + + def __init__( + self, + langchain_loader: BaseLoader, + name: str, + owner: str = "", + description: str = "", + ): + if not name or not isinstance(name, str): + raise NameError("Must specify a valid name.") + self.app_name = name + self.load_id = str(uuid.uuid4()) + self.loader = langchain_loader + self.owner = owner + self.description = description + self.source_path = get_loader_full_path(self.loader) + self.source_owner = PebbloSafeLoader.get_file_owner_from_path(self.source_path) + self.docs: List[Document] = [] + loader_name = str(type(self.loader)).split(".")[-1].split("'")[0] + self.source_type = get_loader_type(loader_name) + self.source_path_size = self.get_source_size(self.source_path) + self.source_aggr_size = 0 + self.loader_details = { + "loader": loader_name, + "source_path": self.source_path, + "source_type": self.source_type, + **( + {"source_path_size": str(self.source_path_size)} + if self.source_path_size > 0 + else {} + ), + } + # generate app + self.app = self._get_app_details() + self._send_discover() + + def load(self) -> List[Document]: + """Load Documents. + + Returns: + list: Documents fetched from load method of the wrapped `loader`. + """ + self.docs = self.loader.load() + self._send_loader_doc(loading_end=True) + return self.docs + + def lazy_load(self) -> Iterator[Document]: + """Load documents in lazy fashion. + + Raises: + NotImplementedError: raised when lazy_load id not implemented + within wrapped loader. + + Yields: + list: Documents from loader's lazy loading. + """ + try: + doc_iterator = self.loader.lazy_load() + except NotImplementedError as exc: + err_str = f"{self.loader.__class__.__name__} does not implement lazy_load()" + logger.error(err_str) + raise NotImplementedError(err_str) from exc + while True: + try: + doc = next(doc_iterator) + except StopIteration: + self.docs = [] + self._send_loader_doc(loading_end=True) + break + self.docs = [ + doc, + ] + self._send_loader_doc() + yield doc + + @classmethod + def set_discover_sent(cls) -> None: + cls._discover_sent = True + + @classmethod + def set_loader_sent(cls) -> None: + cls._loader_sent = True + + def _send_loader_doc(self, loading_end: bool = False) -> None: + """Send documents fetched from loader to pebblo-server. Internal method. + + Args: + loading_end (bool, optional): Flag indicating the halt of data + loading by loader. Defaults to False. 
+ """ + headers = {"Accept": "application/json", "Content-Type": "application/json"} + doc_content = [doc.dict() for doc in self.docs] + docs = [] + for doc in doc_content: + doc_source_path = get_full_path(doc.get("metadata", {}).get("source")) + doc_source_owner = PebbloSafeLoader.get_file_owner_from_path( + doc_source_path + ) + doc_source_size = self.get_source_size(doc_source_path) + page_content = str(doc.get("page_content")) + page_content_size = self.calculate_content_size(page_content) + self.source_aggr_size += page_content_size + docs.append( + { + "doc": page_content, + "source_path": doc_source_path, + "last_modified": doc.get("metadata", {}).get("last_modified"), + "file_owner": doc_source_owner, + **( + {"source_path_size": doc_source_size} + if doc_source_size is not None + else {} + ), + } + ) + payload: Dict[str, Any] = { + "name": self.app_name, + "owner": self.owner, + "docs": docs, + "plugin_version": PLUGIN_VERSION, + "load_id": self.load_id, + "loader_details": self.loader_details, + "loading_end": "false", + "source_owner": self.source_owner, + } + if loading_end is True: + payload["loading_end"] = "true" + if "loader_details" in payload: + payload["loader_details"]["source_aggr_size"] = self.source_aggr_size + payload = Doc(**payload).dict(exclude_unset=True) + load_doc_url = f"{CLASSIFIER_URL}/v1/loader/doc" + try: + resp = requests.post( + load_doc_url, headers=headers, json=payload, timeout=20 + ) + if resp.status_code not in [HTTPStatus.OK, HTTPStatus.BAD_GATEWAY]: + logger.warning( + f"Received unexpected HTTP response code: {resp.status_code}" + ) + logger.debug( + f"send_loader_doc: request \ + url {resp.request.url}, \ + body {str(resp.request.body)[:999]} \ + len {len(resp.request.body if resp.request.body else [])} \ + response status{resp.status_code} body {resp.json()}" + ) + except requests.exceptions.RequestException: + logger.warning("Unable to reach pebblo server.") + except Exception: + logger.warning("An Exception caught in _send_loader_doc.") + if loading_end is True: + PebbloSafeLoader.set_loader_sent() + + @staticmethod + def calculate_content_size(page_content: str) -> int: + """Calculate the content size in bytes: + - Encode the string to bytes using a specific encoding (e.g., UTF-8) + - Get the length of the encoded bytes. + + Args: + page_content (str): Data string. + + Returns: + int: Size of string in bytes. + """ + + # Encode the content to bytes using UTF-8 + encoded_content = page_content.encode("utf-8") + size = len(encoded_content) + return size + + def _send_discover(self) -> None: + """Send app discovery payload to pebblo-server. 
Internal method.""" + headers = {"Accept": "application/json", "Content-Type": "application/json"} + payload = self.app.dict(exclude_unset=True) + app_discover_url = f"{CLASSIFIER_URL}/v1/app/discover" + try: + resp = requests.post( + app_discover_url, headers=headers, json=payload, timeout=20 + ) + logger.debug( + f"send_discover: request \ + url {resp.request.url}, \ + headers {resp.request.headers}, \ + body {str(resp.request.body)[:999]} \ + len {len(resp.request.body if resp.request.body else [])} \ + response status{resp.status_code} body {resp.json()}" + ) + if resp.status_code in [HTTPStatus.OK, HTTPStatus.BAD_GATEWAY]: + PebbloSafeLoader.set_discover_sent() + else: + logger.warning( + f"Received unexpected HTTP response code: {resp.status_code}" + ) + except requests.exceptions.RequestException: + logger.warning("Unable to reach pebblo server.") + except Exception: + logger.warning("An Exception caught in _send_discover.") + + def _get_app_details(self) -> App: + """Fetch app details. Internal method. + + Returns: + App: App details. + """ + framework, runtime = get_runtime() + app = App( + name=self.app_name, + owner=self.owner, + description=self.description, + load_id=self.load_id, + runtime=runtime, + framework=framework, + plugin_version=PLUGIN_VERSION, + ) + return app + + @staticmethod + def get_file_owner_from_path(file_path: str) -> str: + """Fetch owner of local file path. + + Args: + file_path (str): Local file path. + + Returns: + str: Name of owner. + """ + try: + file_owner_uid = os.stat(file_path).st_uid + file_owner_name = pwd.getpwuid(file_owner_uid).pw_name + except Exception: + file_owner_name = "unknown" + return file_owner_name + + def get_source_size(self, source_path: str) -> int: + """Fetch size of source path. Source can be a directory or a file. + + Args: + source_path (str): Local path of data source. + + Returns: + int: Source size in bytes. 
+ """ + if not source_path: + return 0 + size = 0 + if os.path.isfile(source_path): + size = os.path.getsize(source_path) + elif os.path.isdir(source_path): + total_size = 0 + for dirpath, _, filenames in os.walk(source_path): + for f in filenames: + fp = os.path.join(dirpath, f) + if not os.path.islink(fp): + total_size += os.path.getsize(fp) + size = total_size + return size diff --git a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py new file mode 100644 index 0000000000000..a52d3c45b4613 --- /dev/null +++ b/libs/community/langchain_community/utilities/pebblo.py @@ -0,0 +1,249 @@ +from __future__ import annotations + +import logging +import os +import pathlib +import platform +from typing import Optional, Tuple + +from langchain_core.env import get_runtime_environment +from langchain_core.pydantic_v1 import BaseModel + +from langchain_community.document_loaders.base import BaseLoader + +logger = logging.getLogger(__name__) + +PLUGIN_VERSION = "0.1.0" +CLASSIFIER_URL = os.getenv("PEBBLO_CLASSIFIER_URL", "http://localhost:8000") + +# Supported loaders for Pebblo safe data loading +file_loader = [ + "JSONLoader", + "S3FileLoader", + "UnstructuredMarkdownLoader", + "UnstructuredPDFLoader", + "UnstructuredFileLoader", + "UnstructuredJsonLoader", + "PyPDFLoader", + "GCSFileLoader", + "AmazonTextractPDFLoader", + "CSVLoader", + "UnstructuredExcelLoader", +] +dir_loader = ["DirectoryLoader", "S3DirLoader", "PyPDFDirectoryLoader"] +in_memory = ["DataFrameLoader"] + +LOADER_TYPE_MAPPING = {"file": file_loader, "dir": dir_loader, "in-memory": in_memory} + +SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory) + +logger = logging.getLogger(__name__) + + +class Runtime(BaseModel): + """This class represents a Runtime. + + Args: + type (Optional[str]): Runtime type. Defaults to "" + host (str): Hostname of runtime. + path (str): Current working directory path. + ip (Optional[str]): Ip of current runtime. Defaults to "" + platform (str): Platform details of current runtime. + os (str): OS name. + os_version (str): OS version. + language (str): Runtime kernel. + language_version (str): version of current runtime kernel. + runtime (Optional[str]) More runtime details. Defaults to "" + """ + + type: str = "local" + host: str + path: str + ip: Optional[str] = "" + platform: str + os: str + os_version: str + language: str + language_version: str + runtime: str = "local" + + +class Framework(BaseModel): + """This class represents a Framework instance. + + Args: + name (str): Name of the Framework. + version (str): Version of the Framework. + """ + + name: str + version: str + + +class App(BaseModel): + """This class represents an AI application. + + Args: + name (str): Name of the app. + owner (str): Owner of the app. + description (Optional[str]): Description of the app. + load_id (str): Unique load_id of the app instance. + runtime (Runtime): Runtime details of app. + framework (Framework): Framework details of the app + plugin_version (str): Plugin version used for the app. + """ + + name: str + owner: str + description: Optional[str] + load_id: str + runtime: Runtime + framework: Framework + plugin_version: str + + +class Doc(BaseModel): + """This class represents a pebblo document. + + Args: + name (str): Name of app originating this document. + owner (str): Owner of app. + docs (list): List of documents with its metadata. + plugin_version (str): Pebblo plugin Version + load_id (str): Unique load_id of the app instance. 
+ loader_details (dict): Loader details with its metadata. + loading_end (bool): Boolean, specifying end of loading of source. + source_owner (str): Owner of the source of the loader. + """ + + name: str + owner: str + docs: list + plugin_version: str + load_id: str + loader_details: dict + loading_end: bool + source_owner: str + + +def get_full_path(path: str) -> str: + """Return absolute local path for a local file/directory, + for network related path, return as is. + + Args: + path (str): Relative path to be resolved. + + Returns: + str: Resolved absolute path. + """ + if ( + not path + or ("://" in path) + or ("/" == path[0]) + or (path in ["unknown", "-", "in-memory"]) + ): + return path + full_path = pathlib.Path(path).resolve() + return str(full_path) + + +def get_loader_type(loader: str) -> str: + """Return loader type among, file, dir or in-memory. + + Args: + loader (str): Name of the loader, whose type is to be resolved. + + Returns: + str: One of the loader type among, file/dir/in-memory. + """ + for loader_type, loaders in LOADER_TYPE_MAPPING.items(): + if loader in loaders: + return loader_type + return "unknown" + + +def get_loader_full_path(loader: BaseLoader) -> str: + """Return absolute source path of source of loader based on the + keys present in Document object from loader. + + Args: + loader (BaseLoader): Langchain document loader, derived from Baseloader. + """ + from langchain_community.document_loaders import ( + DataFrameLoader, + GCSFileLoader, + S3FileLoader, + ) + + location = "-" + if not isinstance(loader, BaseLoader): + logger.error( + "loader is not derived from BaseLoader, source location will be unknown!" + ) + return location + loader_dict = loader.__dict__ + try: + if "bucket" in loader_dict: + if isinstance(loader, GCSFileLoader): + location = f"gc://{loader.bucket}/{loader.blob}" + elif isinstance(loader, S3FileLoader): + location = f"s3://{loader.bucket}/{loader.key}" + elif "path" in loader_dict: + location = loader_dict["path"] + elif "file_path" in loader_dict: + location = loader_dict["file_path"] + elif "web_paths" in loader_dict: + location = loader_dict["web_paths"][0] + # For in-memory types: + elif isinstance(loader, DataFrameLoader): + location = "in-memory" + except Exception: + pass + return get_full_path(str(location)) + + +def get_runtime() -> Tuple[Framework, Runtime]: + """Fetch the current Framework and Runtime details. + + Returns: + Tuple[Framework, Runtime]: Framework and Runtime for the current app instance. 
+ """ + runtime_env = get_runtime_environment() + framework = Framework( + name="langchain", version=runtime_env.get("library_version", None) + ) + uname = platform.uname() + runtime = Runtime( + host=uname.node, + path=os.environ["PWD"], + platform=runtime_env.get("platform", "unknown"), + os=uname.system, + os_version=uname.version, + ip=get_ip(), + language=runtime_env.get("runtime", "unknown"), + language_version=runtime_env.get("runtime_version", "unknown"), + ) + + if "Darwin" in runtime.os: + runtime.type = "desktop" + runtime.runtime = "Mac OSX" + + logger.debug(f"framework {framework}") + logger.debug(f"runtime {runtime}") + return framework, runtime + + +def get_ip() -> str: + """Fetch local runtime ip address + + Returns: + str: IP address + """ + import socket # lazy imports + + host = socket.gethostname() + try: + public_ip = socket.gethostbyname(host) + except Exception: + public_ip = socket.gethostbyname("localhost") + return public_ip diff --git a/libs/community/tests/examples/test_empty.csv b/libs/community/tests/examples/test_empty.csv new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libs/community/tests/examples/test_nominal.csv b/libs/community/tests/examples/test_nominal.csv new file mode 100644 index 0000000000000..65debb11207c2 --- /dev/null +++ b/libs/community/tests/examples/test_nominal.csv @@ -0,0 +1,3 @@ +column1,column2,column3 +value1,value2,value3 +value4,value5,value6 \ No newline at end of file diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index 98865797e3b78..e4b406add5773 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -49,6 +49,7 @@ "CubeSemanticLoader", "DataFrameLoader", "DatadogLogsLoader", + "PebbloSafeLoader", "DiffbotLoader", "DirectoryLoader", "DiscordChatLoader", diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py new file mode 100644 index 0000000000000..9ab487c8e7804 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py @@ -0,0 +1,114 @@ +import os +from pathlib import Path +from typing import Dict + +import pytest +from langchain_core.documents import Document +from pytest_mock import MockerFixture + +from langchain_community.document_loaders import CSVLoader, PyPDFLoader + +EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent.parent / "examples/") + + +class MockResponse: + def __init__(self, json_data: Dict, status_code: int): + self.json_data = json_data + self.status_code = status_code + + def json(self) -> Dict: + return self.json_data + + +def test_pebblo_import() -> None: + """Test that the Pebblo safe loader can be imported.""" + from langchain_community.document_loaders import PebbloSafeLoader # noqa: F401 + + +def test_empty_filebased_loader(mocker: MockerFixture) -> None: + """Test basic file based csv loader.""" + # Setup + from langchain_community.document_loaders import PebbloSafeLoader + + mocker.patch.multiple( + "requests", + get=MockResponse(json_data={"data": ""}, status_code=200), + post=MockResponse(json_data={"data": ""}, status_code=200), + ) + + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_empty.csv") + expected_docs: list = [] + + # Exercise + loader = PebbloSafeLoader( + CSVLoader(file_path=file_path), + "dummy_app_name", + "dummy_owner", + "dummy_description", + ) 
+ result = loader.load() + + # Assert + assert result == expected_docs + + +def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None: + # Setup + from langchain_community.document_loaders import PebbloSafeLoader + + mocker.patch.multiple( + "requests", + get=MockResponse(json_data={"data": ""}, status_code=200), + post=MockResponse(json_data={"data": ""}, status_code=200), + ) + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_nominal.csv") + expected_docs = [ + Document( + page_content="column1: value1\ncolumn2: value2\ncolumn3: value3", + metadata={"source": file_path, "row": 0}, + ), + Document( + page_content="column1: value4\ncolumn2: value5\ncolumn3: value6", + metadata={"source": file_path, "row": 1}, + ), + ] + + # Exercise + loader = PebbloSafeLoader( + CSVLoader(file_path=file_path), + "dummy_app_name", + "dummy_owner", + "dummy_description", + ) + result = loader.load() + + # Assert + assert result == expected_docs + + +@pytest.mark.requires("pypdf") +def test_pdf_lazy_load(mocker: MockerFixture) -> None: + # Setup + from langchain_community.document_loaders import PebbloSafeLoader + + mocker.patch.multiple( + "requests", + get=MockResponse(json_data={"data": ""}, status_code=200), + post=MockResponse(json_data={"data": ""}, status_code=200), + ) + file_path = os.path.join( + EXAMPLE_DOCS_DIRECTORY, "multi-page-forms-sample-2-page.pdf" + ) + + # Exercise + loader = PebbloSafeLoader( + PyPDFLoader(file_path=file_path), + "dummy_app_name", + "dummy_owner", + "dummy_description", + ) + + result = list(loader.lazy_load()) + + # Assert + assert len(result) == 2 From 37e1275f9ec6675912984add3a166d27abc8632f Mon Sep 17 00:00:00 2001 From: Abhishek Jain Date: Tue, 13 Feb 2024 11:27:27 +0530 Subject: [PATCH 20/25] community[patch]: Fixed the 'aembed' method of 'CohereEmbeddings'. (#16497) **Description:** - The existing code was trying to find a `.embeddings` property on the `Coroutine` returned by calling `cohere.async_client.embed`. - Instead, the `.embeddings` property is present on the value returned by the `Coroutine`. - Also, it seems that the original cohere client expects a value of `max_retries` to not be `None`. Hence, setting the default value of `max_retries` to `3`. 
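
For clarity, a minimal sketch of the corrected pattern (the `async_client` argument stands in for the Cohere async client held by the wrapper; this is an illustrative fragment, not the full class):

```python
from typing import Any, List


async def aembed(async_client: Any, model: str, texts: List[str]) -> List[List[float]]:
    # Await the coroutine first; `.embeddings` lives on the awaited response,
    # not on the coroutine object returned by `embed(...)`.
    response = await async_client.embed(model=model, texts=texts)
    return [list(map(float, e)) for e in response.embeddings]
```
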
--------- Co-authored-by: Bagatur --- .../langchain_community/embeddings/cohere.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/libs/community/langchain_community/embeddings/cohere.py b/libs/community/langchain_community/embeddings/cohere.py index fd95b58ea6ce8..2d4676d1254d8 100644 --- a/libs/community/langchain_community/embeddings/cohere.py +++ b/libs/community/langchain_community/embeddings/cohere.py @@ -34,7 +34,7 @@ class CohereEmbeddings(BaseModel, Embeddings): cohere_api_key: Optional[str] = None - max_retries: Optional[int] = None + max_retries: Optional[int] = 3 """Maximum number of retries to make when generating.""" request_timeout: Optional[float] = None """Timeout in seconds for the Cohere API request.""" @@ -92,11 +92,13 @@ def embed( async def aembed( self, texts: List[str], *, input_type: Optional[str] = None ) -> List[List[float]]: - embeddings = await self.async_client.embed( - model=self.model, - texts=texts, - input_type=input_type, - truncate=self.truncate, + embeddings = ( + await self.async_client.embed( + model=self.model, + texts=texts, + input_type=input_type, + truncate=self.truncate, + ) ).embeddings return [list(map(float, e)) for e in embeddings] From c0ce93236a15d77d18164482106516c6b523bd12 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Mon, 12 Feb 2024 21:58:35 -0800 Subject: [PATCH 21/25] experimental[patch]: fix zero-shot pandas agent (#17442) --- .../agents/agent_toolkits/pandas/base.py | 25 ++-- libs/experimental/poetry.lock | 119 ++++++++++++++++-- libs/experimental/pyproject.toml | 4 + .../tests/unit_tests/agents/__init__.py | 0 .../agents/agent_toolkits/__init__.py | 0 .../agents/agent_toolkits/pandas/__init__.py | 0 .../agents/agent_toolkits/pandas/test_base.py | 15 +++ 7 files changed, 135 insertions(+), 28 deletions(-) create mode 100644 libs/experimental/tests/unit_tests/agents/__init__.py create mode 100644 libs/experimental/tests/unit_tests/agents/agent_toolkits/__init__.py create mode 100644 libs/experimental/tests/unit_tests/agents/agent_toolkits/pandas/__init__.py create mode 100644 libs/experimental/tests/unit_tests/agents/agent_toolkits/pandas/test_base.py diff --git a/libs/experimental/langchain_experimental/agents/agent_toolkits/pandas/base.py b/libs/experimental/langchain_experimental/agents/agent_toolkits/pandas/base.py index 0970c39595b70..6fa67dc60f8ca 100644 --- a/libs/experimental/langchain_experimental/agents/agent_toolkits/pandas/base.py +++ b/libs/experimental/langchain_experimental/agents/agent_toolkits/pandas/base.py @@ -10,7 +10,7 @@ RunnableAgent, RunnableMultiActionAgent, ) -from langchain.agents.mrkl.base import ZeroShotAgent +from langchain.agents.mrkl.prompt import FORMAT_INSTRUCTIONS from langchain.agents.openai_functions_agent.base import ( OpenAIFunctionsAgent, create_openai_functions_agent, @@ -18,7 +18,11 @@ from langchain_core.callbacks import BaseCallbackManager from langchain_core.language_models import LanguageModelLike from langchain_core.messages import SystemMessage -from langchain_core.prompts import BasePromptTemplate, ChatPromptTemplate +from langchain_core.prompts import ( + BasePromptTemplate, + ChatPromptTemplate, + PromptTemplate, +) from langchain_core.tools import BaseTool from langchain_core.utils.interactive_env import is_interactive_env @@ -43,7 +47,6 @@ def _get_multi_prompt( suffix: Optional[str] = None, include_df_in_prompt: Optional[bool] = True, number_of_head_rows: int = 5, - tools: Sequence[BaseTool] = (), ) -> 
BasePromptTemplate: if suffix is not None: suffix_to_use = suffix @@ -53,11 +56,8 @@ def _get_multi_prompt( suffix_to_use = SUFFIX_NO_DF prefix = prefix if prefix is not None else MULTI_DF_PREFIX - prompt = ZeroShotAgent.create_prompt( - tools, - prefix=prefix, - suffix=suffix_to_use, - ) + template = "\n\n".join([prefix, "{tools}", FORMAT_INSTRUCTIONS, suffix_to_use]) + prompt = PromptTemplate.from_template(template) partial_prompt = prompt.partial() if "dfs_head" in partial_prompt.input_variables: dfs_head = "\n\n".join([d.head(number_of_head_rows).to_markdown() for d in dfs]) @@ -74,7 +74,6 @@ def _get_single_prompt( suffix: Optional[str] = None, include_df_in_prompt: Optional[bool] = True, number_of_head_rows: int = 5, - tools: Sequence[BaseTool] = (), ) -> BasePromptTemplate: if suffix is not None: suffix_to_use = suffix @@ -84,11 +83,8 @@ def _get_single_prompt( suffix_to_use = SUFFIX_NO_DF prefix = prefix if prefix is not None else PREFIX - prompt = ZeroShotAgent.create_prompt( - tools, - prefix=prefix, - suffix=suffix_to_use, - ) + template = "\n\n".join([prefix, "{tools}", FORMAT_INSTRUCTIONS, suffix_to_use]) + prompt = PromptTemplate.from_template(template) partial_prompt = prompt.partial() if "df_head" in partial_prompt.input_variables: @@ -257,7 +253,6 @@ def create_pandas_dataframe_agent( suffix=suffix, include_df_in_prompt=include_df_in_prompt, number_of_head_rows=number_of_head_rows, - tools=tools, ) agent: Union[BaseSingleActionAgent, BaseMultiActionAgent] = RunnableAgent( runnable=create_react_agent(llm, tools, prompt), # type: ignore diff --git a/libs/experimental/poetry.lock b/libs/experimental/poetry.lock index 4e75e3d6904b1..65cd4f58b0aa3 100644 --- a/libs/experimental/poetry.lock +++ b/libs/experimental/poetry.lock @@ -1642,7 +1642,7 @@ files = [ [[package]] name = "langchain" -version = "0.1.5" +version = "0.1.6" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" @@ -1654,8 +1654,8 @@ aiohttp = "^3.8.3" async-timeout = {version = "^4.0.0", markers = "python_version < \"3.11\""} dataclasses-json = ">= 0.5.7, < 0.7" jsonpatch = "^1.33" -langchain-community = ">=0.0.17,<0.1" -langchain-core = ">=0.1.16,<0.2" +langchain-community = ">=0.0.18,<0.1" +langchain-core = ">=0.1.22,<0.2" langsmith = ">=0.0.83,<0.1" numpy = "^1" pydantic = ">=1,<3" @@ -1685,7 +1685,7 @@ url = "../langchain" [[package]] name = "langchain-community" -version = "0.0.17" +version = "0.0.19" description = "Community contributed LangChain integrations." 
optional = false python-versions = ">=3.8.1,<4.0" @@ -1695,7 +1695,7 @@ develop = true [package.dependencies] aiohttp = "^3.8.3" dataclasses-json = ">= 0.5.7, < 0.7" -langchain-core = ">=0.1.16,<0.2" +langchain-core = ">=0.1.21,<0.2" langsmith = ">=0.0.83,<0.1" numpy = "^1" PyYAML = ">=5.3" @@ -1705,7 +1705,7 @@ tenacity = "^8.1.0" [package.extras] cli = ["typer (>=0.9.0,<0.10.0)"] -extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-ai-documentintelligence (>=1.0.0b1,<2.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cohere (>=4,<5)", "dashvector (>=1.0.1,<2.0.0)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "elasticsearch (>=8.12.0,<9.0.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "gradientai (>=1.4.0,<2.0.0)", "hdbcli (>=2.19.21,<3.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "httpx (>=0.24.1,<0.25.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "lxml (>=4.9.2,<5.0.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "oci (>=2.119.1,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "oracle-ads (>=2.9.1,<3.0.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)", "zhipuai (>=1.0.7,<2.0.0)"] +extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-ai-documentintelligence (>=1.0.0b1,<2.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cohere (>=4,<5)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "elasticsearch (>=8.12.0,<9.0.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "gradientai (>=1.4.0,<2.0.0)", "hdbcli (>=2.19.21,<3.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", 
"httpx (>=0.24.1,<0.25.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "lxml (>=4.9.2,<5.0.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "nvidia-riva-client (>=2.14.0,<3.0.0)", "oci (>=2.119.1,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "oracle-ads (>=2.9.1,<3.0.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)", "zhipuai (>=1.0.7,<2.0.0)"] [package.source] type = "directory" @@ -1713,7 +1713,7 @@ url = "../community" [[package]] name = "langchain-core" -version = "0.1.18" +version = "0.1.22" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" @@ -1723,7 +1723,7 @@ develop = true [package.dependencies] anyio = ">=3,<5" jsonpatch = "^1.33" -langsmith = ">=0.0.83,<0.1" +langsmith = "^0.0.87" packaging = "^23.2" pydantic = ">=1,<3" PyYAML = ">=5.3" @@ -1753,13 +1753,13 @@ data = ["language-data (>=1.1,<2.0)"] [[package]] name = "langsmith" -version = "0.0.83" +version = "0.0.87" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langsmith-0.0.83-py3-none-any.whl", hash = "sha256:a5bb7ac58c19a415a9d5f51db56dd32ee2cd7343a00825bbc2018312eb3d122a"}, - {file = "langsmith-0.0.83.tar.gz", hash = "sha256:94427846b334ad9bdbec3266fee12903fe9f5448f628667689d0412012aaf392"}, + {file = "langsmith-0.0.87-py3-none-any.whl", hash = "sha256:8903d3811b9fc89eb18f5961c8e6935fbd2d0f119884fbf30dc70b8f8f4121fc"}, + {file = "langsmith-0.0.87.tar.gz", hash = "sha256:36c4cc47e5b54be57d038036a30fb19ce6e4c73048cd7a464b8f25b459694d34"}, ] [package.dependencies] @@ -2317,6 +2317,73 @@ files = [ {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, ] +[[package]] +name = "pandas" +version = "2.0.3" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = 
"sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] + [[package]] name = "pandocfilters" version = "1.5.0" @@ -2991,6 +3058,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -4124,6 +4192,20 @@ files = [ [package.dependencies] mpmath = ">=0.19" +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = true +python-versions = ">=3.7" +files = [ + {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, + {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "tenacity" version = "8.2.3" @@ -4701,6 +4783,17 @@ files = [ mypy-extensions = ">=0.3.0" typing-extensions = ">=3.7.4" +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = true +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + [[package]] name = "uri-template" version = "1.3.0" @@ -4962,9 +5055,9 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -extended-testing = ["faker", "jinja2", "presidio-analyzer", "presidio-anonymizer", "sentence-transformers", "vowpal-wabbit-next"] +extended-testing = ["faker", "jinja2", "pandas", "presidio-analyzer", "presidio-anonymizer", "sentence-transformers", "tabulate", "vowpal-wabbit-next"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "11321a2551a49e9cb432348f03e991d1f13368624b1e4df5a721b794114b4f02" +content-hash = "0e4b297b0a8c595fbfe1e8a00d5a13057b1bdd4a0ce08d415ca4c4a7712cee88" diff --git a/libs/experimental/pyproject.toml b/libs/experimental/pyproject.toml index b3386cc600c35..70a61a2821c0b 100644 --- a/libs/experimental/pyproject.toml +++ b/libs/experimental/pyproject.toml @@ -18,6 +18,8 @@ faker = {version = "^19.3.1", optional = true} vowpal-wabbit-next = {version = "0.6.0", optional = true} sentence-transformers = {version = "^2", optional = true} jinja2 = {version = "^3", optional = true} +pandas = { version = "^2.0.1", optional = true } +tabulate = {version = "^0.9.0", optional = true} [tool.poetry.group.lint] optional = true @@ -79,6 +81,8 @@ 
extended_testing = [ "vowpal-wabbit-next", "sentence-transformers", "jinja2", + "pandas", + "tabulate", ] [tool.ruff.lint] diff --git a/libs/experimental/tests/unit_tests/agents/__init__.py b/libs/experimental/tests/unit_tests/agents/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libs/experimental/tests/unit_tests/agents/agent_toolkits/__init__.py b/libs/experimental/tests/unit_tests/agents/agent_toolkits/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libs/experimental/tests/unit_tests/agents/agent_toolkits/pandas/__init__.py b/libs/experimental/tests/unit_tests/agents/agent_toolkits/pandas/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libs/experimental/tests/unit_tests/agents/agent_toolkits/pandas/test_base.py b/libs/experimental/tests/unit_tests/agents/agent_toolkits/pandas/test_base.py new file mode 100644 index 0000000000000..8f3a063f6881e --- /dev/null +++ b/libs/experimental/tests/unit_tests/agents/agent_toolkits/pandas/test_base.py @@ -0,0 +1,15 @@ +import sys + +import pytest + +from langchain_experimental.agents import create_pandas_dataframe_agent +from tests.unit_tests.fake_llm import FakeLLM + + +@pytest.mark.requires("pandas", "tabulate") +@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher") +def test_create_pandas_dataframe_agent() -> None: + import pandas as pd + + create_pandas_dataframe_agent(FakeLLM(), pd.DataFrame()) + create_pandas_dataframe_agent(FakeLLM(), [pd.DataFrame(), pd.DataFrame()]) From 3925071dd61363db0d483412c9e64bc5a7c60307 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Mon, 12 Feb 2024 22:52:07 -0800 Subject: [PATCH 22/25] =?UTF-8?q?langchain[patch],=20templates[patch]:=20f?= =?UTF-8?q?ix=20multi=20query=20retriever,=20web=20re=E2=80=A6=20(#17434)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …search retriever Fixes #17352 --- .../langchain/retrievers/multi_query.py | 33 +++++++------------ .../langchain/retrievers/web_research.py | 19 +++-------- .../retrievers/test_web_research.py | 2 +- .../rag_ollama_multi_query/chain.py | 30 ++--------------- 4 files changed, 20 insertions(+), 64 deletions(-) diff --git a/libs/langchain/langchain/retrievers/multi_query.py b/libs/langchain/langchain/retrievers/multi_query.py index 7d60b2215140b..ca7e731c51a67 100644 --- a/libs/langchain/langchain/retrievers/multi_query.py +++ b/libs/langchain/langchain/retrievers/multi_query.py @@ -1,39 +1,28 @@ import asyncio import logging -from typing import List, Sequence +from typing import List, Optional, Sequence from langchain_core.callbacks import ( AsyncCallbackManagerForRetrieverRun, CallbackManagerForRetrieverRun, ) from langchain_core.documents import Document -from langchain_core.language_models import BaseLLM +from langchain_core.language_models import BaseLanguageModel +from langchain_core.output_parsers import BaseOutputParser from langchain_core.prompts.prompt import PromptTemplate -from langchain_core.pydantic_v1 import BaseModel, Field from langchain_core.retrievers import BaseRetriever from langchain.chains.llm import LLMChain -from langchain.output_parsers.pydantic import PydanticOutputParser logger = logging.getLogger(__name__) -class LineList(BaseModel): - """List of lines.""" - - lines: List[str] = Field(description="Lines of text") - """List of lines.""" - - -class LineListOutputParser(PydanticOutputParser): +class 
LineListOutputParser(BaseOutputParser[List[str]]): """Output parser for a list of lines.""" - def __init__(self) -> None: - super().__init__(pydantic_object=LineList) - - def parse(self, text: str) -> LineList: + def parse(self, text: str) -> List[str]: lines = text.strip().split("\n") - return LineList(lines=lines) + return lines # Default prompt @@ -63,6 +52,7 @@ class MultiQueryRetriever(BaseRetriever): llm_chain: LLMChain verbose: bool = True parser_key: str = "lines" + """DEPRECATED. parser_key is no longer used and should not be specified.""" include_original: bool = False """Whether to include the original query in the list of generated queries.""" @@ -70,9 +60,9 @@ class MultiQueryRetriever(BaseRetriever): def from_llm( cls, retriever: BaseRetriever, - llm: BaseLLM, + llm: BaseLanguageModel, prompt: PromptTemplate = DEFAULT_QUERY_PROMPT, - parser_key: str = "lines", + parser_key: Optional[str] = None, include_original: bool = False, ) -> "MultiQueryRetriever": """Initialize from llm using default template. @@ -91,7 +81,6 @@ def from_llm( return cls( retriever=retriever, llm_chain=llm_chain, - parser_key=parser_key, include_original=include_original, ) @@ -129,7 +118,7 @@ async def agenerate_queries( response = await self.llm_chain.acall( inputs={"question": question}, callbacks=run_manager.get_child() ) - lines = getattr(response["text"], self.parser_key, []) + lines = response["text"] if self.verbose: logger.info(f"Generated queries: {lines}") return lines @@ -189,7 +178,7 @@ def generate_queries( response = self.llm_chain( {"question": question}, callbacks=run_manager.get_child() ) - lines = getattr(response["text"], self.parser_key, []) + lines = response["text"] if self.verbose: logger.info(f"Generated queries: {lines}") return lines diff --git a/libs/langchain/langchain/retrievers/web_research.py b/libs/langchain/langchain/retrievers/web_research.py index 149ef247af730..b992490f61723 100644 --- a/libs/langchain/langchain/retrievers/web_research.py +++ b/libs/langchain/langchain/retrievers/web_research.py @@ -12,6 +12,7 @@ ) from langchain_core.documents import Document from langchain_core.language_models import BaseLLM +from langchain_core.output_parsers import BaseOutputParser from langchain_core.prompts import BasePromptTemplate, PromptTemplate from langchain_core.pydantic_v1 import BaseModel, Field from langchain_core.retrievers import BaseRetriever @@ -19,7 +20,6 @@ from langchain.chains import LLMChain from langchain.chains.prompt_selector import ConditionalPromptSelector -from langchain.output_parsers.pydantic import PydanticOutputParser from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter logger = logging.getLogger(__name__) @@ -50,21 +50,12 @@ class SearchQueries(BaseModel): ) -class LineList(BaseModel): - """List of questions.""" - - lines: List[str] = Field(description="Questions") - - -class QuestionListOutputParser(PydanticOutputParser): +class QuestionListOutputParser(BaseOutputParser[List[str]]): """Output parser for a list of numbered questions.""" - def __init__(self) -> None: - super().__init__(pydantic_object=LineList) - - def parse(self, text: str) -> LineList: + def parse(self, text: str) -> List[str]: lines = re.findall(r"\d+\..*?(?:\n|$)", text) - return LineList(lines=lines) + return lines class WebResearchRetriever(BaseRetriever): @@ -176,7 +167,7 @@ def _get_relevant_documents( logger.info("Generating questions for Google Search ...") result = self.llm_chain({"question": query}) logger.info(f"Questions for Google Search 
(raw): {result}") - questions = getattr(result["text"], "lines", []) + questions = result["text"] logger.info(f"Questions for Google Search: {questions}") # Get urls diff --git a/libs/langchain/tests/unit_tests/retrievers/test_web_research.py b/libs/langchain/tests/unit_tests/retrievers/test_web_research.py index a052e59b72264..29878dd3b4b9d 100644 --- a/libs/langchain/tests/unit_tests/retrievers/test_web_research.py +++ b/libs/langchain/tests/unit_tests/retrievers/test_web_research.py @@ -33,4 +33,4 @@ def test_list_output_parser(text: str, expected: List[str]) -> None: parser = QuestionListOutputParser() result = parser.parse(text) - assert result.lines == expected + assert result == expected diff --git a/templates/rag-ollama-multi-query/rag_ollama_multi_query/chain.py b/templates/rag-ollama-multi-query/rag_ollama_multi_query/chain.py index dea1ea5fe53db..1bc81584532b0 100644 --- a/templates/rag-ollama-multi-query/rag_ollama_multi_query/chain.py +++ b/templates/rag-ollama-multi-query/rag_ollama_multi_query/chain.py @@ -1,7 +1,3 @@ -from typing import List - -from langchain.chains import LLMChain -from langchain.output_parsers import PydanticOutputParser from langchain.retrievers.multi_query import MultiQueryRetriever from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.chat_models import ChatOllama, ChatOpenAI @@ -10,7 +6,7 @@ from langchain_community.vectorstores import Chroma from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import ChatPromptTemplate, PromptTemplate -from langchain_core.pydantic_v1 import BaseModel, Field +from langchain_core.pydantic_v1 import BaseModel from langchain_core.runnables import RunnableParallel, RunnablePassthrough # Load @@ -29,23 +25,6 @@ ) -# Output parser will split the LLM result into a list of queries -class LineList(BaseModel): - # "lines" is the key (attribute name) of the parsed output - lines: List[str] = Field(description="Lines of text") - - -class LineListOutputParser(PydanticOutputParser): - def __init__(self) -> None: - super().__init__(pydantic_object=LineList) - - def parse(self, text: str) -> LineList: - lines = text.strip().split("\n") - return LineList(lines=lines) - - -output_parser = LineListOutputParser() - QUERY_PROMPT = PromptTemplate( input_variables=["question"], template="""You are an AI language model assistant. Your task is to generate five @@ -60,12 +39,9 @@ def parse(self, text: str) -> LineList: ollama_llm = "zephyr" llm = ChatOllama(model=ollama_llm) -# Chain -llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser) - # Run -retriever = MultiQueryRetriever( - retriever=vectorstore.as_retriever(), llm_chain=llm_chain, parser_key="lines" +retriever = MultiQueryRetriever.from_llm( + vectorstore.as_retriever(), llm, prompt=QUERY_PROMPT ) # "lines" is the key (attribute name) of the parsed output # RAG prompt From 729c6d6827bb6e6bbee235028236a604e0998cbd Mon Sep 17 00:00:00 2001 From: merlin-quix <116729413+merlin-quix@users.noreply.github.com> Date: Tue, 13 Feb 2024 23:09:15 +0700 Subject: [PATCH 23/25] docs: add use case for managing chat messages via Apache Kafka (#16771) Adding a new notebook that demonstrates how to use LangChain's standard chat features while passing the chat messages back and forth via Apache Kafka. This goal is to simulate an architecture where the chat front end and the LLM are running as separate services that need to communicate with one another over an internal nework. 
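
To make the message flow concrete, here is a rough sketch of what the LLM-side service could look like (the broker address, topic names, and the `respond` function are placeholders and an assumption on my part, not the notebook's exact cells):

```python
from quixstreams import Application


def respond(message: dict) -> dict:
    # Placeholder: this is where the LangChain conversation chain would
    # generate a reply to the incoming chat message.
    return {"role": "AI", "text": f"echo: {message.get('text', '')}"}


app = Application(broker_address="localhost:9092", consumer_group="chat-llm")
chat_input = app.topic("chat-input", value_deserializer="json")
chat_output = app.topic("chat-output", value_serializer="json")

sdf = app.dataframe(chat_input)  # stream of incoming chat messages
sdf = sdf.apply(respond)  # produce a reply for each message
sdf = sdf.to_topic(chat_output)  # publish replies back to Kafka

app.run(sdf)
```
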
It's an alternative to typical pattern of requesting a reponse from the model via a REST API (there's more info on why you would want to do this at the end of the notebook). NOTE: Assuming "uses cases" is the right place for this but feel free to propose another location. --------- Co-authored-by: Bagatur Co-authored-by: Harrison Chase --- cookbook/apache_kafka_message_handling.ipynb | 922 +++++++++++++++++++ 1 file changed, 922 insertions(+) create mode 100644 cookbook/apache_kafka_message_handling.ipynb diff --git a/cookbook/apache_kafka_message_handling.ipynb b/cookbook/apache_kafka_message_handling.ipynb new file mode 100644 index 0000000000000..616c12ac68039 --- /dev/null +++ b/cookbook/apache_kafka_message_handling.ipynb @@ -0,0 +1,922 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rT1cmV4qCa2X" + }, + "source": [ + "# Using Apache Kafka to route messages\n", + "\n", + "---\n", + "\n", + "\n", + "\n", + "This notebook shows you how to use LangChain's standard chat features while passing the chat messages back and forth via Apache Kafka.\n", + "\n", + "This goal is to simulate an architecture where the chat front end and the LLM are running as separate services that need to communicate with one another over an internal nework.\n", + "\n", + "It's an alternative to typical pattern of requesting a reponse from the model via a REST API (there's more info on why you would want to do this at the end of the notebook)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UPYtfAR_9YxZ" + }, + "source": [ + "### 1. Install the main dependencies\n", + "\n", + "Dependencies include:\n", + "\n", + "- The Quix Streams library for managing interactions with Apache Kafka (or Kafka-like tools such as Redpanda) in a \"Pandas-like\" way.\n", + "- The LangChain library for managing interactions with Llama-2 and storing conversation state." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZX5tfKiy9cN-" + }, + "outputs": [], + "source": [ + "!pip install quixstreams==2.1.2a langchain==0.0.340 huggingface_hub==0.19.4 langchain-experimental==0.0.42 python-dotenv" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "losTSdTB9d9O" + }, + "source": [ + "### 2. Build and install the llama-cpp-python library (with CUDA enabled so that we can advantage of Google Colab GPU\n", + "\n", + "The `llama-cpp-python` library is a Python wrapper around the `llama-cpp` library which enables you to efficiently leverage just a CPU to run quantized LLMs.\n", + "\n", + "When you use the standard `pip install llama-cpp-python` command, you do not get GPU support by default. Generation can be very slow if you rely on just the CPU in Google Colab, so the following command adds an extra option to build and install\n", + "`llama-cpp-python` with GPU support (make sure you have a GPU-enabled runtime selected in Google Colab)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-JCQdl1G9tbl" + }, + "outputs": [], + "source": [ + "!CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" FORCE_CMAKE=1 pip install llama-cpp-python" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5_vjVIAh9rLl" + }, + "source": [ + "### 3. Download and setup Kafka and Zookeeper instances\n", + "\n", + "Download the Kafka binaries from the Apache website and start the servers as daemons. We'll use the default configurations (provided by Apache Kafka) for spinning up the instances." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "zFz7czGRW5Wr" + }, + "outputs": [], + "source": [ + "!curl -sSOL https://dlcdn.apache.org/kafka/3.6.1/kafka_2.13-3.6.1.tgz\n", + "!tar -xzf kafka_2.13-3.6.1.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Uf7NR_UZ9wye" + }, + "outputs": [], + "source": [ + "!./kafka_2.13-3.6.1/bin/zookeeper-server-start.sh -daemon ./kafka_2.13-3.6.1/config/zookeeper.properties\n", + "!./kafka_2.13-3.6.1/bin/kafka-server-start.sh -daemon ./kafka_2.13-3.6.1/config/server.properties\n", + "!echo \"Waiting for 10 secs until kafka and zookeeper services are up and running\"\n", + "!sleep 10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H3SafFuS94p1" + }, + "source": [ + "### 4. Check that the Kafka Daemons are running\n", + "\n", + "Show the running processes and filter it for Java processes (you should see two—one for each server)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CZDC2lQP99yp" + }, + "outputs": [], + "source": [ + "!ps aux | grep -E '[j]ava'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Snoxmjb5-V37" + }, + "source": [ + "### 5. Import the required dependencies and initialize required variables\n", + "\n", + "Import the Quix Streams library for interacting with Kafka, and the necessary LangChain components for running a `ConversationChain`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "plR9e_MF-XL5" + }, + "outputs": [], + "source": [ + "# Import utility libraries\n", + "import json\n", + "import random\n", + "import re\n", + "import time\n", + "import uuid\n", + "from os import environ\n", + "from pathlib import Path\n", + "from random import choice, randint, random\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "# Import a Hugging Face utility to download models directly from Hugging Face hub:\n", + "from huggingface_hub import hf_hub_download\n", + "from langchain.chains import ConversationChain\n", + "\n", + "# Import Langchain modules for managing prompts and conversation chains:\n", + "from langchain.llms import LlamaCpp\n", + "from langchain.memory import ConversationTokenBufferMemory\n", + "from langchain.prompts import PromptTemplate, load_prompt\n", + "from langchain.schema import SystemMessage\n", + "from langchain_experimental.chat_models import Llama2Chat\n", + "from quixstreams import Application, State, message_key\n", + "\n", + "# Import Quix dependencies\n", + "from quixstreams.kafka import Producer\n", + "\n", + "# Initialize global variables.\n", + "AGENT_ROLE = \"AI\"\n", + "chat_id = \"\"\n", + "\n", + "# Set the current role to the role constant and initialize variables for supplementary customer metadata:\n", + "role = AGENT_ROLE" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HgJjJ9aZ-liy" + }, + "source": [ + "### 6. Download the \"llama-2-7b-chat.Q4_K_M.gguf\" model\n", + "\n", + "Download the quantized LLama-2 7B model from Hugging Face which we will use as a local LLM (rather than relying on REST API calls to an external service)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 67, + "referenced_widgets": [ + "969343cdbe604a26926679bbf8bd2dda", + "d8b8370c9b514715be7618bfe6832844", + "0def954cca89466b8408fadaf3b82e64", + "462482accc664729980562e208ceb179", + "80d842f73c564dc7b7cc316c763e2633", + "fa055d9f2a9d4a789e9cf3c89e0214e5", + "30ecca964a394109ac2ad757e3aec6c0", + "fb6478ce2dac489bb633b23ba0953c5c", + "734b0f5da9fc4307a95bab48cdbb5d89", + "b32f3a86a74741348511f4e136744ac8", + "e409071bff5a4e2d9bf0e9f5cc42231b" + ] + }, + "id": "Qwu4YoSA-503", + "outputId": "f956976c-7485-415b-ac93-4336ade31964" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The model path does not exist in state. Downloading model...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "969343cdbe604a26926679bbf8bd2dda", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "llama-2-7b-chat.Q4_K_M.gguf: 0%| | 0.00/4.08G [00:00 Date: Tue, 13 Feb 2024 11:45:49 -0500 Subject: [PATCH 24/25] Framework for supporting more languages in LanguageParser (#13318) ## Description I am submitting this for a school project as part of a team of 5. Other team members are @LeilaChr, @maazh10, @Megabear137, @jelalalamy. This PR also has contributions from community members @Harrolee and @Mario928. Initial context is in the issue we opened (#11229). This pull request adds: - Generic framework for expanding the languages that `LanguageParser` can handle, using the [tree-sitter](https://github.com/tree-sitter/py-tree-sitter#py-tree-sitter) parsing library and existing language-specific parsers written for it - Support for the following additional languages in `LanguageParser`: - C - C++ - C# - Go - Java (contributed by @Mario928 https://github.com/ThatsJustCheesy/langchain/pull/2) - Kotlin - Lua - Perl - Ruby - Rust - Scala - TypeScript (contributed by @Harrolee https://github.com/ThatsJustCheesy/langchain/pull/1) Here is the [design document](https://docs.google.com/document/d/17dB14cKCWAaiTeSeBtxHpoVPGKrsPye8W0o_WClz2kk) if curious, but no need to read it. ## Issues - Closes #11229 - Closes #10996 - Closes #8405 ## Dependencies `tree_sitter` and `tree_sitter_languages` on PyPI. We have tried to add these as optional dependencies. ## Documentation We have updated the list of supported languages, and also added a section to `source_code.ipynb` detailing how to add support for additional languages using our framework. ## Maintainer - @hwchase17 (previously reviewed https://github.com/langchain-ai/langchain/pull/6486) Thanks!! ## Git commits We will gladly squash any/all of our commits (esp merge commits) if necessary. Let us know if this is desirable, or if you will be squash-merging anyway. 
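
For reference, a usage sketch with one of the newly supported languages (the path, glob, and suffix are illustrative; the exact accepted language identifiers are documented in the updated `source_code.ipynb`):

```python
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser

# Split each top-level function/type in the Go sources into its own Document.
loader = GenericLoader.from_filesystem(
    "./src",
    glob="**/*",
    suffixes=[".go"],
    parser=LanguageParser(language="go", parser_threshold=30),
)
docs = loader.load()
for doc in docs:
    print(doc.metadata.get("content_type"), doc.metadata["source"])
```
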
--------- Co-authored-by: Maaz Hashmi Co-authored-by: LeilaChr <87657694+LeilaChr@users.noreply.github.com> Co-authored-by: Jeremy La Co-authored-by: Megabear137 Co-authored-by: Lee Harrold Co-authored-by: Mario928 <88029051+Mario928@users.noreply.github.com> Co-authored-by: Bagatur Co-authored-by: Harrison Chase --- .../document_loaders/source_code.ipynb | 61 ++++++- .../document_loaders/parsers/language/c.py | 36 ++++ .../document_loaders/parsers/language/cpp.py | 36 ++++ .../parsers/language/csharp.py | 36 ++++ .../document_loaders/parsers/language/go.py | 31 ++++ .../document_loaders/parsers/language/java.py | 32 ++++ .../parsers/language/kotlin.py | 31 ++++ .../parsers/language/language_parser.py | 67 ++++++- .../document_loaders/parsers/language/lua.py | 33 ++++ .../document_loaders/parsers/language/perl.py | 30 ++++ .../document_loaders/parsers/language/ruby.py | 32 ++++ .../document_loaders/parsers/language/rust.py | 34 ++++ .../parsers/language/scala.py | 33 ++++ .../parsers/language/tree_sitter_segmenter.py | 108 +++++++++++ .../parsers/language/typescript.py | 33 ++++ libs/community/poetry.lock | 170 +++++++++++++++++- libs/community/pyproject.toml | 5 +- .../parsers/language/test_c.py | 53 ++++++ .../parsers/language/test_cpp.py | 63 +++++++ .../parsers/language/test_csharp.py | 78 ++++++++ .../parsers/language/test_go.py | 50 ++++++ .../parsers/language/test_java.py | 57 ++++++ .../parsers/language/test_kotlin.py | 60 +++++++ .../parsers/language/test_lua.py | 40 +++++ .../parsers/language/test_perl.py | 44 +++++ .../parsers/language/test_ruby.py | 51 ++++++ .../parsers/language/test_rust.py | 50 ++++++ .../parsers/language/test_scala.py | 56 ++++++ .../parsers/language/test_typescript.py | 67 +++++++ 29 files changed, 1464 insertions(+), 13 deletions(-) create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/c.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/cpp.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/csharp.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/go.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/java.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/kotlin.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/lua.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/perl.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/ruby.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/rust.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/scala.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/tree_sitter_segmenter.py create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/typescript.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_c.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_cpp.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_csharp.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_go.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_java.py 
create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_kotlin.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_lua.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_perl.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_ruby.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_rust.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_scala.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_typescript.py diff --git a/docs/docs/integrations/document_loaders/source_code.ipynb b/docs/docs/integrations/document_loaders/source_code.ipynb index cbf9679ec707e..c6a75afde81bc 100644 --- a/docs/docs/integrations/document_loaders/source_code.ipynb +++ b/docs/docs/integrations/document_loaders/source_code.ipynb @@ -9,7 +9,35 @@ "\n", "This notebook covers how to load source code files using a special approach with language parsing: each top-level function and class in the code is loaded into separate documents. Any remaining code top-level code outside the already loaded functions and classes will be loaded into a separate document.\n", "\n", - "This approach can potentially improve the accuracy of QA models over source code. Currently, the supported languages for code parsing are Python and JavaScript. The language used for parsing can be configured, along with the minimum number of lines required to activate the splitting based on syntax." + "This approach can potentially improve the accuracy of QA models over source code.\n", + "\n", + "The supported languages for code parsing are:\n", + "\n", + "- C (*)\n", + "- C++ (*)\n", + "- C# (*)\n", + "- COBOL\n", + "- Go (*)\n", + "- Java (*)\n", + "- JavaScript (requires package `esprima`)\n", + "- Kotlin (*)\n", + "- Lua (*)\n", + "- Perl (*)\n", + "- Python\n", + "- Ruby (*)\n", + "- Rust (*)\n", + "- Scala (*)\n", + "- TypeScript (*)\n", + "\n", + "Items marked with (*) require the packages `tree_sitter` and `tree_sitter_languages`.\n", + "It is straightforward to add support for additional languages using `tree_sitter`,\n", + "although this currently requires modifying LangChain.\n", + "\n", + "The language used for parsing can be configured, along with the minimum number of\n", + "lines required to activate the splitting based on syntax.\n", + "\n", + "If a language is not explicitly specified, `LanguageParser` will infer one from\n", + "filename extensions, if present." ] }, { @@ -19,7 +47,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade --quiet esprima" + "%pip install -qU esprima esprima tree_sitter tree_sitter_languages" ] }, { @@ -395,6 +423,33 @@ "source": [ "print(\"\\n\\n--8<--\\n\\n\".join([document.page_content for document in result]))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding Languages using Tree-sitter Template\n", + "\n", + "Expanding language support using the Tree-Sitter template involves a few essential steps:\n", + "\n", + "1. 
**Creating a New Language File**:\n", + " - Begin by creating a new file in the designated directory (langchain/libs/community/langchain_community/document_loaders/parsers/language).\n", + " - Model this file based on the structure and parsing logic of existing language files like **`cpp.py`**.\n", + " - You will also need to create a file in the langchain directory (langchain/libs/langchain/langchain/document_loaders/parsers/language).\n", + "2. **Parsing Language Specifics**:\n", + " - Mimic the structure used in the **`cpp.py`** file, adapting it to suit the language you are incorporating.\n", + " - The primary alteration involves adjusting the chunk query array to suit the syntax and structure of the language you are parsing.\n", + "3. **Testing the Language Parser**:\n", + " - For thorough validation, generate a test file specific to the new language. Create **`test_language.py`** in the designated directory(langchain/libs/community/tests/unit_tests/document_loaders/parsers/language).\n", + " - Follow the example set by **`test_cpp.py`** to establish fundamental tests for the parsed elements in the new language.\n", + "4. **Integration into the Parser and Text Splitter**:\n", + " - Incorporate your new language within the **`language_parser.py`** file. Ensure to update LANGUAGE_EXTENSIONS and LANGUAGE_SEGMENTERS along with the docstring for LanguageParser to recognize and handle the added language.\n", + " - Also, confirm that your language is included in **`text_splitter.py`** in class Language for proper parsing.\n", + "\n", + "By following these steps and ensuring comprehensive testing and integration, you'll successfully extend language support using the Tree-Sitter template.\n", + "\n", + "Best of luck!" + ] } ], "metadata": { @@ -413,7 +468,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/libs/community/langchain_community/document_loaders/parsers/language/c.py b/libs/community/langchain_community/document_loaders/parsers/language/c.py new file mode 100644 index 0000000000000..2db1ec99fca4a --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/c.py @@ -0,0 +1,36 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (struct_specifier + body: (field_declaration_list)) @struct + (enum_specifier + body: (enumerator_list)) @enum + (union_specifier + body: (field_declaration_list)) @union + (function_definition) @function + ] +""".strip() + + +class CSegmenter(TreeSitterSegmenter): + """Code segmenter for C.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("c") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/cpp.py b/libs/community/langchain_community/document_loaders/parsers/language/cpp.py new file mode 100644 index 0000000000000..9d09164a846e7 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/cpp.py @@ -0,0 +1,36 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + 
TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (class_specifier + body: (field_declaration_list)) @class + (struct_specifier + body: (field_declaration_list)) @struct + (union_specifier + body: (field_declaration_list)) @union + (function_definition) @function + ] +""".strip() + + +class CPPSegmenter(TreeSitterSegmenter): + """Code segmenter for C++.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("cpp") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/csharp.py b/libs/community/langchain_community/document_loaders/parsers/language/csharp.py new file mode 100644 index 0000000000000..a9f809fa00a84 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/csharp.py @@ -0,0 +1,36 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (namespace_declaration) @namespace + (class_declaration) @class + (method_declaration) @method + (interface_declaration) @interface + (enum_declaration) @enum + (struct_declaration) @struct + (record_declaration) @record + ] +""".strip() + + +class CSharpSegmenter(TreeSitterSegmenter): + """Code segmenter for C#.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("c_sharp") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/go.py b/libs/community/langchain_community/document_loaders/parsers/language/go.py new file mode 100644 index 0000000000000..f836ab3ad710c --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/go.py @@ -0,0 +1,31 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (function_declaration) @function + (type_declaration) @type + ] +""".strip() + + +class GoSegmenter(TreeSitterSegmenter): + """Code segmenter for Go.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("go") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/java.py b/libs/community/langchain_community/document_loaders/parsers/language/java.py new file mode 100644 index 0000000000000..c7293e1ed7f78 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/java.py @@ -0,0 +1,32 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (class_declaration) @class + (interface_declaration) @interface + (enum_declaration) @enum + ] +""".strip() + + +class 
JavaSegmenter(TreeSitterSegmenter): + """Code segmenter for Java.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("java") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/kotlin.py b/libs/community/langchain_community/document_loaders/parsers/language/kotlin.py new file mode 100644 index 0000000000000..6f946f7b4a622 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/kotlin.py @@ -0,0 +1,31 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (function_declaration) @function + (class_declaration) @class + ] +""".strip() + + +class KotlinSegmenter(TreeSitterSegmenter): + """Code segmenter for Kotlin.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("kotlin") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py b/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py index b3f6534327f55..97444d294609a 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py @@ -6,28 +6,66 @@ from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob +from langchain_community.document_loaders.parsers.language.c import CSegmenter from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter +from langchain_community.document_loaders.parsers.language.cpp import CPPSegmenter +from langchain_community.document_loaders.parsers.language.csharp import CSharpSegmenter +from langchain_community.document_loaders.parsers.language.go import GoSegmenter +from langchain_community.document_loaders.parsers.language.java import JavaSegmenter from langchain_community.document_loaders.parsers.language.javascript import ( JavaScriptSegmenter, ) +from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter +from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter +from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter from langchain_community.document_loaders.parsers.language.python import PythonSegmenter +from langchain_community.document_loaders.parsers.language.ruby import RubySegmenter +from langchain_community.document_loaders.parsers.language.rust import RustSegmenter +from langchain_community.document_loaders.parsers.language.scala import ScalaSegmenter +from langchain_community.document_loaders.parsers.language.typescript import ( + TypeScriptSegmenter, +) if TYPE_CHECKING: - from langchain.text_splitter import Language + from langchain.langchain.text_splitter import Language try: - from langchain.text_splitter import Language + from langchain.langchain.text_splitter import Language LANGUAGE_EXTENSIONS: Dict[str, str] = { "py": Language.PYTHON, "js": Language.JS, 
"cobol": Language.COBOL, + "c": Language.C, + "cpp": Language.CPP, + "cs": Language.CSHARP, + "rb": Language.RUBY, + "scala": Language.SCALA, + "rs": Language.RUST, + "go": Language.GO, + "kt": Language.KOTLIN, + "lua": Language.LUA, + "pl": Language.PERL, + "ts": Language.TS, + "java": Language.JAVA, } LANGUAGE_SEGMENTERS: Dict[str, Any] = { Language.PYTHON: PythonSegmenter, Language.JS: JavaScriptSegmenter, Language.COBOL: CobolSegmenter, + Language.C: CSegmenter, + Language.CPP: CPPSegmenter, + Language.CSHARP: CSharpSegmenter, + Language.RUBY: RubySegmenter, + Language.RUST: RustSegmenter, + Language.SCALA: ScalaSegmenter, + Language.GO: GoSegmenter, + Language.KOTLIN: KotlinSegmenter, + Language.LUA: LuaSegmenter, + Language.PERL: PerlSegmenter, + Language.TS: TypeScriptSegmenter, + Language.JAVA: JavaSegmenter, } except ImportError: LANGUAGE_EXTENSIONS = {} @@ -43,11 +81,34 @@ class LanguageParser(BaseBlobParser): This approach can potentially improve the accuracy of QA models over source code. - Currently, the supported languages for code parsing are Python and JavaScript. + The supported languages for code parsing are: + + - C (*) + - C++ (*) + - C# (*) + - COBOL + - Go (*) + - Java (*) + - JavaScript (requires package `esprima`) + - Kotlin (*) + - Lua (*) + - Perl (*) + - Python + - Ruby (*) + - Rust (*) + - Scala (*) + - TypeScript (*) + + Items marked with (*) require the packages `tree_sitter` and + `tree_sitter_languages`. It is straightforward to add support for additional + languages using `tree_sitter`, although this currently requires modifying LangChain. The language used for parsing can be configured, along with the minimum number of lines required to activate the splitting based on syntax. + If a language is not explicitly specified, `LanguageParser` will infer one from + filename extensions, if present. + Examples: .. 
code-block:: python diff --git a/libs/community/langchain_community/document_loaders/parsers/language/lua.py b/libs/community/langchain_community/document_loaders/parsers/language/lua.py new file mode 100644 index 0000000000000..3e0a762ba4b5f --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/lua.py @@ -0,0 +1,33 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (function_definition_statement + name: (identifier)) @function + (local_function_definition_statement + name: (identifier)) @function + ] +""".strip() + + +class LuaSegmenter(TreeSitterSegmenter): + """Code segmenter for Lua.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("lua") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"-- {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/perl.py b/libs/community/langchain_community/document_loaders/parsers/language/perl.py new file mode 100644 index 0000000000000..b68d52cef2b04 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/perl.py @@ -0,0 +1,30 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (function_definition) @subroutine + ] +""".strip() + + +class PerlSegmenter(TreeSitterSegmenter): + """Code segmenter for Perl.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("perl") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"# {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/ruby.py b/libs/community/langchain_community/document_loaders/parsers/language/ruby.py new file mode 100644 index 0000000000000..767a1f94a4d37 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/ruby.py @@ -0,0 +1,32 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (method) @method + (module) @module + (class) @class + ] +""".strip() + + +class RubySegmenter(TreeSitterSegmenter): + """Code segmenter for Ruby.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("ruby") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"# {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/rust.py b/libs/community/langchain_community/document_loaders/parsers/language/rust.py new file mode 100644 index 0000000000000..bb73f96bf6d7c --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/rust.py @@ -0,0 +1,34 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + 
TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (function_item + name: (identifier) + body: (block)) @function + (struct_item) @struct + (trait_item) @trait + ] +""".strip() + + +class RustSegmenter(TreeSitterSegmenter): + """Code segmenter for Rust.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("rust") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/scala.py b/libs/community/langchain_community/document_loaders/parsers/language/scala.py new file mode 100644 index 0000000000000..af62a4e748fed --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/scala.py @@ -0,0 +1,33 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (class_definition) @class + (function_definition) @function + (object_definition) @object + (trait_definition) @trait + ] +""".strip() + + +class ScalaSegmenter(TreeSitterSegmenter): + """Code segmenter for Scala.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("scala") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/langchain_community/document_loaders/parsers/language/tree_sitter_segmenter.py b/libs/community/langchain_community/document_loaders/parsers/language/tree_sitter_segmenter.py new file mode 100644 index 0000000000000..7187cd6b5f528 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/tree_sitter_segmenter.py @@ -0,0 +1,108 @@ +from abc import abstractmethod +from typing import TYPE_CHECKING, List + +from langchain_community.document_loaders.parsers.language.code_segmenter import ( + CodeSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language, Parser + + +class TreeSitterSegmenter(CodeSegmenter): + """Abstract class for `CodeSegmenter`s that use the tree-sitter library.""" + + def __init__(self, code: str): + super().__init__(code) + self.source_lines = self.code.splitlines() + + try: + import tree_sitter # noqa: F401 + import tree_sitter_languages # noqa: F401 + except ImportError: + raise ImportError( + "Could not import tree_sitter/tree_sitter_languages Python packages. " + "Please install them with " + "`pip install tree-sitter tree-sitter-languages`." 
+ ) + + def is_valid(self) -> bool: + language = self.get_language() + error_query = language.query("(ERROR) @error") + + parser = self.get_parser() + tree = parser.parse(bytes(self.code, encoding="UTF-8")) + + return len(error_query.captures(tree.root_node)) == 0 + + def extract_functions_classes(self) -> List[str]: + language = self.get_language() + query = language.query(self.get_chunk_query()) + + parser = self.get_parser() + tree = parser.parse(bytes(self.code, encoding="UTF-8")) + captures = query.captures(tree.root_node) + + processed_lines = set() + chunks = [] + + for node, name in captures: + start_line = node.start_point[0] + end_line = node.end_point[0] + lines = list(range(start_line, end_line + 1)) + + if any(line in processed_lines for line in lines): + continue + + processed_lines.update(lines) + chunk_text = node.text.decode("UTF-8") + chunks.append(chunk_text) + + return chunks + + def simplify_code(self) -> str: + language = self.get_language() + query = language.query(self.get_chunk_query()) + + parser = self.get_parser() + tree = parser.parse(bytes(self.code, encoding="UTF-8")) + processed_lines = set() + + simplified_lines = self.source_lines[:] + for node, name in query.captures(tree.root_node): + start_line = node.start_point[0] + end_line = node.end_point[0] + + lines = list(range(start_line, end_line + 1)) + if any(line in processed_lines for line in lines): + continue + + simplified_lines[start_line] = self.make_line_comment( + f"Code for: {self.source_lines[start_line]}" + ) + + for line_num in range(start_line + 1, end_line + 1): + simplified_lines[line_num] = None # type: ignore + + processed_lines.update(lines) + + return "\n".join(line for line in simplified_lines if line is not None) + + def get_parser(self) -> "Parser": + from tree_sitter import Parser + + parser = Parser() + parser.set_language(self.get_language()) + return parser + + @abstractmethod + def get_language(self) -> "Language": + raise NotImplementedError() # pragma: no cover + + @abstractmethod + def get_chunk_query(self) -> str: + raise NotImplementedError() # pragma: no cover + + @abstractmethod + def make_line_comment(self, text: str) -> str: + raise NotImplementedError() # pragma: no cover diff --git a/libs/community/langchain_community/document_loaders/parsers/language/typescript.py b/libs/community/langchain_community/document_loaders/parsers/language/typescript.py new file mode 100644 index 0000000000000..ab7158e2e8210 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/typescript.py @@ -0,0 +1,33 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (function_declaration) @function + (class_declaration) @class + (interface_declaration) @interface + (enum_declaration) @enum + ] +""".strip() + + +class TypeScriptSegmenter(TreeSitterSegmenter): + """Code segmenter for TypeScript.""" + + def get_language(self) -> "Language": + from tree_sitter_languages import get_language + + return get_language("typescript") + + def get_chunk_query(self) -> str: + return CHUNK_QUERY + + def make_line_comment(self, text: str) -> str: + return f"// {text}" diff --git a/libs/community/poetry.lock b/libs/community/poetry.lock index c5ea8eae37ae4..b4b4c21e8fd6d 100644 --- a/libs/community/poetry.lock +++ b/libs/community/poetry.lock @@ -1,4 +1,4 @@ -# This file is 
automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "aenum" @@ -3140,6 +3140,7 @@ files = [ {file = "jq-1.6.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:227b178b22a7f91ae88525810441791b1ca1fc71c86f03190911793be15cec3d"}, {file = "jq-1.6.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:780eb6383fbae12afa819ef676fc93e1548ae4b076c004a393af26a04b460742"}, {file = "jq-1.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:08ded6467f4ef89fec35b2bf310f210f8cd13fbd9d80e521500889edf8d22441"}, + {file = "jq-1.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49e44ed677713f4115bd5bf2dbae23baa4cd503be350e12a1c1f506b0687848f"}, {file = "jq-1.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:984f33862af285ad3e41e23179ac4795f1701822473e1a26bf87ff023e5a89ea"}, {file = "jq-1.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f42264fafc6166efb5611b5d4cb01058887d050a6c19334f6a3f8a13bb369df5"}, {file = "jq-1.6.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a67154f150aaf76cc1294032ed588436eb002097dd4fd1e283824bf753a05080"}, @@ -3650,7 +3651,7 @@ files = [ [[package]] name = "langchain-core" -version = "0.1.21" +version = "0.1.22" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" @@ -5968,7 +5969,6 @@ files = [ {file = "pymongo-4.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b8729dbf25eb32ad0dc0b9bd5e6a0d0b7e5c2dc8ec06ad171088e1896b522a74"}, {file = "pymongo-4.6.1-cp312-cp312-win32.whl", hash = "sha256:3177f783ae7e08aaf7b2802e0df4e4b13903520e8380915e6337cdc7a6ff01d8"}, {file = "pymongo-4.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:00c199e1c593e2c8b033136d7a08f0c376452bac8a896c923fcd6f419e07bdd2"}, - {file = "pymongo-4.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6dcc95f4bb9ed793714b43f4f23a7b0c57e4ef47414162297d6f650213512c19"}, {file = "pymongo-4.6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:13552ca505366df74e3e2f0a4f27c363928f3dff0eef9f281eb81af7f29bc3c5"}, {file = "pymongo-4.6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:77e0df59b1a4994ad30c6d746992ae887f9756a43fc25dec2db515d94cf0222d"}, {file = "pymongo-4.6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3a7f02a58a0c2912734105e05dedbee4f7507e6f1bd132ebad520be0b11d46fd"}, @@ -6508,7 +6508,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", 
hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -8123,6 +8122,165 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<7.5)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "tree-sitter" +version = "0.20.4" +description = "Python bindings for the Tree-Sitter parsing library" +optional = true +python-versions = ">=3.3" +files = [ + {file = "tree_sitter-0.20.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c259b9bcb596e54f54713eb3951226fc834d65289940f4bfdcdf519f08e8e876"}, + {file = "tree_sitter-0.20.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:88da7e2e4c69881cd63916cc24ae0b809f96aae331da45b418ae6b2d1ed2ca19"}, + {file = "tree_sitter-0.20.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66a68b156ba131e9d8dff4a1f72037f4b368cc50c58f18905a91743ae1d1c795"}, + {file = "tree_sitter-0.20.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae28e25d551f406807011487bdfb9728041e656b30b554fa7f3391ab64ed69f9"}, + {file = "tree_sitter-0.20.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36b10c9c69e825ba65cf9b0f77668bf33e70d2a5764b64ad6f133f8cc9220f09"}, + {file = "tree_sitter-0.20.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7c18c64ddd44b75b7e1660b9793753eda427e4b145b6216d4b2d2e9b200c74f2"}, + {file = "tree_sitter-0.20.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e9e9e594bbefb76ad9ea256f5c87eba7591b4758854d3df83ce4df415933a006"}, + {file = "tree_sitter-0.20.4-cp310-cp310-win32.whl", hash = "sha256:b4755229dc18644fe48bcab974bde09b171fcb6ef625d3cb5ece5c6198f4223e"}, + {file = "tree_sitter-0.20.4-cp310-cp310-win_amd64.whl", hash = "sha256:f792684cee8a46d9194d9f4223810e54ccc704470c5777538d59fbde0a4c91bf"}, + {file = "tree_sitter-0.20.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9d22ee75f45836554ee6a11e50dd8f9827941e67c49fce9a0790245b899811a9"}, + {file = "tree_sitter-0.20.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2a0ffd76dd991ba745bb5d0ba1d583bec85726d3ddef8c9685dc8636a619adde"}, + {file = "tree_sitter-0.20.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:060d4e5b803be0975f1ac46e54a292eab0701296ccd912f6cdac3f7331e29143"}, + {file = "tree_sitter-0.20.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:822e02366dbf223697b2b56b8f91aa5b60571f9fe7c998988a381db1c69604e9"}, + {file = "tree_sitter-0.20.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:527ca72c6a8f60fa719af37fa86f58b7ad0e07b8f74d1c1c7e926c5c888a7e6b"}, + {file = "tree_sitter-0.20.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a418ca71309ea7052e076f08d623f33f58eae01a8e8cdc1e6d3a01b5b8ddebfe"}, + {file = "tree_sitter-0.20.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:08c3ba2561b61a83c28ca06a0bce2a5ffcfb6b39f9d27a45e5ebd9cad2bedb7f"}, + {file = "tree_sitter-0.20.4-cp311-cp311-win32.whl", hash = "sha256:8d04c75a389b2de94952d602264852acff8cd3ed1ccf8a2492a080973d5ddd58"}, + {file = "tree_sitter-0.20.4-cp311-cp311-win_amd64.whl", hash = "sha256:ba9215c0e7529d9eb370528e5d99b7389d14a7eae94f07d14fa9dab18f267c62"}, + {file = "tree_sitter-0.20.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c4c1af5ed4306071d30970c83ec882520a7bf5d8053996dbc4aa5c59238d4990"}, + {file = "tree_sitter-0.20.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9d70bfa550cf22c9cea9b3c0d18b889fc4f2a7e9dcf1d6cc93f49fa9d4a94954"}, 
+ {file = "tree_sitter-0.20.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6de537bca0641775d8d175d37303d54998980fc0d997dd9aa89e16b415bf0cc3"}, + {file = "tree_sitter-0.20.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b1c0f8c0e3e50267566f5116cdceedf4e23e8c08b55ef3becbe954a11b16e84"}, + {file = "tree_sitter-0.20.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ef2ee6d9bb8e21713949e5ff769ed670fe1217f95b7eeb6c675788438c1e6e"}, + {file = "tree_sitter-0.20.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b6fd1c881ab0de5faa67168db2d001eee32be5482cb4e0b21b217689a05b6fe4"}, + {file = "tree_sitter-0.20.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bf47047420021d50aec529cb66387c90562350b499ddf56ecef1fc8255439e30"}, + {file = "tree_sitter-0.20.4-cp312-cp312-win32.whl", hash = "sha256:c16b48378041fc9702b6aa3480f2ffa49ca8ea58141a862acd569e5a0679655f"}, + {file = "tree_sitter-0.20.4-cp312-cp312-win_amd64.whl", hash = "sha256:973e871167079a1b1d7304d361449253efbe2a6974728ad563cf407bd02ddccb"}, + {file = "tree_sitter-0.20.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9d33a55598dd18a4d8b869a3417de82a4812c3a7dc7e61cb025ece3e9c3e4e96"}, + {file = "tree_sitter-0.20.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cee6955c2c97fc5927a41c7a8b06647c4b4d9b99b8a1581bf1183435c8cec3e"}, + {file = "tree_sitter-0.20.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5022bea67e479ad212be7c05b983a72e297a013efb4e8ea5b5b4d7da79a9fdef"}, + {file = "tree_sitter-0.20.4-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:640f60a5b966f0990338f1bf559455c3dcb822bc4329d82b3d42f32a48374dfe"}, + {file = "tree_sitter-0.20.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:0e83f641fe6f27d91bd4d259fff5d35de1567d3f581b9efe9bbd5be50fe4ddc7"}, + {file = "tree_sitter-0.20.4-cp36-cp36m-win32.whl", hash = "sha256:ce6a85027c66fa3f09d482cc6d41927ea40955f7f33b86aedd26dd932709a2c9"}, + {file = "tree_sitter-0.20.4-cp36-cp36m-win_amd64.whl", hash = "sha256:fe10779347a6c067af29cb37fd4b75fa96c5cb68f587cc9530b70fe3f2a51a55"}, + {file = "tree_sitter-0.20.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28d5f84e34e276887e3a240b60906ca7e2b51e975f3145c3149ceed977a69508"}, + {file = "tree_sitter-0.20.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c913b65cbe10996116988ac436748f24883b5097e58274223e89bb2c5d1bb1a"}, + {file = "tree_sitter-0.20.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ecaed46241e071752195a628bb97d2b740f2fde9e34f8a74456a4ea8bb26df88"}, + {file = "tree_sitter-0.20.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b641e88a97eab002a1736d93ef5a4beac90ea4fd6e25affd1831319b99f456c9"}, + {file = "tree_sitter-0.20.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:327c40f439c6155e4eee54c4657e4701a04f5f4816d9defdcb836bf65bf83d21"}, + {file = "tree_sitter-0.20.4-cp37-cp37m-win32.whl", hash = "sha256:1b7c1d95f006b3de42fbf4045bd00c273d113e372fcb6a5378e74ed120c12032"}, + {file = "tree_sitter-0.20.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6140d037239a41046f5d34fba5e0374ee697adb4b48b90579c618b5402781c11"}, + {file = "tree_sitter-0.20.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f42fd1104efaad8151370f1936e2a488b7337a5d24544a9ab59ba4c4010b1272"}, + {file = "tree_sitter-0.20.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7859717c5d62ee386b3d036cab8ed0f88f8c027b6b4ae476a55a8c5fb8aab713"}, + {file = 
"tree_sitter-0.20.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fdd361fe1cc68db68b4d85165641275e34b86cc26b2bab932790204fa14824dc"}, + {file = "tree_sitter-0.20.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b8d7539075606027b67764543463ff2bc4e52f4158ef6dc419c9f5625aa5383"}, + {file = "tree_sitter-0.20.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78e76307f05aca6cde72f3307b4d53701f34ae45f2248ceb83d1626051e201fd"}, + {file = "tree_sitter-0.20.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dd8c352f4577f61098d06cf3feb7fd214259f41b5036b81003860ed54d16b448"}, + {file = "tree_sitter-0.20.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:281f3e5382d1bd7fccc88d1afe68c915565bc24f8b8dd4844079d46c7815b8a7"}, + {file = "tree_sitter-0.20.4-cp38-cp38-win32.whl", hash = "sha256:6a77ac3cdcddd80cdd1fd394318bff99f94f37e08d235aaefccb87e1224946e5"}, + {file = "tree_sitter-0.20.4-cp38-cp38-win_amd64.whl", hash = "sha256:8eee8adf54033dc48eab84b040f4d7b32355a964c4ae0aae5dfbdc4dbc3364ca"}, + {file = "tree_sitter-0.20.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e89f6508e30fce05e2c724725d022db30d877817b9d64f933506ffb3a3f4a2c2"}, + {file = "tree_sitter-0.20.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7fb6286bb1fae663c45ff0700ec88fb9b50a81eed2bae8a291f95fcf8cc19547"}, + {file = "tree_sitter-0.20.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11e93f8b4bbae04070416a82257a7ab2eb0afb76e093ae3ea73bd63b792f6846"}, + {file = "tree_sitter-0.20.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8250725c5f78929aeb2c71db5dca76f1ef448389ca16f9439161f90978bb8478"}, + {file = "tree_sitter-0.20.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d404a8ca9de9b0843844f0cd4d423f46bc46375ab8afb63b1d8ec01201457ac8"}, + {file = "tree_sitter-0.20.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0f2422c9ee70ba972dfc3943746e6cf7fc03725a866908950245bda9ccfc7301"}, + {file = "tree_sitter-0.20.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:21a937942e4729abbe778a609d2c218574436cb351c36fba89ef3c8c6066ec78"}, + {file = "tree_sitter-0.20.4-cp39-cp39-win32.whl", hash = "sha256:427a9a39360cc1816e28f8182550e478e4ba983595a2565ab9dfe32ea1b03fd7"}, + {file = "tree_sitter-0.20.4-cp39-cp39-win_amd64.whl", hash = "sha256:7095bb9aff297fa9c6026bf8914fd295997d714d1a6ee9a1edf7282c772f9f64"}, + {file = "tree_sitter-0.20.4-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:859260b90f0e3867ae840e39f54e830f607b3bc531bc21deeeeaa8a30cbb89ad"}, + {file = "tree_sitter-0.20.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0dfc14be73cf46126660a3aecdd0396e69562ad1a902245225ca7bd29649594e"}, + {file = "tree_sitter-0.20.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec46355bf3ff23f54d5e365871ffd3e05cfbc65d1b36a8be7c0bcbda30a1d43"}, + {file = "tree_sitter-0.20.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d933a942fde39876b99c36f12aa3764e4a555ae9366c10ce6cca8c16341c1bbf"}, + {file = "tree_sitter-0.20.4-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a7eec3b55135fe851a38fa248c9fd75fc3d58ceb6e1865b795e416e4d598c2a1"}, + {file = "tree_sitter-0.20.4-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc76225529ee14a53e84413480ce81ec3c44eaa0455c140e961c90ac3118ead"}, + {file = "tree_sitter-0.20.4-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ccf0396e47efffc0b528959a8f2e2346a98297579f867e9e1834c2aad4be829c"}, + {file = "tree_sitter-0.20.4-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:a15fbabd3bc8e29c48289c156d743e69f5ec72bb125cf44f7adbdaa1937c3da6"}, + {file = "tree_sitter-0.20.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:36f8adf2126f496cf376b6e4b707cba061c25beb17841727eef6f0e083e53e1f"}, + {file = "tree_sitter-0.20.4-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:841efb40c116ab0a066924925409a8a4dcffeb39a151c0b2a1c2abe56ad4fb42"}, + {file = "tree_sitter-0.20.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2051e8a70fd8426f27a43dad71d11929a62ce30a9b1eb65bba0ed79e82481592"}, + {file = "tree_sitter-0.20.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:99a3c2824d4cfcffd9f961176891426bde2cb36ece5280c61480be93319c23c4"}, + {file = "tree_sitter-0.20.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:72830dc85a10430eca3d56739b7efcd7a05459c8d425f08c1aee6179ab7f13a9"}, + {file = "tree_sitter-0.20.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4992dd226055b6cd0a4f5661c66b799a73d3eff716302e0f7ab06594ee12d49f"}, + {file = "tree_sitter-0.20.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a66d95bbf92175cdc295d6d77f330942811f02e3aaf3fc64431cb749683b2f7d"}, + {file = "tree_sitter-0.20.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a25b1087e4f7825b2458dacf5f4b0be2938f78e850e822edca1ff4994b56081a"}, + {file = "tree_sitter-0.20.4.tar.gz", hash = "sha256:6adb123e2f3e56399bbf2359924633c882cc40ee8344885200bca0922f713be5"}, +] + +[package.dependencies] +setuptools = {version = ">=60.0.0", markers = "python_version >= \"3.12\""} + +[[package]] +name = "tree-sitter-languages" +version = "1.10.2" +description = "Binary Python wheels for all tree sitter languages." 
+optional = true +python-versions = "*" +files = [ + {file = "tree_sitter_languages-1.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5580348f0b20233b1d5431fa178ccd3d07423ca4a3275df02a44608fd72344b9"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:103c7466644486b1e9e03850df46fc6aa12f13ca636c74f173270276220ac80b"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d13db84511c6f1a7dc40383b66deafa74dabd8b877e3d65ab253f3719eccafd6"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57adfa32be7e465b54aa72f915f6c78a2b66b227df4f656b5d4fbd1ca7a92b3f"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c6385e033e460ceb8f33f3f940335f422ef2b763700a04f0089391a68b56153"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:dfa3f38cc5381c5aba01dd7494f59b8a9050e82ff6e06e1233e3a0cbae297e3c"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9f195155acf47f8bc5de7cee46ecd07b2f5697f007ba89435b51ef4c0b953ea5"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2de330e2ac6d7426ca025a3ec0f10d5640c3682c1d0c7702e812dcfb44b58120"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-win32.whl", hash = "sha256:c9731cf745f135d9770eeba9bb4e2ff4dabc107b5ae9b8211e919f6b9100ea6d"}, + {file = "tree_sitter_languages-1.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:6dd75851c41d0c3c4987a9b7692d90fa8848706c23115669d8224ffd6571e357"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7eb7d7542b2091c875fe52719209631fca36f8c10fa66970d2c576ae6a1b8289"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6b41bcb00974b1c8a1800c7f1bb476a1d15a0463e760ee24872f2d53b08ee424"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f370cd7845c6c81df05680d5bd96db8a99d32b56f4728c5d05978911130a853"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a1dc195c88ef4c72607e112a809a69190e096a2e5ebc6201548b3e05fdd169ad"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ae34ac314a7170be24998a0f994c1ac80761d8d4bd126af27ee53a023d3b849"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:01b5742d5f5bd675489486b582bd482215880b26dde042c067f8265a6e925d9c"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ab1cbc46244d34fd16f21edaa20231b2a57f09f092a06ee3d469f3117e6eb954"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0b1149e7467a4e92b8a70e6005fe762f880f493cf811fc003554b29f04f5e7c8"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-win32.whl", hash = "sha256:049276343962f4696390ee555acc2c1a65873270c66a6cbe5cb0bca83bcdf3c6"}, + {file = "tree_sitter_languages-1.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:7f3fdd468a577f04db3b63454d939e26e360229b53c80361920aa1ebf2cd7491"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c0f4c8b2734c45859edc7fcaaeaab97a074114111b5ba51ab4ec7ed52104763c"}, + {file = 
"tree_sitter_languages-1.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:eecd3c1244ac3425b7a82ba9125b4ddb45d953bbe61de114c0334fd89b7fe782"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15db3c8510bc39a80147ee7421bf4782c15c09581c1dc2237ea89cefbd95b846"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92c6487a6feea683154d3e06e6db68c30e0ae749a7ce4ce90b9e4e46b78c85c7"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2f1cd1d1bdd65332f9c2b67d49dcf148cf1ded752851d159ac3e5ee4f4d260"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:976c8039165b8e12f17a01ddee9f4e23ec6e352b165ad29b44d2bf04e2fbe77e"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:dafbbdf16bf668a580902e1620f4baa1913e79438abcce721a50647564c687b9"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1aeabd3d60d6d276b73cd8f3739d595b1299d123cc079a317f1a5b3c5461e2ca"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-win32.whl", hash = "sha256:fab8ee641914098e8933b87ea3d657bea4dd00723c1ee7038b847b12eeeef4f5"}, + {file = "tree_sitter_languages-1.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:5e606430d736367e5787fa5a7a0c5a1ec9b85eded0b3596bbc0d83532a40810b"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:838d5b48a7ed7a17658721952c77fda4570d2a069f933502653b17e15a9c39c9"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:987b3c71b1d278c2889e018ee77b8ee05c384e2e3334dec798f8b611c4ab2d1e"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:faa00abcb2c819027df58472da055d22fa7dfcb77c77413d8500c32ebe24d38b"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e102fbbf02322d9201a86a814e79a9734ac80679fdb9682144479044f401a73"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8f0b87cf1a7b03174ba18dfd81582be82bfed26803aebfe222bd20e444aba003"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c0f1b9af9cb67f0b942b020da9fdd000aad5e92f2383ae0ba7a330b318d31912"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5a4076c921f7a4d31e643843de7dfe040b65b63a238a5aa8d31d93aabe6572aa"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-win32.whl", hash = "sha256:fa6391a3a5d83d32db80815161237b67d70576f090ce5f38339206e917a6f8bd"}, + {file = "tree_sitter_languages-1.10.2-cp37-cp37m-win_amd64.whl", hash = "sha256:55649d3f254585a064121513627cf9788c1cfdadbc5f097f33d5ba750685a4c0"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6f85d1edaa2d22d80d4ea5b6d12b95cf3644017b6c227d0d42854439e02e8893"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d78feed4a764ef3141cb54bf00fe94d514d8b6e26e09423e23b4c616fcb7938c"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1aca27531f9dd5308637d76643372856f0f65d0d28677d1bcf4211e8ed1ad0"}, + {file = 
"tree_sitter_languages-1.10.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1031ea440dafb72237437d754eff8940153a3b051e3d18932ac25e75ce060a15"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99d3249beaef2c9fe558ecc9a97853c260433a849dcc68266d9770d196c2e102"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:59a4450f262a55148fb7e68681522f0c2a2f6b7d89666312a2b32708d8f416e1"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ce74eab0e430370d5e15a96b6c6205f93405c177a8b2e71e1526643b2fb9bab1"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9b4dd2b6b3d24c85dffe33d6c343448869eaf4f41c19ddba662eb5d65d8808f4"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-win32.whl", hash = "sha256:92d734fb968fe3927a7596d9f0459f81a8fa7b07e16569476b28e27d0d753348"}, + {file = "tree_sitter_languages-1.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:46a13f7d38f2eeb75f7cf127d1201346093748c270d686131f0cbc50e42870a1"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f8c6a936ae99fdd8857e91f86c11c2f5e507ff30631d141d98132bb7ab2c8638"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c283a61423f49cdfa7b5a5dfbb39221e3bd126fca33479cd80749d4d7a6b7349"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76e60be6bdcff923386a54a5edcb6ff33fc38ab0118636a762024fa2bc98de55"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c00069f9575bd831eabcce2cdfab158dde1ed151e7e5614c2d985ff7d78a7de1"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:475ff53203d8a43ccb19bb322fa2fb200d764001cc037793f1fadd714bb343da"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26fe7c9c412e4141dea87ea4b3592fd12e385465b5bdab106b0d5125754d4f60"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8fed27319957458340f24fe14daad467cd45021da034eef583519f83113a8c5e"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3657a491a7f96cc75a3568ddd062d25f3be82b6a942c68801a7b226ff7130181"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-win32.whl", hash = "sha256:33f7d584d01a7a3c893072f34cfc64ec031f3cfe57eebc32da2f8ac046e101a7"}, + {file = "tree_sitter_languages-1.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:1b944af3ee729fa70fc8ae82224a9ff597cdb63addea084e0ea2fa2b0ec39bb7"}, +] + +[package.dependencies] +tree-sitter = "*" + [[package]] name = "typer" version = "0.9.0" @@ -8998,9 +9156,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] cli = ["typer"] -extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", 
"newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict", "zhipuai"] +extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "tree-sitter", "tree-sitter-languages", "upstash-redis", "xata", "xmltodict", "zhipuai"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "fe633ab7d246239420a26bebf8bcf857edcf0778f75b3eb2a4b1314cb13645c8" +content-hash = "e98000541a4991b1d41c9e995a4153ca24745e880afe75af6516574e3fb8b4a2" diff --git a/libs/community/pyproject.toml b/libs/community/pyproject.toml index ac6b9a3a9940d..6a8d8a0454abd 100644 --- a/libs/community/pyproject.toml +++ b/libs/community/pyproject.toml @@ -83,6 +83,8 @@ msal = {version = "^1.25.0", optional = true} databricks-vectorsearch = {version = "^0.21", optional = true} dgml-utils = {version = "^0.3.0", optional = true} datasets = {version = "^2.15.0", optional = true} +tree-sitter = {version = "^0.20.2", optional = true} +tree-sitter-languages = {version = "^1.8.0", optional = true} azure-ai-documentintelligence = {version = "^1.0.0b1", optional = true} oracle-ads = {version = "^2.9.1", optional = true} zhipuai = {version = "^1.0.7", optional = true} @@ -177,7 +179,6 @@ setuptools = "^67.6.1" langchain-core = {path = "../core", develop = true} [tool.poetry.extras] - cli = ["typer"] # An extra used to be able to add extended testing. 
@@ -249,6 +250,8 @@ extended_testing = [ "databricks-vectorsearch", "dgml-utils", "cohere", + "tree-sitter", + "tree-sitter-languages", "azure-ai-documentintelligence", "oracle-ads", "zhipuai", diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_c.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_c.py new file mode 100644 index 0000000000000..aa6cb58688941 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_c.py @@ -0,0 +1,53 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.c import CSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestCSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """int main() { + return 0; +} + +struct S { +}; + +union U { +}; + +enum Evens { + Two = 2, + Four = 4 +};""" + + self.expected_simplified_code = """// Code for: int main() { + +// Code for: struct S { + +// Code for: union U { + +// Code for: enum Evens {""" + + self.expected_extracted_code = [ + "int main() {\n return 0;\n}", + "struct S {\n}", + "union U {\n}", + "enum Evens {\n Two = 2,\n Four = 4\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(CSegmenter("int a;").is_valid()) + self.assertFalse(CSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = CSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = CSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_cpp.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_cpp.py new file mode 100644 index 0000000000000..a06b7fa7aff9d --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_cpp.py @@ -0,0 +1,63 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.cpp import CPPSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestCPPSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """int foo() { + return 1; +} + +class T { + auto bar() const -> int; + template + void baz(U) { + } +}; + +struct S { +}; + +union U { +}; + +auto T::bar() const -> int { + return 1; +}""" + + self.expected_simplified_code = """// Code for: int foo() { + +// Code for: class T { + +// Code for: struct S { + +// Code for: union U { + +// Code for: auto T::bar() const -> int {""" + + self.expected_extracted_code = [ + "int foo() {\n return 1;\n}", + "class T {\n auto bar() const -> int;\n " + "template\n void baz(U) {\n }\n}", + "struct S {\n}", + "union U {\n}", + "auto T::bar() const -> int {\n return 1;\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(CPPSegmenter("int a;").is_valid()) + self.assertFalse(CPPSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = CPPSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = CPPSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + 
self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_csharp.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_csharp.py new file mode 100644 index 0000000000000..3f04713f4ce45 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_csharp.py @@ -0,0 +1,78 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.csharp import CSharpSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestCSharpSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """namespace World +{ +} + +class Hello +{ + static void Main(string []args) + { + System.Console.WriteLine("Hello, world."); + } +} + +interface Human +{ + void breathe(); +} + +enum Tens +{ + Ten = 10, + Twenty = 20 +} + +struct T +{ +} + +record Person(string FirstName, string LastName, string Id) +{ + internal string Id { get; init; } = Id; +}""" + + self.expected_simplified_code = """// Code for: namespace World + +// Code for: class Hello + +// Code for: interface Human + +// Code for: enum Tens + +// Code for: struct T + +// Code for: record Person(string FirstName, string LastName, string Id)""" + + self.expected_extracted_code = [ + "namespace World\n{\n}", + "class Hello\n{\n static void Main(string []args)\n {\n " + 'System.Console.WriteLine("Hello, world.");\n }\n}', + "interface Human\n{\n void breathe();\n}", + "enum Tens\n{\n Ten = 10,\n Twenty = 20\n}", + "struct T\n{\n}", + "record Person(string FirstName, string LastName, string Id)\n{\n " + "internal string Id { get; init; } = Id;\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(CSharpSegmenter("int a;").is_valid()) + self.assertFalse(CSharpSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = CSharpSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = CSharpSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_go.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_go.py new file mode 100644 index 0000000000000..e1360e35d9793 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_go.py @@ -0,0 +1,50 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.go import GoSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestGoSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """func foo(a int) int { + return a; +} + +type T struct { + a int + b bool + c string +} + +type S interface { + bar() float64 +} +""" + + self.expected_simplified_code = """// Code for: func foo(a int) int { + +// Code for: type T struct { + +// Code for: type S interface {""" + + self.expected_extracted_code = [ + "func foo(a int) int {\n return a;\n}", + "type T struct {\n a int\n b bool\n c string\n}", + "type S interface {\n bar() float64\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(GoSegmenter("var a int;").is_valid()) + self.assertFalse(GoSegmenter("a b c 1 2 3").is_valid()) + + def 
test_extract_functions_classes(self) -> None: + segmenter = GoSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = GoSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_java.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_java.py new file mode 100644 index 0000000000000..1129ae1a889fe --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_java.py @@ -0,0 +1,57 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.java import JavaSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestJavaSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """class Hello +{ + public static void main(String[] args) + { + System.out.println("Hello, world."); + } +} + +interface Human +{ + void breathe(); +} + +enum Tens +{ + TEN, + TWENTY +} +""" + + self.expected_simplified_code = """// Code for: class Hello + +// Code for: interface Human + +// Code for: enum Tens""" + + self.expected_extracted_code = [ + "class Hello\n{\n " + "public static void main(String[] args)\n {\n " + 'System.out.println("Hello, world.");\n }\n}', + "interface Human\n{\n void breathe();\n}", + "enum Tens\n{\n TEN,\n TWENTY\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(JavaSegmenter("int a;").is_valid()) + self.assertFalse(JavaSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = JavaSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = JavaSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_kotlin.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_kotlin.py new file mode 100644 index 0000000000000..35bde9feb26e7 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_kotlin.py @@ -0,0 +1,60 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestKotlinSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """fun foo(a: Int): Int { + return a +} + +class T { + var a: Int = 0 + var b: Boolean = false + var c: String = "" +} + +interface S { + fun bar(): Double +} + +enum class P { + A, + B, + C +} +""" + + self.expected_simplified_code = """// Code for: fun foo(a: Int): Int { + +// Code for: class T { + +// Code for: interface S { + +// Code for: enum class P {""" + + self.expected_extracted_code = [ + "fun foo(a: Int): Int {\n return a\n}", + "class T {\n var a: Int = 0\n var b: Boolean = false\n " + 'var c: String = ""\n}', + "interface S {\n fun bar(): Double\n}", + "enum class P {\n A,\n B,\n C\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(KotlinSegmenter("val a: Int = 5").is_valid()) + 
self.assertFalse(KotlinSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = KotlinSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = KotlinSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_lua.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_lua.py new file mode 100644 index 0000000000000..56df8f3310a66 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_lua.py @@ -0,0 +1,40 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestLuaSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """function F() + print("Hello") +end + +local function G() + print("Goodbye") +end""" + + self.expected_simplified_code = """-- Code for: function F() + +-- Code for: local function G()""" + + self.expected_extracted_code = [ + 'function F()\n print("Hello")\nend', + 'local function G()\n print("Goodbye")\nend', + ] + + def test_is_valid(self) -> None: + self.assertTrue(LuaSegmenter("local a").is_valid()) + self.assertFalse(LuaSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = LuaSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = LuaSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_perl.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_perl.py new file mode 100644 index 0000000000000..78e3fa25a0003 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_perl.py @@ -0,0 +1,44 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestPerlSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """sub Hello { + print "Hello, World!"; +} + +sub new { + my $class = shift; + my $self = {}; + bless $self, $class; + return $self; +}""" + + self.expected_simplified_code = """# Code for: sub Hello { + +# Code for: sub new {""" + + self.expected_extracted_code = [ + 'sub Hello {\n print "Hello, World!";\n}', + "sub new {\n my $class = shift;\n my $self = {};\n " + "bless $self, $class;\n return $self;\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(PerlSegmenter("$age = 25;").is_valid()) + self.assertFalse(PerlSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = PerlSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = PerlSegmenter(self.example_code) + simplified_code = 
segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_ruby.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_ruby.py new file mode 100644 index 0000000000000..99dd7db1fa813 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_ruby.py @@ -0,0 +1,51 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.ruby import RubySegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestRubySegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """def foo + i = 0 +end + +module M + def hi + i = 2 + end +end + +class T + def bar + j = 1 + end +end""" + + self.expected_simplified_code = """# Code for: def foo + +# Code for: module M + +# Code for: class T""" + + self.expected_extracted_code = [ + "def foo\n i = 0\nend", + "module M\n def hi\n i = 2\n end\nend", + "class T\n def bar\n j = 1\n end\nend", + ] + + def test_is_valid(self) -> None: + self.assertTrue(RubySegmenter("def a; end").is_valid()) + self.assertFalse(RubySegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = RubySegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = RubySegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_rust.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_rust.py new file mode 100644 index 0000000000000..6b35677c30505 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_rust.py @@ -0,0 +1,50 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.rust import RustSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestRustSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """fn foo() -> i32 { + return 1; +} + +struct T { + a: i32, + b: bool, + c: String +} + +trait S { + fn bar() -> Self +} +""" + + self.expected_simplified_code = """// Code for: fn foo() -> i32 { + +// Code for: struct T { + +// Code for: trait S {""" + + self.expected_extracted_code = [ + "fn foo() -> i32 {\n return 1;\n}", + "struct T {\n a: i32,\n b: bool,\n c: String\n}", + "trait S {\n fn bar() -> Self\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(RustSegmenter("let a: i32;").is_valid()) + self.assertFalse(RustSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = RustSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = RustSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_scala.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_scala.py new file mode 100644 index 0000000000000..3fad1aeb806be --- /dev/null +++ 
b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_scala.py @@ -0,0 +1,56 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.scala import ScalaSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestScalaSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """def foo() { + return 1 +} + +object T { + def baz() { + val x = 1 + } +} + +class S() { + +} + +trait T { + def P(x: Any): Boolean +}""" + + self.expected_simplified_code = """// Code for: def foo() { + +// Code for: object T { + +// Code for: class S() { + +// Code for: trait T {""" + + self.expected_extracted_code = [ + "def foo() {\n return 1\n}", + "object T {\n def baz() {\n val x = 1\n }\n}", + "class S() {\n\n}", + "trait T {\n def P(x: Any): Boolean\n}", + ] + + def test_is_valid(self) -> None: + self.assertFalse(ScalaSegmenter("val x").is_valid()) + self.assertFalse(ScalaSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = ScalaSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = ScalaSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_typescript.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_typescript.py new file mode 100644 index 0000000000000..caf4b6ad66ccf --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_typescript.py @@ -0,0 +1,67 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.typescript import ( + TypeScriptSegmenter, +) + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestTypeScriptSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """function foo(): number +{ + return 1; +} + +class Autumn +{ + leafCount = 45; + reduceTemperature(desiredTemperature: number): number { + return desiredTemperature * 0.6; + } +} + +interface Season +{ + change(): void; +} + +enum Colors +{ + Green = 'green', + Red = 'red', +} +""" + + self.expected_simplified_code = """// Code for: function foo(): number + +// Code for: class Autumn + +// Code for: interface Season + +// Code for: enum Colors""" + + self.expected_extracted_code = [ + "function foo(): number\n{\n return 1;\n}", + "class Autumn\n{\n leafCount = 45;\n " + "reduceTemperature(desiredTemperature: number): number {\n " + "return desiredTemperature * 0.6;\n }\n}", + "interface Season\n{\n change(): void;\n}", + "enum Colors\n{\n Green = 'green',\n Red = 'red',\n}", + ] + + def test_is_valid(self) -> None: + self.assertTrue(TypeScriptSegmenter("let a;").is_valid()) + self.assertFalse(TypeScriptSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = TypeScriptSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = TypeScriptSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) From db6f266d979215db37a1998f4de89ce83ddd3469 Mon 
Sep 17 00:00:00 2001 From: Sergey Kozlov Date: Tue, 13 Feb 2024 22:48:02 +0600 Subject: [PATCH 25/25] core: improve None value processing in merge_dicts() (#17462) - **Description:** fix `None` and `0` merging in `merge_dicts()`, add tests. ```python from langchain_core.utils._merge import merge_dicts assert merge_dicts({"a": None}, {"a": 0}) == {"a": 0} ``` --------- Co-authored-by: Sergey Kozlov --- libs/core/langchain_core/utils/_merge.py | 6 +- .../core/tests/unit_tests/utils/test_utils.py | 76 ++++++++++++++++++- 2 files changed, 77 insertions(+), 5 deletions(-) diff --git a/libs/core/langchain_core/utils/_merge.py b/libs/core/langchain_core/utils/_merge.py index e21fdd96621fe..13b91270c70d8 100644 --- a/libs/core/langchain_core/utils/_merge.py +++ b/libs/core/langchain_core/utils/_merge.py @@ -19,11 +19,9 @@ def merge_dicts(left: Dict[str, Any], right: Dict[str, Any]) -> Dict[str, Any]: for k, v in right.items(): if k not in merged: merged[k] = v - elif merged[k] is None and v: + elif v is not None and merged[k] is None: merged[k] = v - elif v is None: - continue - elif merged[k] == v: + elif v is None or merged[k] == v: continue elif type(merged[k]) != type(v): raise TypeError( diff --git a/libs/core/tests/unit_tests/utils/test_utils.py b/libs/core/tests/unit_tests/utils/test_utils.py index e6dedb2a5a86b..a69ea2510b41f 100644 --- a/libs/core/tests/unit_tests/utils/test_utils.py +++ b/libs/core/tests/unit_tests/utils/test_utils.py @@ -1,9 +1,12 @@ -from typing import Dict, Optional, Tuple, Type +import re +from contextlib import AbstractContextManager, nullcontext +from typing import Dict, Optional, Tuple, Type, Union from unittest.mock import patch import pytest from langchain_core.utils import check_package_version +from langchain_core.utils._merge import merge_dicts @pytest.mark.parametrize( @@ -28,3 +31,74 @@ def test_check_package_version( else: with pytest.raises(expected[0], match=expected[1]): check_package_version(package, **check_kwargs) + + +@pytest.mark.parametrize( + ("left", "right", "expected"), + ( + # Merge `None` and `1`. + ({"a": None}, {"a": 1}, {"a": 1}), + # Merge `1` and `None`. + ({"a": 1}, {"a": None}, {"a": 1}), + # Merge `None` and a value. + ({"a": None}, {"a": 0}, {"a": 0}), + ({"a": None}, {"a": "txt"}, {"a": "txt"}), + # Merge equal values. + ({"a": 1}, {"a": 1}, {"a": 1}), + ({"a": 1.5}, {"a": 1.5}, {"a": 1.5}), + ({"a": True}, {"a": True}, {"a": True}), + ({"a": False}, {"a": False}, {"a": False}), + ({"a": "txt"}, {"a": "txt"}, {"a": "txt"}), + ({"a": [1, 2]}, {"a": [1, 2]}, {"a": [1, 2]}), + ({"a": {"b": "txt"}}, {"a": {"b": "txt"}}, {"a": {"b": "txt"}}), + # Merge strings. + ({"a": "one"}, {"a": "two"}, {"a": "onetwo"}), + # Merge dicts. + ({"a": {"b": 1}}, {"a": {"c": 2}}, {"a": {"b": 1, "c": 2}}), + ( + {"function_call": {"arguments": None}}, + {"function_call": {"arguments": "{\n"}}, + {"function_call": {"arguments": "{\n"}}, + ), + # Merge lists. + ({"a": [1, 2]}, {"a": [3]}, {"a": [1, 2, 3]}), + ({"a": 1, "b": 2}, {"a": 1}, {"a": 1, "b": 2}), + ({"a": 1, "b": 2}, {"c": None}, {"a": 1, "b": 2, "c": None}), + # + # Invalid inputs. + # + ( + {"a": 1}, + {"a": "1"}, + pytest.raises( + TypeError, + match=re.escape( + 'additional_kwargs["a"] already exists in this message, ' + "but with a different type." + ), + ), + ), + ( + {"a": (1, 2)}, + {"a": (3,)}, + pytest.raises( + TypeError, + match=( + "Additional kwargs key a already exists in left dict and value " + "has unsupported type .+tuple.+." 
+ ), + ), + ), + ), +) +def test_merge_dicts( + left: dict, right: dict, expected: Union[dict, AbstractContextManager] +) -> None: + if isinstance(expected, AbstractContextManager): + err = expected + else: + err = nullcontext() + + with err: + actual = merge_dicts(left, right) + assert actual == expected
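
Editor's note: the new unit tests above double as usage documentation for the tree-sitter-backed language segmenters added in this patch. As a rough smoke test outside pytest (a sketch, not part of the patches; it assumes the optional `tree-sitter` and `tree-sitter-languages` packages from the `extended-testing` extra are installed), the same API the tests exercise can be driven directly:

```python
# Sketch only: mirrors the API covered by test_c.py above.
from langchain_community.document_loaders.parsers.language.c import CSegmenter

code = """int main() {
    return 0;
}"""

segmenter = CSegmenter(code)
print(segmenter.is_valid())                   # True for syntactically valid C source
print(segmenter.extract_functions_classes())  # e.g. ['int main() {\n    return 0;\n}']
print(segmenter.simplify_code())              # e.g. '// Code for: int main() {'
```

The `merge_dicts()` behavior pinned down by the final patch can be sanity-checked the same way; the calls below simply replay a few of the parametrized cases from `test_merge_dicts`, so they are illustrative rather than additional coverage:

```python
# Sketch only: replays cases asserted by test_merge_dicts above.
from langchain_core.utils._merge import merge_dicts

assert merge_dicts({"a": None}, {"a": 0}) == {"a": 0}              # falsy values now overwrite None
assert merge_dicts({"a": "one"}, {"a": "two"}) == {"a": "onetwo"}  # strings are concatenated
assert merge_dicts({"a": {"b": 1}}, {"a": {"c": 2}}) == {"a": {"b": 1, "c": 2}}  # nested dicts are merged
assert merge_dicts({"a": [1, 2]}, {"a": [3]}) == {"a": [1, 2, 3]}  # lists are concatenated

try:
    merge_dicts({"a": 1}, {"a": "1"})  # mismatched value types are rejected
except TypeError as err:
    print(err)
```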