diff --git a/libs/langchain/langchain/retrievers/arxiv.py b/libs/langchain/langchain/retrievers/arxiv.py
index f9a7910d8da7f..b14f0b61f8a4f 100644
--- a/libs/langchain/langchain/retrievers/arxiv.py
+++ b/libs/langchain/langchain/retrievers/arxiv.py
@@ -12,7 +12,13 @@ class ArxivRetriever(BaseRetriever, ArxivAPIWrapper):
     It uses all ArxivAPIWrapper arguments without any change.
     """
 
+    # When True, documents come from `load()`; otherwise (the default) they
+    # come from `get_summaries_as_docs()`.
+    get_full_documents: bool = False
+
     def _get_relevant_documents(
         self, query: str, *, run_manager: CallbackManagerForRetrieverRun
     ) -> List[Document]:
-        return self.load(query=query)
+        if self.get_full_documents:
+            return self.load(query=query)
+        return self.get_summaries_as_docs(query)
diff --git a/libs/langchain/langchain/utilities/arxiv.py b/libs/langchain/langchain/utilities/arxiv.py
index b5439318c0257..9eef84ecb173f 100644
--- a/libs/langchain/langchain/utilities/arxiv.py
+++ b/libs/langchain/langchain/utilities/arxiv.py
@@ -90,6 +90,53 @@ def validate_environment(cls, values: Dict) -> Dict:
             )
         return values
 
+    def get_summaries_as_docs(self, query: str) -> List[Document]:
+        """
+        Performs an arxiv search and returns list of
+        documents, with summaries as the content.
+
+        If one of `self.arxiv_exceptions` is raised, a single document
+        holding the error text is returned instead; when the search
+        yields no results, the returned list is empty.
+        Wrapper for
+        https://lukasschwab.me/arxiv.py/index.html#Search
+
+        Args:
+            query: a plaintext search query; when it is recognized as
+                arxiv identifier(s) it is split on whitespace and
+                looked up by id
+
+        Returns:
+            One document per result, with the summary as content and
+            "Published", "Title" and "Authors" in the metadata.
+        """
+        try:
+            if self.is_arxiv_identifier(query):
+                search = self.arxiv_search(
+                    id_list=query.split(),
+                    max_results=self.top_k_results,
+                )
+            else:
+                search = self.arxiv_search(  # type: ignore
+                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+                )
+            # Consume the results inside the ``try``: when ``results()`` is
+            # lazy, request errors only surface while iterating.
+            results = list(search.results())
+        except self.arxiv_exceptions as ex:
+            return [Document(page_content=f"Arxiv exception: {ex}")]
+        return [
+            Document(
+                page_content=result.summary,
+                metadata={
+                    "Published": result.updated.date(),
+                    "Title": result.title,
+                    "Authors": ", ".join(a.name for a in result.authors),
+                },
+            )
+            for result in results
+        ]
+
     def run(self, query: str) -> str:
         """
         Performs an arxiv search and A single string