Merge branch 'master' into fix-github-list-files

langchain-ai · Nov 6, 2024 · 42bd890 · 42bd890
2 parents 5f47883 + 41b7a51
commit 42bd890
Show file tree

Hide file tree

Showing 43 changed files with 2,373 additions and 1,629 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -0,0 +1,2 @@
+/.github/   @efriis @baskaryan @ccurme
+/libs/packages.yml   @efriis
diff --git a/.github/scripts/get_min_versions.py b/.github/scripts/get_min_versions.py
@@ -7,12 +7,17 @@
     # for python 3.10 and below, which doesn't have stdlib tomllib
     import tomli as tomllib
 
-from packaging.version import parse as parse_version
 from packaging.specifiers import SpecifierSet
 from packaging.version import Version
 
+
+import requests
+from packaging.version import parse
+from typing import List
+
 import re
 
+
 MIN_VERSION_LIBS = [
     "langchain-core",
     "langchain-community",
@@ -31,29 +36,61 @@
 ]
 
 
-def get_min_version(version: str) -> str:
-    # base regex for x.x.x with cases for rc/post/etc
-    # valid strings: https://peps.python.org/pep-0440/#public-version-identifiers
-    vstring = r"\d+(?:\.\d+){0,2}(?:(?:a|b|rc|\.post|\.dev)\d+)?"
-    # case ^x.x.x
-    _match = re.match(f"^\\^({vstring})$", version)
-    if _match:
-        return _match.group(1)
+def get_pypi_versions(package_name: str) -> List[str]:
+    """
+    Fetch all available versions for a package from PyPI.
+
+    Args:
+        package_name (str): Name of the package
+
+    Returns:
+        List[str]: List of all available versions
 
-    # case >=x.x.x,<y.y.y
-    _match = re.match(f"^>=({vstring}),<({vstring})$", version)
-    if _match:
-        _min = _match.group(1)
-        _max = _match.group(2)
-        assert parse_version(_min) < parse_version(_max)
-        return _min
+    Raises:
+        requests.exceptions.RequestException: If PyPI API request fails
+        KeyError: If package not found or response format unexpected
+    """
+    pypi_url = f"https://pypi.org/pypi/{package_name}/json"
+    response = requests.get(pypi_url)
+    response.raise_for_status()
+    return list(response.json()["releases"].keys())
 
-    # case x.x.x
-    _match = re.match(f"^({vstring})$", version)
-    if _match:
-        return _match.group(1)
 
-    raise ValueError(f"Unrecognized version format: {version}")
+def get_minimum_version(package_name: str, spec_string: str) -> Optional[str]:
+    """
+    Find the minimum published version that satisfies the given constraints.
+
+    Args:
+        package_name (str): Name of the package
+        spec_string (str): Version specification string (e.g., ">=0.2.43,<0.4.0,!=0.3.0")
+
+    Returns:
+        Optional[str]: Minimum compatible version or None if no compatible version found
+    """
+    # rewrite occurrences of ^0.0.z to 0.0.z (can be anywhere in constraint string)
+    spec_string = re.sub(r"\^0\.0\.(\d+)", r"0.0.\1", spec_string)
+    # rewrite occurrences of ^0.y.z to >=0.y.z,<0.y+1 (can be anywhere in constraint string)
+    for y in range(1, 10):
+        spec_string = re.sub(rf"\^0\.{y}\.(\d+)", rf">=0.{y}.\1,<0.{y+1}", spec_string)
+    # rewrite occurrences of ^x.y.z to >=x.y.z,<x+1.0.0 (can be anywhere in constraint string)
+    for x in range(1, 10):
+        spec_string = re.sub(
+            rf"\^{x}\.(\d+)\.(\d+)", rf">={x}.\1.\2,<{x+1}", spec_string
+        )
+
+    spec_set = SpecifierSet(spec_string)
+    all_versions = get_pypi_versions(package_name)
+
+    valid_versions = []
+    for version_str in all_versions:
+        try:
+            version = parse(version_str)
+            if spec_set.contains(version):
+                valid_versions.append(version)
+        except ValueError:
+            continue
+
+    return str(min(valid_versions)) if valid_versions else None
 
 
 def get_min_version_from_toml(
@@ -96,7 +133,7 @@ def get_min_version_from_toml(
                 ][0]["version"]
 
             # Look up the minimum published version that satisfies version_string
-            min_version = get_min_version(version_string)
+            min_version = get_minimum_version(lib, version_string)
 
             # Store the minimum version in the min_versions dictionary
             min_versions[lib] = min_version
@@ -112,6 +149,20 @@ def check_python_version(version_string, constraint_string):
     :param constraint_string: A string representing the package's Python version constraints (e.g. ">=3.6, <4.0").
     :return: True if the version matches the constraints, False otherwise.
     """
+
+    # rewrite occurrences of ^0.0.z to 0.0.z (can be anywhere in constraint string)
+    constraint_string = re.sub(r"\^0\.0\.(\d+)", r"0.0.\1", constraint_string)
+    # rewrite occurrences of ^0.y.z to >=0.y.z,<0.y+1.0 (can be anywhere in constraint string)
+    for y in range(1, 10):
+        constraint_string = re.sub(
+            rf"\^0\.{y}\.(\d+)", rf">=0.{y}.\1,<0.{y+1}.0", constraint_string
+        )
+    # rewrite occurrences of ^x.y.z to >=x.y.z,<x+1.0.0 (can be anywhere in constraint string)
+    for x in range(1, 10):
+        constraint_string = re.sub(
+            rf"\^{x}\.0\.(\d+)", rf">={x}.0.\1,<{x+1}.0.0", constraint_string
+        )
+
     try:
         version = Version(version_string)
         constraints = SpecifierSet(constraint_string)

diff --git a/.github/workflows/_release.yml b/.github/workflows/_release.yml
@@ -95,9 +95,30 @@ jobs:
           PKG_NAME: ${{ needs.build.outputs.pkg-name }}
           VERSION: ${{ needs.build.outputs.version }}
         run: |
-          REGEX="^$PKG_NAME==\\d+\\.\\d+\\.\\d+\$"
-          echo $REGEX
-          PREV_TAG=$(git tag --sort=-creatordate | grep -P $REGEX || true | head -1)
+          PREV_TAG="$PKG_NAME==${VERSION%.*}.$(( ${VERSION##*.} - 1 ))"; [[ "${VERSION##*.}" -eq 0 ]] && PREV_TAG=""
+
+          # backup case if releasing e.g. 0.3.0, looks up last release
+          # note if last release (chronologically) was e.g. 0.1.47 it will get 
+          # that instead of the last 0.2 release
+          if [ -z "$PREV_TAG" ]; then
+            REGEX="^$PKG_NAME==\\d+\\.\\d+\\.\\d+\$"
+            echo $REGEX
+            PREV_TAG=$(git tag --sort=-creatordate | (grep -P $REGEX || true) | head -1)
+          fi
+
+          # if PREV_TAG is empty, let it be empty
+          if [ -z "$PREV_TAG" ]; then
+            echo "No previous tag found - first release"
+          else
+            # confirm prev-tag actually exists in git repo with git tag
+            GIT_TAG_RESULT=$(git tag -l "$PREV_TAG")
+            if [ -z "$GIT_TAG_RESULT" ]; then
+              echo "Previous tag $PREV_TAG not found in git repo"
+              exit 1
+            fi
+          fi
+
+
           TAG="${PKG_NAME}==${VERSION}"
           if [ "$TAG" == "$PREV_TAG" ]; then
             echo "No new version to release"
@@ -231,7 +252,7 @@ jobs:
         working-directory: ${{ inputs.working-directory }}
         id: min-version
         run: |
-          poetry run pip install packaging
+          poetry run pip install packaging requests
           python_version="$(poetry run python --version | awk '{print $2}')"
           min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml release $python_version)"
           echo "min-versions=$min_versions" >> "$GITHUB_OUTPUT"

diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
@@ -47,7 +47,7 @@ jobs:
         id: min-version
         shell: bash
         run: |
-          poetry run pip install packaging tomli
+          poetry run pip install packaging tomli requests
           python_version="$(poetry run python --version | awk '{print $2}')"
           min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml pull_request $python_version)"
           echo "min-versions=$min_versions" >> "$GITHUB_OUTPUT"

diff --git a/.github/workflows/check_diffs.yml b/.github/workflows/check_diffs.yml
@@ -31,7 +31,7 @@ jobs:
         uses: Ana06/get-changed-files@v2.2.0
       - id: set-matrix
         run: |
-          python -m pip install packaging
+          python -m pip install packaging requests
           python .github/scripts/check_diff.py ${{ steps.files.outputs.all }} >> $GITHUB_OUTPUT
     outputs:
       lint: ${{ steps.set-matrix.outputs.lint }}

diff --git a/docs/docs/concepts/messages.mdx b/docs/docs/concepts/messages.mdx
@@ -12,7 +12,7 @@ Each message has a **role** (e.g., "user", "assistant"), **content** (e.g., text
 
 LangChain provides a unified message format that can be used across chat models, allowing users to work with different chat models without worrying about the specific details of the message format used by each model provider.
 
-## What inside a message?
+## What is inside a message?
 
 A message typically consists of the following pieces of information:
 

diff --git a/docs/docs/how_to/graph_constructing.ipynb b/docs/docs/how_to/graph_constructing.ipynb
@@ -44,6 +44,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
       "Note: you may need to restart the kernel to use updated packages.\n"
      ]
     }
@@ -105,7 +108,7 @@
     "os.environ[\"NEO4J_USERNAME\"] = \"neo4j\"\n",
     "os.environ[\"NEO4J_PASSWORD\"] = \"password\"\n",
     "\n",
-    "graph = Neo4jGraph()"
+    "graph = Neo4jGraph(refresh_schema=False)"
    ]
   },
   {
@@ -149,8 +152,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Nodes:[Node(id='Marie Curie', type='Person'), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n",
-      "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='MARRIED'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='PROFESSOR')]\n"
+      "Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n",
+      "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='MARRIED', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='PROFESSOR', properties={})]\n"
      ]
     }
    ],
@@ -191,8 +194,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Nodes:[Node(id='Marie Curie', type='Person'), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n",
-      "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='SPOUSE'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='WORKED_AT')]\n"
+      "Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n",
+      "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n"
      ]
     }
    ],
@@ -209,6 +212,44 @@
     "print(f\"Relationships:{graph_documents_filtered[0].relationships}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To define the graph schema more precisely, consider using a three-tuple approach for relationships. In this approach, each tuple consists of three elements: the source node, the relationship type, and the target node."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n",
+      "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n"
+     ]
+    }
+   ],
+   "source": [
+    "allowed_relationships = [\n",
+    "    (\"Person\", \"SPOUSE\", \"Person\"),\n",
+    "    (\"Person\", \"NATIONALITY\", \"Country\"),\n",
+    "    (\"Person\", \"WORKED_AT\", \"Organization\"),\n",
+    "]\n",
+    "\n",
+    "llm_transformer_tuple = LLMGraphTransformer(\n",
+    "    llm=llm,\n",
+    "    allowed_nodes=[\"Person\", \"Country\", \"Organization\"],\n",
+    "    allowed_relationships=allowed_relationships,\n",
+    ")\n",
+    "llm_transformer_tuple = llm_transformer_filtered.convert_to_graph_documents(documents)\n",
+    "print(f\"Nodes:{graph_documents_filtered[0].nodes}\")\n",
+    "print(f\"Relationships:{graph_documents_filtered[0].relationships}\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -229,15 +270,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Nodes:[Node(id='Marie Curie', type='Person', properties={'born_year': '1867'}), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n",
-      "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='SPOUSE'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='WORKED_AT')]\n"
+      "Nodes:[Node(id='Marie Curie', type='Person', properties={'born_year': '1867'}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={}), Node(id='Poland', type='Country', properties={}), Node(id='France', type='Country', properties={})]\n",
+      "Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Poland', type='Country', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='France', type='Country', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n"
      ]
     }
    ],
@@ -264,12 +305,71 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
     "graph.add_graph_documents(graph_documents_props)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Most graph databases support indexes to optimize data import and retrieval. Since we might not know all the node labels in advance, we can handle this by adding a secondary base label to each node using the `baseEntityLabel` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "graph.add_graph_documents(graph_documents, baseEntityLabel=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Results will look like:\n",
+    "\n",
+    "![graph_construction3.png](../../static/img/graph_construction3.png)\n",
+    "\n",
+    "The final option is to also import the source documents for the extracted nodes and relationships. This approach lets us track which documents each entity appeared in."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "graph.add_graph_documents(graph_documents, include_source=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Graph will have the following structure:\n",
+    "\n",
+    "![graph_construction4.png](../../static/img/graph_construction4.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this visualization, the source document is highlighted in blue, with all entities extracted from it connected by `MENTIONS` relationships."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -288,7 +388,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.1"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,

diff --git a/docs/docs/how_to/multi_vector.ipynb b/docs/docs/how_to/multi_vector.ipynb
@@ -207,7 +207,7 @@
    "id": "cdef8339-f9fa-4b3b-955f-ad9dbdf2734f",
    "metadata": {},
    "source": [
-    "The default search type the retriever performs on the vector database is a similarity search. LangChain vector stores also support searching via [Max Marginal Relevance](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.max_marginal_relevance_search). This can be controlled via the `search_type` parameter of the retriever:"
+    "The default search type the retriever performs on the vector database is a similarity search. LangChain vector stores also support searching via [Max Marginal Relevance](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html#langchain_core.vectorstores.base.VectorStore.max_marginal_relevance_search). This can be controlled via the `search_type` parameter of the retriever:"
    ]
   },
   {
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		/.github/ @efriis @baskaryan @ccurme
		/libs/packages.yml @efriis