Skip to content

Commit

Permalink
Merge branch 'master' into fix-github-list-files
Browse files Browse the repository at this point in the history
  • Loading branch information
lorenzb07 authored Nov 6, 2024
2 parents 5f47883 + 41b7a51 commit 42bd890
Show file tree
Hide file tree
Showing 43 changed files with 2,373 additions and 1,629 deletions.
2 changes: 2 additions & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/.github/ @efriis @baskaryan @ccurme
/libs/packages.yml @efriis
95 changes: 73 additions & 22 deletions .github/scripts/get_min_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@
# for python 3.10 and below, which doesnt have stdlib tomllib
import tomli as tomllib

from packaging.version import parse as parse_version
from packaging.specifiers import SpecifierSet
from packaging.version import Version


import requests
from packaging.version import parse
from typing import List

import re


MIN_VERSION_LIBS = [
"langchain-core",
"langchain-community",
Expand All @@ -31,29 +36,61 @@
]


def get_min_version(version: str) -> str:
# base regex for x.x.x with cases for rc/post/etc
# valid strings: https://peps.python.org/pep-0440/#public-version-identifiers
vstring = r"\d+(?:\.\d+){0,2}(?:(?:a|b|rc|\.post|\.dev)\d+)?"
# case ^x.x.x
_match = re.match(f"^\\^({vstring})$", version)
if _match:
return _match.group(1)
def get_pypi_versions(package_name: str) -> List[str]:
"""
Fetch all available versions for a package from PyPI.
Args:
package_name (str): Name of the package
Returns:
List[str]: List of all available versions
# case >=x.x.x,<y.y.y
_match = re.match(f"^>=({vstring}),<({vstring})$", version)
if _match:
_min = _match.group(1)
_max = _match.group(2)
assert parse_version(_min) < parse_version(_max)
return _min
Raises:
requests.exceptions.RequestException: If PyPI API request fails
KeyError: If package not found or response format unexpected
"""
pypi_url = f"https://pypi.org/pypi/{package_name}/json"
response = requests.get(pypi_url)
response.raise_for_status()
return list(response.json()["releases"].keys())

# case x.x.x
_match = re.match(f"^({vstring})$", version)
if _match:
return _match.group(1)

raise ValueError(f"Unrecognized version format: {version}")
def get_minimum_version(package_name: str, spec_string: str) -> Optional[str]:
"""
Find the minimum published version that satisfies the given constraints.
Args:
package_name (str): Name of the package
spec_string (str): Version specification string (e.g., ">=0.2.43,<0.4.0,!=0.3.0")
Returns:
Optional[str]: Minimum compatible version or None if no compatible version found
"""
# rewrite occurrences of ^0.0.z to 0.0.z (can be anywhere in constraint string)
spec_string = re.sub(r"\^0\.0\.(\d+)", r"0.0.\1", spec_string)
# rewrite occurrences of ^0.y.z to >=0.y.z,<0.y+1 (can be anywhere in constraint string)
for y in range(1, 10):
spec_string = re.sub(rf"\^0\.{y}\.(\d+)", rf">=0.{y}.\1,<0.{y+1}", spec_string)
# rewrite occurrences of ^x.y.z to >=x.y.z,<x+1.0.0 (can be anywhere in constraint string)
for x in range(1, 10):
spec_string = re.sub(
rf"\^{x}\.(\d+)\.(\d+)", rf">={x}.\1.\2,<{x+1}", spec_string
)

spec_set = SpecifierSet(spec_string)
all_versions = get_pypi_versions(package_name)

valid_versions = []
for version_str in all_versions:
try:
version = parse(version_str)
if spec_set.contains(version):
valid_versions.append(version)
except ValueError:
continue

return str(min(valid_versions)) if valid_versions else None


def get_min_version_from_toml(
Expand Down Expand Up @@ -96,7 +133,7 @@ def get_min_version_from_toml(
][0]["version"]

# Use parse_version to get the minimum supported version from version_string
min_version = get_min_version(version_string)
min_version = get_minimum_version(lib, version_string)

# Store the minimum version in the min_versions dictionary
min_versions[lib] = min_version
Expand All @@ -112,6 +149,20 @@ def check_python_version(version_string, constraint_string):
:param constraint_string: A string representing the package's Python version constraints (e.g. ">=3.6, <4.0").
:return: True if the version matches the constraints, False otherwise.
"""

# rewrite occurrences of ^0.0.z to 0.0.z (can be anywhere in constraint string)
constraint_string = re.sub(r"\^0\.0\.(\d+)", r"0.0.\1", constraint_string)
# rewrite occurrences of ^0.y.z to >=0.y.z,<0.y+1.0 (can be anywhere in constraint string)
for y in range(1, 10):
constraint_string = re.sub(
rf"\^0\.{y}\.(\d+)", rf">=0.{y}.\1,<0.{y+1}.0", constraint_string
)
# rewrite occurrences of ^x.y.z to >=x.y.z,<x+1.0.0 (can be anywhere in constraint string)
for x in range(1, 10):
constraint_string = re.sub(
rf"\^{x}\.0\.(\d+)", rf">={x}.0.\1,<{x+1}.0.0", constraint_string
)

try:
version = Version(version_string)
constraints = SpecifierSet(constraint_string)
Expand Down
29 changes: 25 additions & 4 deletions .github/workflows/_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,30 @@ jobs:
PKG_NAME: ${{ needs.build.outputs.pkg-name }}
VERSION: ${{ needs.build.outputs.version }}
run: |
REGEX="^$PKG_NAME==\\d+\\.\\d+\\.\\d+\$"
echo $REGEX
PREV_TAG=$(git tag --sort=-creatordate | grep -P $REGEX || true | head -1)
PREV_TAG="$PKG_NAME==${VERSION%.*}.$(( ${VERSION##*.} - 1 ))"; [[ "${VERSION##*.}" -eq 0 ]] && PREV_TAG=""
# backup case if releasing e.g. 0.3.0, looks up last release
# note if last release (chronologically) was e.g. 0.1.47 it will get
# that instead of the last 0.2 release
if [ -z "$PREV_TAG" ]; then
REGEX="^$PKG_NAME==\\d+\\.\\d+\\.\\d+\$"
echo $REGEX
PREV_TAG=$(git tag --sort=-creatordate | (grep -P $REGEX || true) | head -1)
fi
# if PREV_TAG is empty, let it be empty
if [ -z "$PREV_TAG" ]; then
echo "No previous tag found - first release"
else
# confirm prev-tag actually exists in git repo with git tag
GIT_TAG_RESULT=$(git tag -l "$PREV_TAG")
if [ -z "$GIT_TAG_RESULT" ]; then
echo "Previous tag $PREV_TAG not found in git repo"
exit 1
fi
fi
TAG="${PKG_NAME}==${VERSION}"
if [ "$TAG" == "$PREV_TAG" ]; then
echo "No new version to release"
Expand Down Expand Up @@ -231,7 +252,7 @@ jobs:
working-directory: ${{ inputs.working-directory }}
id: min-version
run: |
poetry run pip install packaging
poetry run pip install packaging requests
python_version="$(poetry run python --version | awk '{print $2}')"
min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml release $python_version)"
echo "min-versions=$min_versions" >> "$GITHUB_OUTPUT"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
id: min-version
shell: bash
run: |
poetry run pip install packaging tomli
poetry run pip install packaging tomli requests
python_version="$(poetry run python --version | awk '{print $2}')"
min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml pull_request $python_version)"
echo "min-versions=$min_versions" >> "$GITHUB_OUTPUT"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/check_diffs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
uses: Ana06/[email protected]
- id: set-matrix
run: |
python -m pip install packaging
python -m pip install packaging requests
python .github/scripts/check_diff.py ${{ steps.files.outputs.all }} >> $GITHUB_OUTPUT
outputs:
lint: ${{ steps.set-matrix.outputs.lint }}
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/concepts/messages.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Each message has a **role** (e.g., "user", "assistant"), **content** (e.g., text

LangChain provides a unified message format that can be used across chat models, allowing users to work with different chat models without worrying about the specific details of the message format used by each model provider.

## What inside a message?
## What is inside a message?

A message typically consists of the following pieces of information:

Expand Down
120 changes: 110 additions & 10 deletions docs/docs/how_to/graph_constructing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
Expand Down Expand Up @@ -105,7 +108,7 @@
"os.environ[\"NEO4J_USERNAME\"] = \"neo4j\"\n",
"os.environ[\"NEO4J_PASSWORD\"] = \"password\"\n",
"\n",
"graph = Neo4jGraph()"
"graph = Neo4jGraph(refresh_schema=False)"
]
},
{
Expand Down Expand Up @@ -149,8 +152,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Nodes:[Node(id='Marie Curie', type='Person'), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n",
"Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='MARRIED'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='PROFESSOR')]\n"
"Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n",
"Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='MARRIED', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='PROFESSOR', properties={})]\n"
]
}
],
Expand Down Expand Up @@ -191,8 +194,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Nodes:[Node(id='Marie Curie', type='Person'), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n",
"Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='SPOUSE'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='WORKED_AT')]\n"
"Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n",
"Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n"
]
}
],
Expand All @@ -209,6 +212,44 @@
"print(f\"Relationships:{graph_documents_filtered[0].relationships}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To define the graph schema more precisely, consider using a three-tuple approach for relationships. In this approach, each tuple consists of three elements: the source node, the relationship type, and the target node."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nodes:[Node(id='Marie Curie', type='Person', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={})]\n",
"Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n"
]
}
],
"source": [
"allowed_relationships = [\n",
" (\"Person\", \"SPOUSE\", \"Person\"),\n",
" (\"Person\", \"NATIONALITY\", \"Country\"),\n",
" (\"Person\", \"WORKED_AT\", \"Organization\"),\n",
"]\n",
"\n",
"llm_transformer_tuple = LLMGraphTransformer(\n",
" llm=llm,\n",
" allowed_nodes=[\"Person\", \"Country\", \"Organization\"],\n",
" allowed_relationships=allowed_relationships,\n",
")\n",
"llm_transformer_tuple = llm_transformer_filtered.convert_to_graph_documents(documents)\n",
"print(f\"Nodes:{graph_documents_filtered[0].nodes}\")\n",
"print(f\"Relationships:{graph_documents_filtered[0].relationships}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -229,15 +270,15 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nodes:[Node(id='Marie Curie', type='Person', properties={'born_year': '1867'}), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization')]\n",
"Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='SPOUSE'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='WORKED_AT')]\n"
"Nodes:[Node(id='Marie Curie', type='Person', properties={'born_year': '1867'}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='University Of Paris', type='Organization', properties={}), Node(id='Poland', type='Country', properties={}), Node(id='France', type='Country', properties={})]\n",
"Relationships:[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Poland', type='Country', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='France', type='Country', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Pierre Curie', type='Person', properties={}), type='SPOUSE', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='University Of Paris', type='Organization', properties={}), type='WORKED_AT', properties={})]\n"
]
}
],
Expand All @@ -264,12 +305,71 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"graph.add_graph_documents(graph_documents_props)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Most graph databases support indexes to optimize data import and retrieval. Since we might not know all the node labels in advance, we can handle this by adding a secondary base label to each node using the `baseEntityLabel` parameter."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"graph.add_graph_documents(graph_documents, baseEntityLabel=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Results will look like:\n",
"\n",
"![graph_construction3.png](../../static/img/graph_construction3.png)\n",
"\n",
"The final option is to also import the source documents for the extracted nodes and relationships. This approach lets us track which documents each entity appeared in."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"graph.add_graph_documents(graph_documents, include_source=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Graph will have the following structure:\n",
"\n",
"![graph_construction4.png](../../static/img/graph_construction4.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this visualization, the source document is highlighted in blue, with all entities extracted from it connected by `MENTIONS` relationships."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -288,7 +388,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.1"
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/how_to/multi_vector.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@
"id": "cdef8339-f9fa-4b3b-955f-ad9dbdf2734f",
"metadata": {},
"source": [
"The default search type the retriever performs on the vector database is a similarity search. LangChain vector stores also support searching via [Max Marginal Relevance](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.max_marginal_relevance_search). This can be controlled via the `search_type` parameter of the retriever:"
"The default search type the retriever performs on the vector database is a similarity search. LangChain vector stores also support searching via [Max Marginal Relevance](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html#langchain_core.vectorstores.base.VectorStore.max_marginal_relevance_search). This can be controlled via the `search_type` parameter of the retriever:"
]
},
{
Expand Down
Loading

0 comments on commit 42bd890

Please sign in to comment.