diff --git a/docs/docs/modules/data_connection/document_loaders/json.mdx b/docs/docs/modules/data_connection/document_loaders/json.mdx
index 93e85612a76d0..40275dc2c91a7 100644
--- a/docs/docs/modules/data_connection/document_loaders/json.mdx
+++ b/docs/docs/modules/data_connection/document_loaders/json.mdx
@@ -199,6 +199,58 @@ pprint(data)
+### JSON file with jq schema `content_key`
+
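+To extract the document content with a jq expression instead of a plain key, pass that expression as `content_key` and set `is_content_key_jq_parsable=True`.
+The `content_key` expression is applied to each object produced by `jq_schema`, so make sure it is a valid jq expression (for example `.attributes.message`).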
+
+```python
+file_path = './sample.json'
+pprint(Path(file_path).read_text())
+```
+
+
+
+```json
+ {"data": [
+ {"attributes": {
+ "message": "message1",
+ "tags": [
+ "tag1"]},
+ "id": "1"},
+ {"attributes": {
+ "message": "message2",
+ "tags": [
+ "tag2"]},
+ "id": "2"}]}
+```
+
+
+
+
+```python
+loader = JSONLoader(
+ file_path=file_path,
+ jq_schema=".data[]",
+ content_key=".attributes.message",
+ is_content_key_jq_parsable=True,
+)
+
+data = loader.load()
+```
+
+```python
+pprint(data)
+```
+
+
+
+```
+ [Document(page_content='message1', metadata={'source': '/path/to/sample.json', 'seq_num': 1}),
+ Document(page_content='message2', metadata={'source': '/path/to/sample.json', 'seq_num': 2})]
+```
+
+
## Extracting metadata
diff --git a/libs/community/langchain_community/document_loaders/json_loader.py b/libs/community/langchain_community/document_loaders/json_loader.py
index ef7be7caaf81e..f828873073d71 100644
--- a/libs/community/langchain_community/document_loaders/json_loader.py
+++ b/libs/community/langchain_community/document_loaders/json_loader.py
@@ -21,6 +21,7 @@ def __init__(
file_path: Union[str, Path],
jq_schema: str,
content_key: Optional[str] = None,
+ is_content_key_jq_parsable: Optional[bool] = False,
metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
text_content: bool = True,
json_lines: bool = False,
@@ -31,8 +32,16 @@ def __init__(
file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
jq_schema (str): The jq schema to use to extract the data or text from
the JSON.
- content_key (str): The key to use to extract the content from the JSON if
- the jq_schema results to a list of objects (dict).
+ content_key (str): The key to use to extract the content from
+ the JSON if the jq_schema results in a list of objects (dict).
+ If is_content_key_jq_parsable is True, this has to be a jq compatible
+ schema. If is_content_key_jq_parsable is False, this should be a simple
+ string key.
+ is_content_key_jq_parsable (bool): Whether content_key should be
+ interpreted as a jq expression. If True, content_key is compiled
+ with jq and applied to each extracted object. If False, content_key
+ is used as a plain dictionary key. Not used when content_key is None.
+ Default is False.
metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
object extracted by the jq_schema and the default metadata and returns
a dict of the updated metadata.
@@ -43,6 +52,8 @@ def __init__(
"""
try:
import jq # noqa:F401
+
+ self.jq = jq
except ImportError:
raise ImportError(
"jq package not found, please install it with `pip install jq`"
@@ -50,6 +61,7 @@ def __init__(
self.file_path = Path(file_path).resolve()
self._jq_schema = jq.compile(jq_schema)
+ self._is_content_key_jq_parsable = is_content_key_jq_parsable
self._content_key = content_key
self._metadata_func = metadata_func
self._text_content = text_content
@@ -90,7 +102,11 @@ def _parse(self, content: str, docs: List[Document]) -> None:
def _get_text(self, sample: Any) -> str:
"""Convert sample to string format"""
if self._content_key is not None:
- content = sample.get(self._content_key)
+ if self._is_content_key_jq_parsable:
+ compiled_content_key = self.jq.compile(self._content_key)
+ content = compiled_content_key.input(sample).first()
+ else:
+ content = sample[self._content_key]
else:
content = sample
@@ -125,6 +141,7 @@ def _get_metadata(
def _validate_content_key(self, data: Any) -> None:
"""Check if a content key is valid"""
+
sample = data.first()
if not isinstance(sample, dict):
raise ValueError(
@@ -132,11 +149,22 @@ def _validate_content_key(self, data: Any) -> None:
so sample must be a dict but got `{type(sample)}`"
)
- if sample.get(self._content_key) is None:
+ if (
+ not self._is_content_key_jq_parsable
+ and sample.get(self._content_key) is None
+ ):
raise ValueError(
f"Expected the jq schema to result in a list of objects (dict) \
with the key `{self._content_key}`"
)
+ if (
+ self._is_content_key_jq_parsable
+ and self.jq.compile(self._content_key).input(sample).text() is None
+ ):
+ raise ValueError(
+ f"Expected the jq schema to result in a list of objects (dict) \
+ with the key `{self._content_key}` which should be parsable by jq"
+ )
def _validate_metadata_func(self, data: Any) -> None:
"""Check if the metadata_func output is valid"""
diff --git a/libs/community/tests/unit_tests/document_loaders/test_json_loader.py b/libs/community/tests/unit_tests/document_loaders/test_json_loader.py
index 046fee4507c9f..c4b1df4f2a30f 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_json_loader.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_json_loader.py
@@ -319,3 +319,123 @@ def metadata_func(record: Dict, metadata: Dict) -> Dict:
result = loader.load()
assert result == expected_docs
+
+
+@pytest.mark.parametrize(
+ "params",
+ (
+ {"jq_schema": ".[].text"},
+ {"jq_schema": ".[]", "content_key": "text"},
+ {
+ "jq_schema": ".[]",
+ "content_key": ".text",
+ "is_content_key_jq_parsable": True,
+ },
+ ),
+)
+def test_load_json_with_jq_parsable_content_key(
+ params: Dict, mocker: MockerFixture
+) -> None:
+ file_path = "/workspaces/langchain/test.json"
+ expected_docs = [
+ Document(
+ page_content="value1",
+ metadata={"source": file_path, "seq_num": 1},
+ ),
+ Document(
+ page_content="value2",
+ metadata={"source": file_path, "seq_num": 2},
+ ),
+ ]
+
+ mocker.patch(
+ "pathlib.Path.open",
+ return_value=io.StringIO(
+ """
+ [{"text": "value1"}, {"text": "value2"}]
+ """
+ ),
+ )
+
+ loader = JSONLoader(file_path=file_path, json_lines=True, **params)
+ result = loader.load()
+
+ assert result == expected_docs
+
+
+def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) -> None:
+ file_path = "/workspaces/langchain/test.json"
+ expected_docs = [
+ Document(
+ page_content="message1",
+ metadata={"source": file_path, "seq_num": 1},
+ ),
+ Document(
+ page_content="message2",
+ metadata={"source": file_path, "seq_num": 2},
+ ),
+ ]
+
+ mocker.patch(
+ "pathlib.Path.open",
+ return_value=io.StringIO(
+ """
+ {"data": [
+ {"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},
+ {"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}
+ """
+ ),
+ )
+
+ loader = JSONLoader(
+ file_path=file_path,
+ jq_schema=".data[]",
+ content_key=".attributes.message",
+ is_content_key_jq_parsable=True,
+ )
+ result = loader.load()
+
+ assert result == expected_docs
+
+
+def test_load_json_with_nested_jq_parsable_content_key_with_metadata(
+ mocker: MockerFixture,
+) -> None:
+ file_path = "/workspaces/langchain/test.json"
+ expected_docs = [
+ Document(
+ page_content="message1",
+ metadata={"source": file_path, "seq_num": 1, "id": "1", "tags": ["tag1"]},
+ ),
+ Document(
+ page_content="message2",
+ metadata={"source": file_path, "seq_num": 2, "id": "2", "tags": ["tag2"]},
+ ),
+ ]
+
+ mocker.patch(
+ "pathlib.Path.open",
+ return_value=io.StringIO(
+ """
+ {"data": [
+ {"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},
+ {"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}
+ """
+ ),
+ )
+
+ def _metadata_func(record: dict, metadata: dict) -> dict:
+ metadata["id"] = record.get("id")
+ metadata["tags"] = record["attributes"].get("tags")
+ return metadata
+
+ loader = JSONLoader(
+ file_path=file_path,
+ jq_schema=".data[]",
+ content_key=".attributes.message",
+ is_content_key_jq_parsable=True,
+ metadata_func=_metadata_func,
+ )
+ result = loader.load()
+
+ assert result == expected_docs