diff --git a/docs/docs/modules/data_connection/document_loaders/json.mdx b/docs/docs/modules/data_connection/document_loaders/json.mdx index 93e85612a76d0..40275dc2c91a7 100644 --- a/docs/docs/modules/data_connection/document_loaders/json.mdx +++ b/docs/docs/modules/data_connection/document_loaders/json.mdx @@ -199,6 +199,58 @@ pprint(data) +### JSON file with jq schema `content_key` + +To load documents from a JSON file using a jq expression as the `content_key`, set `is_content_key_jq_parsable=True`. +Ensure that `content_key` is a valid jq expression; it is evaluated against each object produced by `jq_schema`. + +```python +file_path = './sample.json' +pprint(Path(file_path).read_text()) +``` + + + +```json + {"data": [ + {"attributes": { + "message": "message1", + "tags": [ + "tag1"]}, + "id": "1"}, + {"attributes": { + "message": "message2", + "tags": [ + "tag2"]}, + "id": "2"}]} +``` + + + + +```python +loader = JSONLoader( + file_path=file_path, + jq_schema=".data[]", + content_key=".attributes.message", + is_content_key_jq_parsable=True, +) + +data = loader.load() +``` + +```python +pprint(data) +``` + + + +``` + [Document(page_content='message1', metadata={'source': '/path/to/sample.json', 'seq_num': 1}), + Document(page_content='message2', metadata={'source': '/path/to/sample.json', 'seq_num': 2})] +``` + + ## Extracting metadata diff --git a/libs/community/langchain_community/document_loaders/json_loader.py b/libs/community/langchain_community/document_loaders/json_loader.py index ef7be7caaf81e..f828873073d71 100644 --- a/libs/community/langchain_community/document_loaders/json_loader.py +++ b/libs/community/langchain_community/document_loaders/json_loader.py @@ -21,6 +21,7 @@ def __init__( file_path: Union[str, Path], jq_schema: str, content_key: Optional[str] = None, + is_content_key_jq_parsable: Optional[bool] = False, metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None, text_content: bool = True, json_lines: bool = False, @@ -31,8 +32,16 @@ def __init__( file_path (Union[str, 
Path]): The path to the JSON or JSON Lines file. jq_schema (str): The jq schema to use to extract the data or text from the JSON. - content_key (str): The key to use to extract the content from the JSON if - the jq_schema results to a list of objects (dict). + content_key (str): The key to use to extract the content from + the JSON if the jq_schema results in a list of objects (dict). + If is_content_key_jq_parsable is True, this has to be a jq compatible + schema. If is_content_key_jq_parsable is False, this should be a simple + string key. + is_content_key_jq_parsable (bool): A flag to determine if + content_key is parsable by jq or not. If True, content_key is + treated as a jq schema and compiled accordingly. If False or if + content_key is None, content_key is used as a simple string. + Default is False. metadata_func (Callable[Dict, Dict]): A function that takes in the JSON object extracted by the jq_schema and the default metadata and returns a dict of the updated metadata. @@ -43,6 +52,8 @@ def __init__( """ try: import jq # noqa:F401 + + self.jq = jq except ImportError: raise ImportError( "jq package not found, please install it with `pip install jq`" @@ -50,6 +61,7 @@ def __init__( self.file_path = Path(file_path).resolve() self._jq_schema = jq.compile(jq_schema) + self._is_content_key_jq_parsable = is_content_key_jq_parsable self._content_key = content_key self._metadata_func = metadata_func self._text_content = text_content @@ -90,7 +102,11 @@ def _parse(self, content: str, docs: List[Document]) -> None: def _get_text(self, sample: Any) -> str: """Convert sample to string format""" if self._content_key is not None: - content = sample.get(self._content_key) + if self._is_content_key_jq_parsable: + compiled_content_key = self.jq.compile(self._content_key) + content = compiled_content_key.input(sample).first() + else: + content = sample.get(self._content_key) else: content = sample @@ -125,6 +141,7 @@ def _get_metadata( def _validate_content_key(self, data: 
Any) -> None: """Check if a content key is valid""" + sample = data.first() if not isinstance(sample, dict): raise ValueError( @@ -132,11 +149,23 @@ def _validate_content_key(self, data: Any) -> None: so sample must be a dict but got `{type(sample)}`" ) - if sample.get(self._content_key) is None: + if ( + not self._is_content_key_jq_parsable + and sample.get(self._content_key) is None + ): raise ValueError( f"Expected the jq schema to result in a list of objects (dict) \ with the key `{self._content_key}`" ) + # first() decodes the result (None for JSON null); text() is always a str. + if ( + self._is_content_key_jq_parsable + and self.jq.compile(self._content_key).input(sample).first() is None + ): + raise ValueError( + f"Expected the jq schema to result in a list of objects (dict) \ + with the key `{self._content_key}` which should be parsable by jq" + ) def _validate_metadata_func(self, data: Any) -> None: """Check if the metadata_func output is valid""" diff --git a/libs/community/tests/unit_tests/document_loaders/test_json_loader.py b/libs/community/tests/unit_tests/document_loaders/test_json_loader.py index 046fee4507c9f..c4b1df4f2a30f 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_json_loader.py +++ b/libs/community/tests/unit_tests/document_loaders/test_json_loader.py @@ -319,3 +319,123 @@ def metadata_func(record: Dict, metadata: Dict) -> Dict: result = loader.load() assert result == expected_docs + + +@pytest.mark.parametrize( + "params", + ( + {"jq_schema": ".[].text"}, + {"jq_schema": ".[]", "content_key": "text"}, + { + "jq_schema": ".[]", + "content_key": ".text", + "is_content_key_jq_parsable": True, + }, + ), +) +def test_load_json_with_jq_parsable_content_key( + params: Dict, mocker: MockerFixture +) -> None: + file_path = "/workspaces/langchain/test.json" + expected_docs = [ + Document( + page_content="value1", + metadata={"source": file_path, "seq_num": 1}, + ), + Document( + page_content="value2", + metadata={"source": file_path, "seq_num": 2}, + ), + ] + + mocker.patch( + "pathlib.Path.open", 
return_value=io.StringIO( + """ + [{"text": "value1"}, {"text": "value2"}] + """ + ), + ) + + loader = JSONLoader(file_path=file_path, json_lines=True, **params) + result = loader.load() + + assert result == expected_docs + + +def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) -> None: + file_path = "/workspaces/langchain/test.json" + expected_docs = [ + Document( + page_content="message1", + metadata={"source": file_path, "seq_num": 1}, + ), + Document( + page_content="message2", + metadata={"source": file_path, "seq_num": 2}, + ), + ] + + mocker.patch( + "pathlib.Path.open", + return_value=io.StringIO( + """ + {"data": [ + {"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"}, + {"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]} + """ + ), + ) + + loader = JSONLoader( + file_path=file_path, + jq_schema=".data[]", + content_key=".attributes.message", + is_content_key_jq_parsable=True, + ) + result = loader.load() + + assert result == expected_docs + + +def test_load_json_with_nested_jq_parsable_content_key_with_metadata( + mocker: MockerFixture, +) -> None: + file_path = "/workspaces/langchain/test.json" + expected_docs = [ + Document( + page_content="message1", + metadata={"source": file_path, "seq_num": 1, "id": "1", "tags": ["tag1"]}, + ), + Document( + page_content="message2", + metadata={"source": file_path, "seq_num": 2, "id": "2", "tags": ["tag2"]}, + ), + ] + + mocker.patch( + "pathlib.Path.open", + return_value=io.StringIO( + """ + {"data": [ + {"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"}, + {"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]} + """ + ), + ) + + def _metadata_func(record: dict, metadata: dict) -> dict: + metadata["id"] = record.get("id") + metadata["tags"] = record["attributes"].get("tags") + return metadata + + loader = JSONLoader( + file_path=file_path, + jq_schema=".data[]", + content_key=".attributes.message", + 
is_content_key_jq_parsable=True, + metadata_func=_metadata_func, + ) + result = loader.load() + + assert result == expected_docs