community: Added propagation of document metadata from O365BaseLoader #20663
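
This PR teaches `O365BaseLoader._load_from_folder` to record per-file metadata (the SharePoint web URL as `source`, MIME type, created/modified timestamps, author fields, description) and the SharePoint loader's `lazy_load` to copy that metadata onto every parsed `Document`. A minimal usage sketch of the resulting behaviour (not part of the diff; the library id is a placeholder, and O365 credentials are assumed to be configured the way the loader normally requires):

```python
from langchain_community.document_loaders.sharepoint import SharePointLoader

# Hypothetical library id; authentication is handled by the O365 account
# that O365BaseLoader sets up from the environment.
loader = SharePointLoader(document_library_id="YOUR_DOCUMENT_LIBRARY_ID")

for doc in loader.lazy_load():
    # After this change each Document carries the source file's metadata,
    # e.g. source (web_url), mime_type, created, modified, created_by,
    # modified_by and description, in addition to the parsed text.
    print(doc.metadata.get("source"), doc.metadata.get("mime_type"))
```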

Merged
merged 28 commits into master from MacanPN:triska/O365_loader_update
May 23, 2024
Commits (28)
df9ca00
Added propagation of document metadata from O365BaseLoader to FileSys…
MacanPN Apr 19, 2024
4f113b8
in sharepoint propagating all metadata, not just web_url
MacanPN Apr 19, 2024
19d654a
Merge branch 'master' into triska/O365_loader_update
baskaryan Apr 24, 2024
92e1f8c
Merge branch 'master' into triska/O365_loader_update
MacanPN Apr 29, 2024
b47576f
stricter typing on metadata_dict
MacanPN Apr 30, 2024
351d886
Merge branch 'triska/O365_loader_update' of https://github.com/MacanP…
MacanPN Apr 30, 2024
b3da8f0
importing `Any` from `typing` for backwards compatibility with python…
MacanPN Apr 30, 2024
edb6bc5
fixed initiation of metadata_dict
MacanPN Apr 30, 2024
ab0c142
Importing Dict from typing for compatibility with python 3.8
MacanPN Apr 30, 2024
7e83458
sorted imports
MacanPN Apr 30, 2024
3b3fffa
add typing for metadata_dict in base_o365.py
MacanPN Apr 30, 2024
d72d8b4
passing the `web_url` as `source` since that is preserved by parsers
MacanPN Apr 30, 2024
0ae2b14
Merge branch 'master' into triska/O365_loader_update
MacanPN Apr 30, 2024
4b1941d
Merge branch 'master' into triska/O365_loader_update
MacanPN May 1, 2024
d74eca2
Merge branch 'master' into triska/O365_loader_update
MacanPN May 2, 2024
8e080bc
Merge branch 'master' into triska/O365_loader_update
MacanPN May 6, 2024
cb73e8f
Merge branch 'master' into triska/O365_loader_update
MacanPN May 7, 2024
c270c8b
Merge branch 'master' into triska/O365_loader_update
MacanPN May 8, 2024
13dd518
Reverting changes to fily_system loader
MacanPN May 20, 2024
4e5aec2
Merge branch 'triska/O365_loader_update' of https://github.com/MacanP…
MacanPN May 20, 2024
022ca77
modified where/how metadata are preserved
MacanPN May 20, 2024
fc9269c
linting
MacanPN May 20, 2024
9d4c430
ensuring metadata is never "None"
MacanPN May 20, 2024
b2191eb
handling of a case when document.path is not pathlib object
MacanPN May 20, 2024
07aef47
formatting
MacanPN May 20, 2024
fc3884a
x
eyurtsev May 22, 2024
5bcbd74
x
eyurtsev May 22, 2024
a2be6a4
Merge branch 'master' into triska/O365_loader_update
MacanPN May 23, 2024
24 changes: 21 additions & 3 deletions libs/community/langchain_community/document_loaders/base_o365.py
@@ -1,13 +1,14 @@
 """Base class for all loaders that uses O365 Package"""
+
 from __future__ import annotations
 
 import logging
 import os
 import tempfile
 from abc import abstractmethod
 from enum import Enum
-from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
+from pathlib import Path, PurePath
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Sequence, Union
 
 from langchain_core.pydantic_v1 import (
     BaseModel,
@@ -108,14 +109,31 @@ def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
         """
         file_mime_types = self._fetch_mime_types
         items = folder.get_items()
+        metadata_dict: Dict[str, Dict[str, Any]] = {}
         with tempfile.TemporaryDirectory() as temp_dir:
             os.makedirs(os.path.dirname(temp_dir), exist_ok=True)
             for file in items:
                 if file.is_file:
                     if file.mime_type in list(file_mime_types.values()):
                         file.download(to_path=temp_dir, chunk_size=self.chunk_size)
+                        metadata_dict[file.name] = {
+                            "source": file.web_url,
+                            "mime_type": file.mime_type,
+                            "created": file.created,
+                            "modified": file.modified,
+                            "created_by": str(file.created_by),
+                            "modified_by": str(file.modified_by),
+                            "description": file.description,
+                        }
+
             loader = FileSystemBlobLoader(path=temp_dir)
-            yield from loader.yield_blobs()
+            for blob in loader.yield_blobs():
+                if not isinstance(blob.path, PurePath):
+                    raise NotImplementedError("Expected blob path to be a PurePath")
+                if blob.path:
+                    file_metadata_ = metadata_dict.get(str(blob.path), {})
+                    blob.metadata.update(file_metadata_)
+                yield blob
         if self.recursive:
             for subfolder in folder.get_child_folders():
                 yield from self._load_from_folder(subfolder)
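
For illustration, a standalone sketch of the pattern introduced above: per-file metadata is collected while downloading and then attached to the blobs yielded by `FileSystemBlobLoader`. The file name, URL and bytes below are made up; only the `Blob` class from `langchain_community` is real:

```python
from typing import Any, Dict

from langchain_community.document_loaders.blob_loaders import Blob

# Collected while downloading (keyed by file name, as in the diff above).
metadata_dict: Dict[str, Dict[str, Any]] = {
    "report.docx": {
        "source": "https://contoso.sharepoint.com/sites/demo/report.docx",
        "mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
}

# Stand-in for a blob that FileSystemBlobLoader would yield from the temp dir.
blob = Blob.from_data(b"fake file bytes", path="report.docx")

# Same merge as above: look up this blob's metadata and attach it.
blob.metadata.update(metadata_dict.get(str(blob.path), {}))
print(blob.metadata["source"])
```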
libs/community/langchain_community/document_loaders/sharepoint.py
@@ -1,4 +1,5 @@
 """Loader that loads data from Sharepoint Document Library"""
+
 from __future__ import annotations
 
 import json
@@ -82,7 +83,9 @@ def lazy_load(self) -> Iterator[Document]:
             if not isinstance(target_folder, Folder):
                 raise ValueError("Unable to fetch root folder")
             for blob in self._load_from_folder(target_folder):
-                yield from blob_parser.lazy_parse(blob)
+                for blob_part in blob_parser.lazy_parse(blob):
+                    blob_part.metadata.update(blob.metadata)
+                    yield blob_part
 
     def authorized_identities(self) -> List:
         data = self._fetch_access_token()
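
For illustration, a standalone sketch of what this change does: metadata already attached to a blob in `_load_from_folder` is copied onto every `Document` the parser produces from it. `TextParser` stands in for the loader's actual blob parser; the file name, URL and author are made up:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.txt import TextParser

blob = Blob.from_data("hello from SharePoint", path="notes.txt")
blob.metadata.update(
    {
        "source": "https://contoso.sharepoint.com/sites/demo/notes.txt",
        "modified_by": "Jane Doe",
    }
)

for doc in TextParser().lazy_parse(blob):
    doc.metadata.update(blob.metadata)  # the same merge lazy_load now performs
    print(doc.metadata["source"], doc.metadata["modified_by"])
```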