Skip to content

Commit

Permalink
community: add exclude parameter to DirectoryLoader
Browse files: browse the repository at this point in the history
  • Loading branch information
nejch committed Feb 13, 2024
1 parent 3925071 commit bd784e5
Show file tree
Hide file tree
Showing 5 changed files with 285 additions and 29 deletions.
40 changes: 38 additions & 2 deletions libs/community/langchain_community/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import random
from pathlib import Path
from typing import Any, List, Optional, Type, Union
from typing import Any, List, Optional, Sequence, Type, Union

from langchain_core.documents import Document

Expand Down Expand Up @@ -41,6 +41,7 @@ def __init__(
use_multithreading: bool = False,
max_concurrency: int = 4,
*,
exclude: Union[Sequence[str], str] = (),
sample_size: int = 0,
randomize_sample: bool = False,
sample_seed: Union[int, None] = None,
Expand All @@ -51,6 +52,8 @@ def __init__(
path: Path to directory.
glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
(all files except hidden).
exclude: A pattern or list of patterns to exclude from results.
Use glob syntax.
silent_errors: Whether to silently ignore errors. Defaults to False.
load_hidden: Whether to load hidden files. Defaults to False.
loader_cls: Loader class to use for loading files.
Expand All @@ -64,11 +67,38 @@ def __init__(
directory.
randomize_sample: Shuffle the files to get a random sample.
sample_seed: set the seed of the random shuffle for reproducibility.
Examples:
.. code-block:: python
from langchain_community.document_loaders import DirectoryLoader
# Load all non-hidden files in a directory.
loader = DirectoryLoader("/path/to/directory")
# Load all text files in a directory without recursion.
loader = DirectoryLoader("/path/to/directory", glob="*.txt")
# Recursively load all text files in a directory.
loader = DirectoryLoader(
"/path/to/directory", glob="*.txt", recursive=True
)
# Load all files in a directory, except for py files.
loader = DirectoryLoader("/path/to/directory", exclude="*.py")
# Load all files in a directory, except for py or pyc files.
loader = DirectoryLoader(
"/path/to/directory", exclude=["*.py", "*.pyc"]
)
"""
if loader_kwargs is None:
loader_kwargs = {}
if isinstance(exclude, str):
exclude = (exclude,)
self.path = path
self.glob = glob
self.exclude = exclude
self.load_hidden = load_hidden
self.loader_cls = loader_cls
self.loader_kwargs = loader_kwargs
Expand Down Expand Up @@ -118,7 +148,13 @@ def load(self) -> List[Document]:
raise ValueError(f"Expected directory, got file: '{self.path}'")

docs: List[Document] = []
items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))

paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
items = [
path
for path in paths
if not (self.exclude and any(path.match(glob) for glob in self.exclude))
]

if self.sample_size > 0:
if self.randomize_sample:
Expand Down
217 changes: 213 additions & 4 deletions libs/community/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion libs/community/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ hdbcli = {version = "^2.19.21", optional = true}
oci = {version = "^2.119.1", optional = true}
rdflib = {version = "7.0.0", optional = true}
nvidia-riva-client = {version = "^2.14.0", optional = true}
unstructured = {version = "^0.12.4", optional = true, python = ">=3.9.0,<3.12"}

[tool.poetry.group.test]
optional = true
Expand Down Expand Up @@ -256,7 +257,8 @@ extended_testing = [
"elasticsearch",
"hdbcli",
"oci",
"rdflib"
"rdflib",
"unstructured",
]

[tool.ruff]
Expand Down
Loading

0 comments on commit bd784e5

Please sign in to comment.