Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Core: Implement find api #48

Merged
merged 2 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 104 additions & 8 deletions tosfs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,93 @@ def walk(
path, maxdepth=maxdepth, topdown=topdown, on_error=on_error, **kwargs
)

def find(
    self,
    path: str,
    maxdepth: Optional[int] = None,
    withdirs: bool = False,
    detail: bool = False,
    prefix: str = "",
    **kwargs: Any,
) -> Union[List[str], dict]:
    """Find all files or dirs with conditions.

    Like posix ``find`` command without conditions

    Parameters
    ----------
    path : str
        The path to search.
    maxdepth: int, optional
        If not None, the maximum number of levels to descend. Cannot be
        combined with ``prefix``.
    withdirs: bool
        Whether to include directory paths in the output. This is True
        when used by glob, but users usually only want files.
    detail: bool
        If True, return a dict with file information, else just the path
    prefix: str
        Only return files that match ``^{path}/{prefix}`` (if there is an
        exact match ``filename == {path}/{prefix}``, it also will be included)
    **kwargs: Any
        Additional arguments, forwarded to the parent class ``find`` (when
        ``maxdepth`` is given) or to ``_find_file_dir`` otherwise.

    Returns
    -------
    list of str or dict
        The names of the matching entries, or, when ``detail`` is True, a
        dict mapping each name to its info dict.

    Raises
    ------
    ValueError
        If ``path`` is empty, ``"*"`` or a bare protocol prefix, if no bucket
        can be extracted from ``path``, or if both ``maxdepth`` and ``prefix``
        are supplied.

    """
    # ``protocol`` may be a single string or a tuple of strings; normalise it
    # so a plain string is not iterated character by character.
    protocols = (self.protocol,) if isinstance(self.protocol, str) else self.protocol
    if path in ["", "*"] + [f"{p}://" for p in protocols]:
        raise ValueError(f"Cannot access all of TOS via path {path}.")

    path = self._strip_protocol(path)
    bucket, key, _ = self._split_path(path)
    if not bucket:
        raise ValueError("Cannot access all of TOS without specify a bucket.")

    if maxdepth and prefix:
        raise ValueError(
            "Can not specify 'prefix' option alongside 'maxdepth' options."
        )
    if maxdepth:
        # Depth-limited search is delegated to the generic parent-class
        # implementation.
        return super().find(
            bucket + "/" + key,
            maxdepth=maxdepth,
            withdirs=withdirs,
            detail=detail,
            **kwargs,
        )

    out = self._find_file_dir(key, path, prefix, withdirs, kwargs)

    if detail:
        return {o["name"]: o for o in out}
    return [o["name"] for o in out]

def _find_file_dir(
    self, key: str, path: str, prefix: str, withdirs: bool, kwargs: Any
) -> List[dict]:
    """List the entries under ``path`` for ``find``.

    Parameters
    ----------
    key : str
        The object-key part of ``path``; when empty, the single-object
        fallback below is skipped.
    path : str
        The path to list (protocol already stripped by the caller).
    prefix : str
        Prefix filter passed through to ``_lsdir``.
    withdirs : bool
        If True, directory entries are kept and a synthetic directory entry
        is added for the parent of each listed object; if False, only
        entries whose ``type`` is ``"file"`` are returned.
    kwargs : Any
        Mapping of extra keyword arguments expanded into the ``_lsdir`` call.

    Returns
    -------
    list of dict
        Info dicts; sorted by ``name`` when ``withdirs`` is True.

    """
    out = self._lsdir(
        path, delimiter="", include_self=True, prefix=prefix, **kwargs
    )
    if not out and key:
        # Nothing listed under the path: it may be a single object instead.
        try:
            out = [self.info(path)]
        except FileNotFoundError:
            out = []

    if not withdirs:
        # Parent directories are only needed when directories are requested.
        return [o for o in out if o["type"] == "file"]

    dirs = []
    seen_parents = set()
    for o in out:
        par = self._parent(o["name"])
        # Several objects can share one parent; emit each directory once so
        # the non-detail result of ``find`` has no duplicate names.
        if par in seen_parents:
            continue
        seen_parents.add(par)
        # Skip parents shorter than ``path``, i.e. above the search root.
        if len(path) <= len(par):
            dirs.append(
                {
                    "Key": self._split_path(par)[1],
                    "Size": 0,
                    "name": par,
                    "type": "directory",
                }
            )
    return sorted(out + dirs, key=lambda x: x["name"])

def _open_remote_file(
self,
bucket: str,
Expand Down Expand Up @@ -1059,6 +1146,7 @@ def _lsdir(
max_items: int = 1000,
delimiter: str = "/",
prefix: str = "",
include_self: bool = False,
versions: bool = False,
) -> List[dict]:
"""List objects in a directory.
Expand All @@ -1073,6 +1161,8 @@ def _lsdir(
The delimiter to use for grouping objects (default is '/').
prefix : str, optional
The prefix to use for filtering objects (default is '').
include_self : bool, optional
Whether to include the directory itself in the listing (default is False).
versions : bool, optional
Whether to list object versions (default is False).

Expand Down Expand Up @@ -1107,12 +1197,15 @@ def _lsdir(
max_items=max_items,
delimiter=delimiter,
prefix=prefix,
include_self=include_self,
versions=versions,
):
if isinstance(obj, CommonPrefixInfo):
dirs.append(self._fill_common_prefix_info(obj, bucket))
dirs.append(self._fill_dir_info(bucket, obj))
elif obj.key.endswith("/"):
dirs.append(self._fill_dir_info(bucket, None, obj.key))
else:
files.append(self._fill_object_info(obj, bucket, versions))
files.append(self._fill_file_info(obj, bucket, versions))
files += dirs

return files
Expand All @@ -1123,6 +1216,7 @@ def _listdir(
max_items: int = 1000,
delimiter: str = "/",
prefix: str = "",
include_self: bool = False,
versions: bool = False,
) -> List[Union[CommonPrefixInfo, ListedObject, ListedObjectVersion]]:
"""List objects in a bucket.
Expand All @@ -1137,6 +1231,8 @@ def _listdir(
The delimiter to use for grouping objects (default is '/').
prefix : str, optional
The prefix to use for filtering objects (default is '').
include_self : bool, optional
Whether to include the bucket itself in the listing (default is False).
versions : bool, optional
Whether to list object versions (default is False).

Expand Down Expand Up @@ -1194,7 +1290,7 @@ def _listdir(
resp = self.tos_client.list_objects_type2(
bucket,
prefix,
start_after=prefix,
start_after=prefix if not include_self else None,
delimiter=delimiter,
max_keys=max_items,
continuation_token=continuation_token,
Expand Down Expand Up @@ -1255,8 +1351,10 @@ def _split_path(self, path: str) -> Tuple[str, str, Optional[str]]:
)

@staticmethod
def _fill_common_prefix_info(common_prefix: CommonPrefixInfo, bucket: str) -> dict:
name = "/".join([bucket, common_prefix.prefix[:-1]])
def _fill_dir_info(
bucket: str, common_prefix: Optional[CommonPrefixInfo], key: str = ""
) -> dict:
name = "/".join([bucket, common_prefix.prefix[:-1] if common_prefix else key])
return {
"name": name,
"Key": name,
Expand All @@ -1265,9 +1363,7 @@ def _fill_common_prefix_info(common_prefix: CommonPrefixInfo, bucket: str) -> di
}

@staticmethod
def _fill_object_info(
obj: ListedObject, bucket: str, versions: bool = False
) -> dict:
def _fill_file_info(obj: ListedObject, bucket: str, versions: bool = False) -> dict:
result = {
"Key": f"{bucket}/{obj.key}",
"size": obj.size,
Expand Down
71 changes: 71 additions & 0 deletions tosfs/tests/test_tosfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,77 @@ def test_walk(tosfs: TosFileSystem, bucket: str, temporary_workspace: str) -> No
tosfs.rmdir(f"{bucket}/{temporary_workspace}")


def test_find(tosfs: TosFileSystem, bucket: str, temporary_workspace: str) -> None:
    """Exercise ``find``: argument validation, prefix filtering and depth limit."""
    # ``match`` is a regular expression, so "." and "*" are escaped to make
    # the assertions check the literal error messages.
    with pytest.raises(ValueError, match=r"Cannot access all of TOS via path \."):
        tosfs.find("")

    with pytest.raises(ValueError, match=r"Cannot access all of TOS via path \*\."):
        tosfs.find("*")

    with pytest.raises(
        ValueError, match=r"Cannot access all of TOS via path tos://\."
    ):
        tosfs.find("tos://")

    with pytest.raises(
        ValueError, match=r"Cannot access all of TOS without specify a bucket\."
    ):
        tosfs.find("/")

    assert len(tosfs.find(bucket, maxdepth=1)) > 0

    with pytest.raises(
        ValueError,
        match=r"Can not specify 'prefix' option alongside 'maxdepth' options\.",
    ):
        tosfs.find(bucket, maxdepth=1, withdirs=True, prefix=temporary_workspace)

    # Without ``withdirs`` only files are returned, so an empty workspace and
    # a random, non-existent prefix both yield nothing.
    result = tosfs.find(bucket, prefix=temporary_workspace)
    assert len(result) == 0

    result = tosfs.find(bucket, prefix=random_str())
    assert len(result) == 0

    # With ``withdirs`` the bucket and the workspace directory are expected.
    result = tosfs.find(
        bucket, prefix=temporary_workspace + "/", withdirs=True, detail=True
    )
    assert len(result) == len([bucket, f"{bucket}/{temporary_workspace}/"])
    assert (
        result[f"{bucket}/{temporary_workspace}/"]["name"]
        == f"{bucket}/{temporary_workspace}/"
    )
    assert result[f"{bucket}/{temporary_workspace}/"]["type"] == "directory"

    result = tosfs.find(
        f"{bucket}/{temporary_workspace}", withdirs=True, maxdepth=1, detail=True
    )
    assert len(result) == 1

    dir_name = random_str()
    sub_dir_name = random_str()
    file_name = random_str()
    sub_file_name = random_str()

    # Directories alone must not show up when ``withdirs`` is False.
    tosfs.makedirs(f"{bucket}/{temporary_workspace}/{dir_name}/{sub_dir_name}")
    result = tosfs.find(
        f"{bucket}/{temporary_workspace}", prefix=dir_name, withdirs=False
    )
    assert len(result) == 0

    tosfs.touch(f"{bucket}/{temporary_workspace}/{dir_name}/{file_name}")
    assert tosfs.exists(f"{bucket}/{temporary_workspace}/{dir_name}/{file_name}")
    result = tosfs.find(
        f"{bucket}/{temporary_workspace}/{dir_name}", prefix=file_name, withdirs=False
    )
    assert len(result) == 1

    # Clean up everything created above.
    # NOTE(review): ``sub_file_name`` is never created in this test, so this
    # call presumably relies on ``rm_file`` tolerating a missing object -
    # confirm.
    tosfs.rm_file(
        f"{bucket}/{temporary_workspace}/{dir_name}/{sub_dir_name}/{sub_file_name}"
    )
    tosfs.rmdir(f"{bucket}/{temporary_workspace}/{dir_name}/{sub_dir_name}")
    tosfs.rm_file(f"{bucket}/{temporary_workspace}/{dir_name}/{file_name}")
    tosfs.rmdir(f"{bucket}/{temporary_workspace}/{dir_name}")
    tosfs.rmdir(f"{bucket}/{temporary_workspace}")


###########################################################
# File operation tests #
###########################################################
Expand Down