Skip to content

Commit

Permalink
uniref
Browse files Browse the repository at this point in the history
  • Loading branch information
ncfrey committed May 14, 2024
1 parent 21b5b4e commit 7f6a480
Show file tree
Hide file tree
Showing 14 changed files with 768 additions and 4 deletions.
4 changes: 4 additions & 0 deletions src/beignet/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from ._fasta_dataset import FASTADataset
from ._sequence_dataset import SequenceDataset
from ._sized_sequence_dataset import SizedSequenceDataset
from ._uni_ref_50_dataset import UniRef50Dataset
from ._uni_ref_90_dataset import UniRef90Dataset
from ._uni_ref_100_dataset import UniRef100Dataset
from ._uni_ref_dataset import UniRefDataset

__all__ = [
"FASTADataset",
Expand Down
3 changes: 2 additions & 1 deletion src/beignet/datasets/_fasta_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import numpy

from beignet.datasets import SizedSequenceDataset
from beignet.io import ThreadSafeFile

from ._sized_sequence_dataset import SizedSequenceDataset

T = TypeVar("T")


Expand Down
46 changes: 46 additions & 0 deletions src/beignet/datasets/_uni_ref_100_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from pathlib import Path
from typing import Callable, Optional, Union

from ._uni_ref_dataset import UniRefDataset


class UniRef100Dataset(UniRefDataset):
    """``UniRefDataset`` preconfigured with the ``"uniref100"`` dataset name."""

    def __init__(
        self,
        root: Union[str, Path],
        *,
        index: bool = True,
        download: bool = False,
        transform_fn: Optional[Callable] = None,
        target_transform_fn: Optional[Callable] = None,
    ) -> None:
        """
        Forward every argument to ``UniRefDataset`` with the dataset name and
        checksum pair fixed for UniRef100.

        :param root: Root directory that contains the dataset subdirectory
            or, when :attr:`download` is ``True``, the directory in which
            the subdirectory is created and the dataset is downloaded.
        :param index: If ``True``, cache the sequence indices to disk for
            faster re-initialization (default: ``True``).
        :param download: If ``True``, download the dataset into the
            :attr:`root` directory (default: ``False``). A dataset that is
            already downloaded is not downloaded again.
        :param transform_fn: A ``Callable`` that maps a sequence to a
            transformed sequence (default: ``None``).
        :param target_transform_fn: A ``Callable`` that maps a target (a
            cluster identifier) to a transformed target (default: ``None``).
        """
        dataset_name = "uniref100"

        # NOTE(review): both checksum slots are empty strings, presumably
        # placeholders; confirm how the download helper treats an empty
        # MD5 before relying on ``download=True``.
        checksums = ("", "")

        options = {
            "index": index,
            "download": download,
            "transform_fn": transform_fn,
            "target_transform_fn": target_transform_fn,
        }

        super().__init__(root, dataset_name, checksums, **options)
46 changes: 46 additions & 0 deletions src/beignet/datasets/_uni_ref_50_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from pathlib import Path
from typing import Callable, Optional, Union

from ._uni_ref_dataset import UniRefDataset


class UniRef50Dataset(UniRefDataset):
    """``UniRefDataset`` preconfigured with the ``"uniref50"`` dataset name."""

    def __init__(
        self,
        root: Union[str, Path],
        *,
        index: bool = True,
        download: bool = False,
        transform_fn: Optional[Callable] = None,
        target_transform_fn: Optional[Callable] = None,
    ) -> None:
        """
        Forward every argument to ``UniRefDataset`` with the dataset name and
        checksum pair fixed for UniRef50.

        :param root: Root directory that contains the dataset subdirectory
            or, when :attr:`download` is ``True``, the directory in which
            the subdirectory is created and the dataset is downloaded.
        :param index: If ``True``, cache the sequence indices to disk for
            faster re-initialization (default: ``True``).
        :param download: If ``True``, download the dataset into the
            :attr:`root` directory (default: ``False``). A dataset that is
            already downloaded is not downloaded again.
        :param transform_fn: A ``Callable`` that maps a sequence to a
            transformed sequence (default: ``None``).
        :param target_transform_fn: A ``Callable`` that maps a target (a
            cluster identifier) to a transformed target (default: ``None``).
        """
        dataset_name = "uniref50"

        fasta_md5 = "2b05bd43c14ce0bc0591a017efa648e6"  # uniref50.fasta
        archive_md5 = "cb28f2fc41694ccc009cbbab3e08db98"  # uniref50.fasta.gz

        options = {
            "index": index,
            "download": download,
            "transform_fn": transform_fn,
            "target_transform_fn": target_transform_fn,
        }

        super().__init__(
            root,
            dataset_name,
            (fasta_md5, archive_md5),
            **options,
        )
46 changes: 46 additions & 0 deletions src/beignet/datasets/_uni_ref_90_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from pathlib import Path
from typing import Callable, Optional, Union

from ._uni_ref_dataset import UniRefDataset


class UniRef90Dataset(UniRefDataset):
    """``UniRefDataset`` preconfigured with the ``"uniref90"`` dataset name."""

    def __init__(
        self,
        root: Union[str, Path],
        *,
        index: bool = True,
        download: bool = False,
        transform_fn: Optional[Callable] = None,
        target_transform_fn: Optional[Callable] = None,
    ) -> None:
        """
        Forward every argument to ``UniRefDataset`` with the dataset name and
        checksum pair fixed for UniRef90.

        :param root: Root directory that contains the dataset subdirectory
            or, when :attr:`download` is ``True``, the directory in which
            the subdirectory is created and the dataset is downloaded.
        :param index: If ``True``, cache the sequence indices to disk for
            faster re-initialization (default: ``True``).
        :param download: If ``True``, download the dataset into the
            :attr:`root` directory (default: ``False``). A dataset that is
            already downloaded is not downloaded again.
        :param transform_fn: A ``Callable`` that maps a sequence to a
            transformed sequence (default: ``None``).
        :param target_transform_fn: A ``Callable`` that maps a target (a
            cluster identifier) to a transformed target (default: ``None``).
        """
        dataset_name = "uniref90"

        # NOTE(review): both checksum slots are empty strings, presumably
        # placeholders; confirm how the download helper treats an empty
        # MD5 before relying on ``download=True``.
        checksums = ("", "")

        options = {
            "index": index,
            "download": download,
            "transform_fn": transform_fn,
            "target_transform_fn": target_transform_fn,
        }

        super().__init__(root, dataset_name, checksums, **options)
82 changes: 82 additions & 0 deletions src/beignet/datasets/_uni_ref_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os.path
import re
from pathlib import Path
from typing import Callable, Optional, Union

from beignet.io import download_and_extract_archive

from ._fasta_dataset import FASTADataset


class UniRefDataset(FASTADataset):
    """FASTA dataset whose items are ``(sequence, cluster identifier)`` pairs.

    The cluster identifier is parsed out of each record header with a
    regular expression; optional callables can post-process the sequence
    and the identifier.
    """

    def __init__(
        self,
        root: Union[str, Path],
        name: str,
        md5: tuple[str, str],
        *,
        index: bool = True,
        download: bool = False,
        transform_fn: Optional[Callable] = None,
        target_transform_fn: Optional[Callable] = None,
    ) -> None:
        """
        :param root: Root directory where the dataset subdirectory exists or,
            if :attr:`download` is ``True``, the directory where the dataset
            subdirectory will be created and the dataset downloaded.
        :param name: Dataset name (for example ``"uniref50"``). It is used
            as the subdirectory under :attr:`root`, as the stem of the
            ``{name}.fasta`` file, and inside the download URL.
        :param md5: Pair of MD5 checksums. Only the second entry is used
            here: it is handed to the download helper together with the
            ``{name}.fasta.gz`` archive name. The first entry is not read
            by this class.
        :param index: If ``True``, caches the sequence indices to disk for
            faster re-initialization (default: ``True``).
        :param download: If ``True``, download the dataset to the
            :attr:`root` directory (default: ``False``). If the
            ``{name}.fasta`` file already exists, nothing is downloaded.
        :param transform_fn: A ``Callable`` that maps a sequence to a
            transformed sequence (default: ``None``).
        :param target_transform_fn: ``Callable`` that maps a target (a cluster
            identifier) to a transformed target (default: ``None``).
        """
        root = Path(root)

        directory = root / name

        path = directory / f"{name}.fasta"

        # Only fetch when the extracted FASTA file is missing.
        if download and not path.exists():
            download_and_extract_archive(
                f"http://ftp.uniprot.org/pub/databases/uniprot/uniref/{name}/{name}.fasta.gz",
                str(directory),
                str(directory),
                f"{name}.fasta.gz",
                md5[1],
            )

        # The quantifier before the underscore is lazy on purpose: a greedy
        # ``.+`` binds to the *last* underscore that is followed by an
        # upper-case/digit token and whitespace, so a description such as
        # "Protein ABC_DEF family" would yield "DEF" instead of the
        # identifier that directly follows the "UniRef<N>_" prefix.
        self._pattern = re.compile(r"^UniRef.+?_([A-Z0-9]+)\s.+$")

        super().__init__(
            path,
            index=index,
        )

        self._transform_fn = transform_fn

        self._target_transform_fn = target_transform_fn

    def __getitem__(self, index: int) -> tuple[str, str]:
        """
        Return the ``(sequence, target)`` pair stored at ``index``.

        :param index: Position of the record, passed to ``self.get``.
        :return: The sequence and the cluster identifier parsed from the
            record header, each passed through its transform callable when
            one was supplied.
        :raises ValueError: If the record header does not match the expected
            ``UniRef<N>_<identifier> <description>`` layout.
        """
        header, sequence = self.get(index)

        match = self._pattern.search(header)

        if match is None:
            # Fail with the offending header instead of an opaque
            # AttributeError on ``None.groups()``.
            raise ValueError(
                f"cannot parse a cluster identifier from the header of "
                f"record {index}: {header!r}"
            )

        (target,) = match.groups()

        if self._transform_fn:
            sequence = self._transform_fn(sequence)

        if self._target_transform_fn:
            target = self._target_transform_fn(target)

        return sequence, target
5 changes: 2 additions & 3 deletions src/beignet/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from ._download import download_and_extract_archive
from ._thread_safe_file import ThreadSafeFile

__all__ = [
"ThreadSafeFile",
]
__all__ = ["ThreadSafeFile", "download_and_extract_archive"]
Loading

0 comments on commit 7f6a480

Please sign in to comment.