Skip to content

Commit

Permalink
[DOC] improve documentation of return_id, set return_id to True as default
Browse files Browse the repository at this point in the history

addresses #140
  • Loading branch information
sophiamaedler committed Feb 15, 2025
1 parent 7bd6790 commit c4a19e8
Showing 1 changed file with 15 additions and 8 deletions.
23 changes: 15 additions & 8 deletions src/scportrait/tools/ml/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(
index_list: list[list[int]] | None = None,
select_channel: list[int] | int | None = None,
transform=None,
return_id: bool = False,
return_id: bool = True,
max_level: int = 5,
):
"""
Expand All @@ -34,7 +34,9 @@ def __init__(
index_list: List of cell indices to select from the dataset. If set to None all cells are taken. Default is None.
select_channel: Specify a specific channel or selection of channels to select from the data. Default is None, which returns all channels. Using this operation is more efficient than if this selection occurs via a passed transform.
transform: An optional user-defined function to apply transformations to the data. Default is None.
return_id: Whether to return the index of the cell with the data. Default is False.
return_id: Whether to return the unique cell-id of the cell along with the data. Default is `True`.
For training purposes this can be set to `False`, but for dataset inference it is generally recommended to set this to `True`,
otherwise you can no longer identify which source cell produced a specific result.
max_level: Maximum levels of directory to search for hdf5 files in the passed paths. Default is 5.
"""
self.dir_list = dir_list
Expand Down Expand Up @@ -383,9 +385,12 @@ class HDF5SingleCellDataset(_HDF5SingleCellDataset):
dir_labels: List of bulk labels applied to all cells within each dataset in `dir_list`.
index_list: List of indices to select from the dataset. If `None`, all cells are included. Default is `None`.
select_channel: Specific channel or list of channels to retrieve from the data. Default is `None`, which returns all channels.
This is more efficient than performing selection via a transform function.
This is more efficient than performing selection via a transform function as the data is never read in the first place.
transform: User-defined function to apply transformations to the data. Default is `None`.
return_id: Whether to return the unique cell-id of the cell along with the data. Default is `False`.
return_id: Whether to return the unique cell-id of the cell along with the data. Default is `True`.
For training purposes this can be set to `False`, but for dataset inference it is generally recommended to set this to `True`, otherwise
you can no longer identify which source cell produced a specific result.
max_level (int, optional):
Maximum number of directory levels to search for HDF5 files within the provided paths. Default is `5`.
Expand Down Expand Up @@ -417,7 +422,7 @@ def __init__(
index_list: list[list[int]] | None = None, # list of indices to select from the index
transform=None,
max_level: int = 5,
return_id: bool = False,
return_id: bool = True,
select_channel: int | list[int] | None = None,
):
super().__init__(
Expand Down Expand Up @@ -452,10 +457,12 @@ class LabelledHDF5SingleCellDataset(_HDF5SingleCellDataset):
label_column_transform: Optional function to apply a mathematical transformation to the read labels.
For example, if the labels are stored as seconds in the HDF5 dataset, set this value to `lambda x: x / 3600` to return labels in hours.
index_list: List of indices to select from the dataset. If `None`, all cells are included. Default is `None`.
select_channel: Specific channel or selection of channels to retrieve from the data. Default is `None`, which returns all channels.
This is more efficient than performing selection via a transform function.
select_channel: Specific channel or list of channels to retrieve from the data. Default is `None`, which returns all channels.
This is more efficient than performing selection via a transform function as the data is never read in the first place.
transform: Optional user-defined function to apply transformations to the data. Default is `None`.
return_id: Whether to return the unique cell-id of the cell along with the data. Default is `False`.
return_id: Whether to return the unique cell-id of the cell along with the data. Default is `True`.
For training purposes this can be set to `False`, but for dataset inference it is generally recommended to set this to `True`, otherwise
you can no longer identify which source cell produced a specific result.
max_level: Maximum number of directory levels to search for HDF5 files within the provided paths. Default is `5`.
Methods:
Expand Down Expand Up @@ -488,7 +495,7 @@ def __init__(
index_list: list[list[int]] | None = None, # list of indices to select from the index
transform: Callable | None = None,
max_level: int = 5,
return_id: bool = False,
return_id: bool = True,
select_channel: list[int] | None | int = None,
):
super().__init__(
Expand Down

0 comments on commit c4a19e8

Please sign in to comment.