Merge pull request #99 from mobiusklein/feature/polish_cli_and_docs

Polish CLI
bigbio · Feb 9, 2025 · 92d7b54 · 92d7b54
2 parents 653c385 + fa45437
commit 92d7b54
Show file tree

Hide file tree

Showing 9 changed files with 424 additions and 255 deletions.
diff --git a/ibaqpy/commands/correct_batches.py b/ibaqpy/commands/correct_batches.py
@@ -15,6 +15,10 @@
 from ibaqpy.ibaq.utils import apply_batch_correction
 
 
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+
+
 def is_valid_sample_id(
     samples: Union[str, list, pd.Series], sample_id_pattern: str = SAMPLE_ID_REGEX
 ) -> bool:
@@ -26,12 +30,17 @@ def is_valid_sample_id(
     If any sample ID does not match the pattern, it prints the invalid IDs and returns False.
     Otherwise, it returns True.
 
-    Parameters:
-        samples (Union[str, list, pd.Series]): The sample ID(s) to validate.
-        sample_id_pattern (str): The regex pattern to validate the sample IDs against. Defaults to SAMPLE_ID_REGEX.
-
-    Returns:
-        bool: True if all sample IDs are valid, False otherwise.
+    Parameters
+    ----------
+    samples : Union[str, list, pd.Series]
+        The sample ID(s) to validate.
+    sample_id_pattern : str, optional
+        The regex pattern to validate the sample IDs against. Defaults to the ``SAMPLE_ID_REGEX`` constant.
+
+    Returns
+    -------
+    bool
+        True if all sample IDs are valid, False otherwise.
     """
     sample_pattern = re.compile(sample_id_pattern)
 
@@ -54,21 +63,27 @@ def is_valid_sample_id(
 
 def get_batch_id_from_sample_names(samples: list) -> list:
     """
-    Extracts batch IDs from a list of sample names.
+    Extract batch IDs from a list of sample names.
 
     Each sample name is expected to have a batch ID as a prefix, separated by a hyphen.
     The function validates that the batch ID consists of alphanumeric characters only.
     Returns a list of unique batch IDs as integer factors.
 
-    Parameters:
-        samples (list): A list of sample names, each containing a batch ID prefix.
-
-    Returns:
-        list: A list of integer factors representing unique batch IDs.
-
-    Raises:
-        ValueError: If a sample name does not contain a valid batch ID prefix or if the
-                    batch ID contains non-alphanumeric characters.
+    Parameters
+    ----------
+    samples : list
+        A list of sample names, each containing a batch ID prefix.
+
+    Returns
+    -------
+    list
+        A list of integer factors representing unique batch IDs.
+
+    Raises
+    ------
+    ValueError
+        If a sample name does not contain a valid batch ID prefix or if the
+        batch ID contains non-alphanumeric characters.
     """
     batch_ids = []
     for sample in samples:
@@ -102,28 +117,44 @@ def run_batch_correction(
     This function combines multiple TSV files, reshapes the data, validates sample IDs,
     applies batch correction, and optionally exports the results to an AnnData object.
 
-    Parameters:
-        folder (str): Directory containing the TSV files.
-        pattern (str): Pattern to match files in the directory.
-        comment (str): Character indicating the start of a comment line in the TSV files.
-        sep (str): Delimiter for reading the TSV files.
-        output (str): File path to save the corrected iBAQ values.
-        sample_id_column (str): Column name for sample IDs. Defaults to SAMPLE_ID.
-        protein_id_column (str): Column name for protein IDs. Defaults to PROTEIN_NAME.
-        ibaq_raw_column (str): Column name for raw iBAQ values. Defaults to IBAQ.
-        ibaq_corrected_column (str): Column name for corrected iBAQ values. Defaults to IBAQ_BEC.
-        export_anndata (bool): Whether to export the data to an AnnData object. Defaults to False.
-
-    Returns:
-        pd.DataFrame: DataFrame containing the original and corrected iBAQ values.
-
-    Raises:
-        ValueError: If input files cannot be loaded, sample IDs are invalid, or output file cannot be saved.
-        FileNotFoundError: If the output file does not exist when exporting to AnnData.
+    Parameters
+    ----------
+    folder : str
+        Directory containing the TSV files.
+    pattern : str
+        Pattern to match files in the directory.
+    comment : str
+        Character indicating the start of a comment line in the TSV files.
+    sep : str
+        Delimiter for reading the TSV files.
+    output : str
+        File path to save the corrected iBAQ values.
+    sample_id_column : str, optional
+        Column name for sample IDs. Defaults to the ``SAMPLE_ID`` constant.
+    protein_id_column : str, optional
+        Column name for protein IDs. Defaults to the ``PROTEIN_NAME`` constant.
+    ibaq_raw_column : str, optional
+        Column name for raw iBAQ values. Defaults to the ``IBAQ`` constant.
+    ibaq_corrected_column : str, optional
+        Column name for corrected iBAQ values. Defaults to the ``IBAQ_BEC`` constant.
+    export_anndata : bool, optional
+        Whether to export the data to an AnnData object. Defaults to False.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the original and corrected iBAQ values.
+
+    Raises
+    ------
+    ValueError
+        If input files cannot be loaded, sample IDs are invalid, or output file cannot be saved.
+    FileNotFoundError
+        If the output file does not exist when exporting to AnnData.
     """
 
     # Load the data
-    logging.info(f"Loading iBAQ data from TSV files in folder '{folder}'")
+    logger.info(f"Loading iBAQ data from TSV files in folder '{folder}'")
 
     try:
         df_ibaq = combine_ibaq_tsv_files(folder, pattern=pattern, comment=comment, sep=sep)
@@ -147,7 +178,7 @@ def run_batch_correction(
     batch_ids = get_batch_id_from_sample_names(df_wide.columns)
 
     # Run batch correction
-    logging.info("Applying batch correction to iBAQ values")
+    logger.info("Applying batch correction to iBAQ values")
     df_corrected = apply_batch_correction(df_wide, list(batch_ids), kwargs={})
 
     # Convert the data back to long format
@@ -174,7 +205,7 @@ def run_batch_correction(
 
     # Export the raw and corrected iBAQ values to an AnnData object
     if export_anndata:
-        logging.info("Exporting raw and corrected iBAQ values to an AnnData object")
+        logger.info("Exporting raw and corrected iBAQ values to an AnnData object")
         output_path = Path(output)
         if not output_path.exists():
             raise FileNotFoundError(f"Output file {output} does not exist!")
@@ -191,7 +222,7 @@ def run_batch_correction(
         except Exception as e:
             raise ValueError(f"Failed to write AnnData object: {e}")
 
-    logging.info("Batch correction completed...")
+    logger.info("Batch correction completed...")
 
     return df_ibaq
 
@@ -267,24 +298,11 @@ def correct_batches(
     export_anndata: bool,
 ):
     """
-    Command-line interface for correcting batch effects in iBAQ data.
+    Correct batch effects in iBAQ data.
 
     This command processes TSV files containing raw iBAQ values, applies batch correction,
     and outputs the corrected values. It supports various options for specifying file patterns,
-    column names, and output formats, including exporting to an AnnData object.
-
-    Parameters:
-        ctx: Click context object.
-        folder (str): Directory containing TSV files with raw iBAQ values.
-        pattern (str): Pattern to match TSV files.
-        comment (str): Comment character for TSV files; lines starting with this character are ignored.
-        sep (str): Separator for TSV files.
-        output (str): Output file name for corrected iBAQ values.
-        sample_id_column (str): Column name for sample IDs.
-        protein_id_column (str): Column name for protein IDs.
-        ibaq_raw_column (str): Column name for raw iBAQ values.
-        ibaq_corrected_column (str): Column name for corrected iBAQ values.
-        export_anndata (bool): Flag to export data to an AnnData object.
+    column names, and output formats, including exporting to an AnnData file.
     """
     run_batch_correction(
         folder=folder,

diff --git a/ibaqpy/commands/features2peptides.py b/ibaqpy/commands/features2peptides.py
@@ -88,24 +88,7 @@ def features2parquet(
     save_parquet: bool,
 ) -> None:
     """
-    Command-line interface for converting feature data to a parquet file with optional
-    normalization and filtering steps.
-
-    Parameters:
-        ctx: Click context object.
-        parquet (str): Path to the input parquet file generated by quantms.io.
-        sdrf (str): Path to the SDRF file generated by quantms.
-        min_aa (int): Minimum number of amino acids to filter peptides.
-        min_unique (int): Minimum number of unique peptides to filter proteins.
-        remove_ids (str): Path to a file with protein IDs to remove from analysis.
-        remove_decoy_contaminants (bool): Flag to remove decoy and contaminant proteins.
-        remove_low_frequency_peptides (bool): Flag to remove low-frequency peptides.
-        output (str): Path to the output file for peptide intensities.
-        skip_normalization (bool): Flag to skip the normalization step.
-        nmethod (str): Method for normalizing feature intensities.
-        pnmethod (str): Method for normalizing peptide intensities.
-        log2 (bool): Flag to apply log2 transformation to peptide intensities.
-        save_parquet (bool): Flag to save normalized peptides to a parquet file.
+    Convert feature data to a parquet file with optional normalization and filtering steps.
     """
 
     peptide_normalization(

diff --git a/ibaqpy/commands/peptides2protein.py b/ibaqpy/commands/peptides2protein.py
@@ -78,28 +78,12 @@ def peptides2protein(
     qc_report: str,
 ) -> None:
     """
-    CLI command to compute IBAQ values for proteins from peptide intensity data.
+    Compute IBAQ values for proteins from peptide intensity data.
 
     This command processes peptide identifications and computes IBAQ values,
     optionally normalizing the data and calculating protein metrics using a
     proteomic ruler approach. It supports generating a QC report with distribution
     plots if verbose mode is enabled.
-
-    Parameters:
-        fasta (str): Path to the protein database file.
-        peptides (str): Path to the peptide intensity file.
-        enzyme (str): Enzyme used for protein digestion (default: Trypsin).
-        normalize (bool): Flag to normalize IBAQ values.
-        min_aa (int): Minimum amino acids to consider a peptide (default: 7).
-        max_aa (int): Maximum amino acids to consider a peptide (default: 30).
-        tpa (bool): Flag to calculate TPA values.
-        ruler (bool): Flag to use the ProteomicRuler approach.
-        organism (str): Source organism of the data (default: human).
-        ploidy (int): Ploidy number (default: 2).
-        cpc (float): Cellular protein concentration in g/L (default: 200).
-        output (str): Path to the output file for IBAQ values.
-        verbose (bool): Flag to print additional information.
-        qc_report (str): Path to the PDF file for QC images (default: QCprofile.pdf).
     """
     peptides_to_protein(
         fasta=fasta,

diff --git a/ibaqpy/ibaq/file_utils.py b/ibaqpy/ibaq/file_utils.py
@@ -1,14 +1,19 @@
 import glob
 import logging
 import warnings
-from typing import List, Optional
+from typing import List, Optional, TYPE_CHECKING
 
-import anndata as an
 import pandas as pd
 
 from ibaqpy.ibaq.ibaqpy_postprocessing import pivot_wider
 
 
+if TYPE_CHECKING:
+    import anndata as an
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+
 def create_anndata(
     df: pd.DataFrame,
     obs_col: str,
@@ -17,7 +22,7 @@ def create_anndata(
     layer_cols: Optional[List[str]] = None,
     obs_metadata_cols: Optional[List[str]] = None,
     var_metadata_cols: Optional[List[str]] = None,
-) -> an.AnnData:
+) -> "an.AnnData":
     """
     Create an AnnData object from a long-format DataFrame.
 
@@ -33,6 +38,9 @@ def create_anndata(
     Returns:
         anndata.AnnData: The constructed AnnData object.
     """
+
+    import anndata as an
+
     if df.empty:
         raise ValueError("Cannot create AnnData object from empty DataFrame")
     # Validate that the required columns exist in the DataFrame.
@@ -101,7 +109,7 @@ def add_metadata(metadata_df: pd.DataFrame, key: str, cols: List[str]) -> pd.Dat
             )
             adata.layers[layer_col] = df_layer.to_numpy()
 
-    logging.info(f"Created AnnData object:\n {adata}")
+    logger.info(f"Created AnnData object:\n {adata}")
 
     return adata
 

diff --git a/ibaqpy/ibaq/ibaqpy_commons.py b/ibaqpy/ibaq/ibaqpy_commons.py
@@ -3,7 +3,9 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
+
 from matplotlib import pyplot as plt
+from matplotlib.figure import Figure
 
 
 PROTEIN_NAME = "ProteinName"
@@ -93,7 +95,7 @@ def plot_distributions(
     title: str = "",
     log2: bool = True,
     width: float = 10,
-) -> matplotlib.pyplot:
+) -> Figure:
     """
     Print the quantile plot for the dataset
     :param dataset: DataFrame
@@ -127,7 +129,7 @@ def plot_box_plot(
     rotation: int = 30,
     title: str = "",
     violin: bool = False,
-) -> matplotlib.pyplot:
+) -> Figure:
     """
     Plot a box plot of two values field and classes field
     :param violin: Also add violin on top of box plot
@@ -174,9 +176,16 @@ def plot_box_plot(
 # Functions needed by Combiner
 def load_sdrf(sdrf_path: str) -> pd.DataFrame:
     """
-    Load sdrf TSV as a dataframe.
-    :param sdrf_path: Path to SDRF TSV.
-    :return:
+    Load SDRF TSV as a dataframe.
+
+    Parameters
+    ----------
+    sdrf_path : str
+        Path to SDRF TSV.
+
+    Returns
+    -------
+    pd.DataFrame
     """
     if not os.path.exists(sdrf_path):
         raise FileNotFoundError(f"{sdrf_path} does not exist!")
@@ -188,16 +197,29 @@ def load_sdrf(sdrf_path: str) -> pd.DataFrame:
 def load_feature(feature_path: str) -> pd.DataFrame:
     """
     Load feature file as a dataframe.
-    :param feature_path: Path to feature file.
-    :return:
+
+    Parameters
+    ----------
+    feature_path : str
+        Path to feature file.
+
+    Returns
+    -------
+    pd.DataFrame
+
+    Raises
+    ------
+    ValueError
+        If the provided file's suffix is not one of the supported formats, "parquet" or "csv".
+
     """
     suffix = os.path.splitext(feature_path)[1][1:]
     if suffix == "parquet":
         return pd.read_parquet(feature_path)
     elif suffix == "csv":
         return pd.read_csv(feature_path)
     else:
-        raise SystemExit(
+        raise ValueError(
             f"{suffix} is not allowed as input, please provide msstats_in or feature parquet."
         )
 
@@ -209,11 +231,15 @@ def is_parquet(path: str) -> bool:
     This function attempts to open the specified file and read its header
     to determine if it matches the Parquet file signature.
 
-    Parameters:
-        path (str): The file path to check.
+    Parameters
+    ----------
+    path : str
+        The file path to check.
 
-    Returns:
-        bool: True if the file is a Parquet file, False otherwise.
+    Returns
+    -------
+    bool
+        True if the file is a Parquet file, False otherwise.
     """
     try:
         with open(path, "rb") as fh: