Skip to content

Commit

Permalink
Merge pull request #99 from mobiusklein/feature/polish_cli_and_docs
Browse files Browse the repository at this point in the history
Polish CLI
  • Loading branch information
ypriverol authored Feb 9, 2025
2 parents 653c385 + fa45437 commit 92d7b54
Show file tree
Hide file tree
Showing 9 changed files with 424 additions and 255 deletions.
124 changes: 71 additions & 53 deletions ibaqpy/commands/correct_batches.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
from ibaqpy.ibaq.utils import apply_batch_correction


logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def is_valid_sample_id(
samples: Union[str, list, pd.Series], sample_id_pattern: str = SAMPLE_ID_REGEX
) -> bool:
Expand All @@ -26,12 +30,17 @@ def is_valid_sample_id(
If any sample ID does not match the pattern, it prints the invalid IDs and returns False.
Otherwise, it returns True.
Parameters:
samples (Union[str, list, pd.Series]): The sample ID(s) to validate.
sample_id_pattern (str): The regex pattern to validate the sample IDs against. Defaults to SAMPLE_ID_REGEX.
Returns:
bool: True if all sample IDs are valid, False otherwise.
Parameters
----------
samples : Union[str, list, pd.Series]
The sample ID(s) to validate.
sample_id_pattern : str, optional
The regex pattern to validate the sample IDs against. Defaults to the ``SAMPLE_ID_REGEX`` constant.
Returns
-------
bool
True if all sample IDs are valid, False otherwise.
"""
sample_pattern = re.compile(sample_id_pattern)

Expand All @@ -54,21 +63,27 @@ def is_valid_sample_id(

def get_batch_id_from_sample_names(samples: list) -> list:
"""
Extracts batch IDs from a list of sample names.
Extract batch IDs from a list of sample names.
Each sample name is expected to have a batch ID as a prefix, separated by a hyphen.
The function validates that the batch ID consists of alphanumeric characters only.
Returns a list of unique batch IDs as integer factors.
Parameters:
samples (list): A list of sample names, each containing a batch ID prefix.
Returns:
list: A list of integer factors representing unique batch IDs.
Raises:
ValueError: If a sample name does not contain a valid batch ID prefix or if the
batch ID contains non-alphanumeric characters.
Parameters
----------
samples : list
A list of sample names, each containing a batch ID prefix.
Returns
-------
list
A list of integer factors representing unique batch IDs.
Raises
------
ValueError
If a sample name does not contain a valid batch ID prefix or if the
batch ID contains non-alphanumeric characters.
"""
batch_ids = []
for sample in samples:
Expand Down Expand Up @@ -102,28 +117,44 @@ def run_batch_correction(
This function combines multiple TSV files, reshapes the data, validates sample IDs,
applies batch correction, and optionally exports the results to an AnnData object.
Parameters:
folder (str): Directory containing the TSV files.
pattern (str): Pattern to match files in the directory.
comment (str): Character indicating the start of a comment line in the TSV files.
sep (str): Delimiter for reading the TSV files.
output (str): File path to save the corrected iBAQ values.
sample_id_column (str): Column name for sample IDs. Defaults to SAMPLE_ID.
protein_id_column (str): Column name for protein IDs. Defaults to PROTEIN_NAME.
ibaq_raw_column (str): Column name for raw iBAQ values. Defaults to IBAQ.
ibaq_corrected_column (str): Column name for corrected iBAQ values. Defaults to IBAQ_BEC.
export_anndata (bool): Whether to export the data to an AnnData object. Defaults to False.
Returns:
pd.DataFrame: DataFrame containing the original and corrected iBAQ values.
Raises:
ValueError: If input files cannot be loaded, sample IDs are invalid, or output file cannot be saved.
FileNotFoundError: If the output file does not exist when exporting to AnnData.
Parameters
----------
folder : str
Directory containing the TSV files.
pattern : str
Pattern to match files in the directory.
comment : str
Character indicating the start of a comment line in the TSV files.
sep : str
Delimiter for reading the TSV files.
output : str
File path to save the corrected iBAQ values.
sample_id_column : str, optional
Column name for sample IDs. Defaults to the ``SAMPLE_ID`` constant.
protein_id_column : str, optional
Column name for protein IDs. Defaults to the ``PROTEIN_NAME`` constant.
ibaq_raw_column : str, optional
Column name for raw iBAQ values. Defaults to the ``IBAQ`` constant.
ibaq_corrected_column : str, optional
Column name for corrected iBAQ values. Defaults to the ``IBAQ_BEC`` constant.
export_anndata : bool, optional
Whether to export the data to an AnnData object. Defaults to False.
Returns
-------
pd.DataFrame
DataFrame containing the original and corrected iBAQ values.
Raises
------
ValueError
If input files cannot be loaded, sample IDs are invalid, or output file cannot be saved.
FileNotFoundError
If the output file does not exist when exporting to AnnData.
"""

# Load the data
logging.info(f"Loading iBAQ data from TSV files in folder '{folder}'")
logger.info(f"Loading iBAQ data from TSV files in folder '{folder}'")

try:
df_ibaq = combine_ibaq_tsv_files(folder, pattern=pattern, comment=comment, sep=sep)
Expand All @@ -147,7 +178,7 @@ def run_batch_correction(
batch_ids = get_batch_id_from_sample_names(df_wide.columns)

# Run batch correction
logging.info("Applying batch correction to iBAQ values")
logger.info("Applying batch correction to iBAQ values")
df_corrected = apply_batch_correction(df_wide, list(batch_ids), kwargs={})

# Convert the data back to long format
Expand All @@ -174,7 +205,7 @@ def run_batch_correction(

# Export the raw and corrected iBAQ values to an AnnData object
if export_anndata:
logging.info("Exporting raw and corrected iBAQ values to an AnnData object")
logger.info("Exporting raw and corrected iBAQ values to an AnnData object")
output_path = Path(output)
if not output_path.exists():
raise FileNotFoundError(f"Output file {output} does not exist!")
Expand All @@ -191,7 +222,7 @@ def run_batch_correction(
except Exception as e:
raise ValueError(f"Failed to write AnnData object: {e}")

logging.info("Batch correction completed...")
logger.info("Batch correction completed...")

return df_ibaq

Expand Down Expand Up @@ -267,24 +298,11 @@ def correct_batches(
export_anndata: bool,
):
"""
Command-line interface for correcting batch effects in iBAQ data.
Correct batch effects in iBAQ data.
This command processes TSV files containing raw iBAQ values, applies batch correction,
and outputs the corrected values. It supports various options for specifying file patterns,
column names, and output formats, including exporting to an AnnData object.
Parameters:
ctx: Click context object.
folder (str): Directory containing TSV files with raw iBAQ values.
pattern (str): Pattern to match TSV files.
comment (str): Comment character for TSV files; lines starting with this character are ignored.
sep (str): Separator for TSV files.
output (str): Output file name for corrected iBAQ values.
sample_id_column (str): Column name for sample IDs.
protein_id_column (str): Column name for protein IDs.
ibaq_raw_column (str): Column name for raw iBAQ values.
ibaq_corrected_column (str): Column name for corrected iBAQ values.
export_anndata (bool): Flag to export data to an AnnData object.
column names, and output formats, including exporting to an AnnData file.
"""
run_batch_correction(
folder=folder,
Expand Down
19 changes: 1 addition & 18 deletions ibaqpy/commands/features2peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,24 +88,7 @@ def features2parquet(
save_parquet: bool,
) -> None:
"""
Command-line interface for converting feature data to a parquet file with optional
normalization and filtering steps.
Parameters:
ctx: Click context object.
parquet (str): Path to the input parquet file generated by quantms.io.
sdrf (str): Path to the SDRF file generated by quantms.
min_aa (int): Minimum number of amino acids to filter peptides.
min_unique (int): Minimum number of unique peptides to filter proteins.
remove_ids (str): Path to a file with protein IDs to remove from analysis.
remove_decoy_contaminants (bool): Flag to remove decoy and contaminant proteins.
remove_low_frequency_peptides (bool): Flag to remove low-frequency peptides.
output (str): Path to the output file for peptide intensities.
skip_normalization (bool): Flag to skip the normalization step.
nmethod (str): Method for normalizing feature intensities.
pnmethod (str): Method for normalizing peptide intensities.
log2 (bool): Flag to apply log2 transformation to peptide intensities.
save_parquet (bool): Flag to save normalized peptides to a parquet file.
Convert feature data to a parquet file with optional normalization and filtering steps.
"""

peptide_normalization(
Expand Down
18 changes: 1 addition & 17 deletions ibaqpy/commands/peptides2protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,28 +78,12 @@ def peptides2protein(
qc_report: str,
) -> None:
"""
CLI command to compute IBAQ values for proteins from peptide intensity data.
Compute IBAQ values for proteins from peptide intensity data.
This command processes peptide identifications and computes IBAQ values,
optionally normalizing the data and calculating protein metrics using a
proteomic ruler approach. It supports generating a QC report with distribution
plots if verbose mode is enabled.
Parameters:
fasta (str): Path to the protein database file.
peptides (str): Path to the peptide intensity file.
enzyme (str): Enzyme used for protein digestion (default: Trypsin).
normalize (bool): Flag to normalize IBAQ values.
min_aa (int): Minimum amino acids to consider a peptide (default: 7).
max_aa (int): Maximum amino acids to consider a peptide (default: 30).
tpa (bool): Flag to calculate TPA values.
ruler (bool): Flag to use the ProteomicRuler approach.
organism (str): Source organism of the data (default: human).
ploidy (int): Ploidy number (default: 2).
cpc (float): Cellular protein concentration in g/L (default: 200).
output (str): Path to the output file for IBAQ values.
verbose (bool): Flag to print additional information.
qc_report (str): Path to the PDF file for QC images (default: QCprofile.pdf).
"""
peptides_to_protein(
fasta=fasta,
Expand Down
16 changes: 12 additions & 4 deletions ibaqpy/ibaq/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
import glob
import logging
import warnings
from typing import List, Optional
from typing import List, Optional, TYPE_CHECKING

import anndata as an
import pandas as pd

from ibaqpy.ibaq.ibaqpy_postprocessing import pivot_wider


if TYPE_CHECKING:
import anndata as an

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

def create_anndata(
df: pd.DataFrame,
obs_col: str,
Expand All @@ -17,7 +22,7 @@ def create_anndata(
layer_cols: Optional[List[str]] = None,
obs_metadata_cols: Optional[List[str]] = None,
var_metadata_cols: Optional[List[str]] = None,
) -> an.AnnData:
) -> "an.AnnData":
"""
Create an AnnData object from a long-format DataFrame.
Expand All @@ -33,6 +38,9 @@ def create_anndata(
Returns:
anndata.AnnData: The constructed AnnData object.
"""

import anndata as an

if df.empty:
raise ValueError("Cannot create AnnData object from empty DataFrame")
# Validate that the required columns exist in the DataFrame.
Expand Down Expand Up @@ -101,7 +109,7 @@ def add_metadata(metadata_df: pd.DataFrame, key: str, cols: List[str]) -> pd.Dat
)
adata.layers[layer_col] = df_layer.to_numpy()

logging.info(f"Created AnnData object:\n {adata}")
logger.info(f"Created AnnData object:\n {adata}")

return adata

Expand Down
50 changes: 38 additions & 12 deletions ibaqpy/ibaq/ibaqpy_commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from matplotlib.figure import Figure


PROTEIN_NAME = "ProteinName"
Expand Down Expand Up @@ -93,7 +95,7 @@ def plot_distributions(
title: str = "",
log2: bool = True,
width: float = 10,
) -> matplotlib.pyplot:
) -> Figure:
"""
Print the quantile plot for the dataset
:param dataset: DataFrame
Expand Down Expand Up @@ -127,7 +129,7 @@ def plot_box_plot(
rotation: int = 30,
title: str = "",
violin: bool = False,
) -> matplotlib.pyplot:
) -> Figure:
"""
Plot a box plot of two values field and classes field
:param violin: Also add violin on top of box plot
Expand Down Expand Up @@ -174,9 +176,16 @@ def plot_box_plot(
# Functions needed by Combiner
def load_sdrf(sdrf_path: str) -> pd.DataFrame:
"""
Load sdrf TSV as a dataframe.
:param sdrf_path: Path to SDRF TSV.
:return:
Load SDRF TSV as a dataframe.
Parameters
----------
sdrf_path : str
Path to SDRF TSV.
Returns
-------
pd.DataFrame
"""
if not os.path.exists(sdrf_path):
raise FileNotFoundError(f"{sdrf_path} does not exist!")
Expand All @@ -188,16 +197,29 @@ def load_sdrf(sdrf_path: str) -> pd.DataFrame:
def load_feature(feature_path: str) -> pd.DataFrame:
"""
Load feature file as a dataframe.
:param feature_path: Path to feature file.
:return:
Parameters
----------
feature_path : str
Path to feature file.
Returns
-------
pd.DataFrame
Raises
------
ValueError
If the provided file's suffix is not supported; only "parquet" and "csv" are accepted.
"""
suffix = os.path.splitext(feature_path)[1][1:]
if suffix == "parquet":
return pd.read_parquet(feature_path)
elif suffix == "csv":
return pd.read_csv(feature_path)
else:
raise SystemExit(
raise ValueError(
f"{suffix} is not allowed as input, please provide msstats_in or feature parquet."
)

Expand All @@ -209,11 +231,15 @@ def is_parquet(path: str) -> bool:
This function attempts to open the specified file and read its header
to determine if it matches the Parquet file signature.
Parameters:
path (str): The file path to check.
Parameters
----------
path : str
The file path to check.
Returns:
bool: True if the file is a Parquet file, False otherwise.
Returns
-------
bool
True if the file is a Parquet file, False otherwise.
"""
try:
with open(path, "rb") as fh:
Expand Down
Loading

0 comments on commit 92d7b54

Please sign in to comment.