diff --git a/CHANGELOG.md b/CHANGELOG.md index b478024d..4abac9ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#275 BigBio](https://github.com/bigbio/quantms/pull/275) Added support for bruker data. And speed-up to DIA-NN pipeline. + ### `Changed` ### `Fixed` +- Fixed bug where modification masses were not calculated correctly in DIA-NN conversion. + ### `Dependencies` ### `Parameters` diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index ad670051..783c703e 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -18,6 +18,50 @@ custom_logo: "./nf-core-quantms_logo_light.png" custom_logo_url: "https://github.com/bigbio/quantms" custom_logo_title: "quantms" +custom_data: + total_ion_chromatograms: + file_format: "tsv" + section_name: "MS1 TIC" + description: "MS1 total ion chromatograms extracted from the .d files" + plot_type: "linegraph" + pconfig: + id: "ms1_tic" + title: "MS1 TIC" + ylab: "Ion Count" + ymin: 0 + base_peak_chromatograms: + file_format: "tsv" + section_name: "MS1 BPC" + description: "MS1 base peak chromatograms extracted from the .d files" + plot_type: "linegraph" + pconfig: + id: "ms1_bpc" + title: "MS1 BPC" + ylab: "Ion Count" + ymin: 0 + number_of_peaks: + file_format: "tsv" + section_name: "MS1 Peaks" + description: "MS1 Peaks from the .d files" + plot_type: "linegraph" + pconfig: + id: "ms1_peaks" + title: "MS1 Peaks" + ylab: "Peak Count" + ymin: 0 + general_stats: + file_format: "tsv" + section_name: "General Stats" + description: "General stats from the .d files" + plot_type: "table" sp: + total_ion_chromatograms: + fn: "tic_*" + base_peak_chromatograms: + fn: "bpc_*" + number_of_peaks: + fn: "ms1_peaks_*" + general_stats: + fn: "general_stats.tsv" quantms/exp_design: fn: "*_design.tsv" diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 257d5e01..34e31645 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -3,17 +3,31 @@ This script converts the output from DIA-NN into three standard formats: MSstats, Triqler and mzTab. License: Apache 2.0 Authors: Hong Wong, Yasset Perez-Riverol +Revisions: + 2023-Aug-05: J. 
Sebastian Paez """ import logging import os import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, List, Tuple, Dict, Set import click import numpy as np import pandas as pd from pyopenms import AASequence, FASTAFile, ModificationsDB +from pyopenms.Constants import PROTON_MASS_U + +pd.set_option("display.max_rows", 500) +pd.set_option("display.max_columns", 500) +pd.set_option("display.width", 1000) CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +REVISION = "0.1.1" + +logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) +logger = logging.getLogger(__name__) @click.group(context_settings=CONTEXT_SETTINGS) @@ -23,13 +37,14 @@ def cli(): @click.command("convert") @click.option("--folder", "-f") +@click.option("--exp_design", "-d") @click.option("--diann_version", "-v") @click.option("--dia_params", "-p") @click.option("--charge", "-c") @click.option("--missed_cleavages", "-m") @click.option("--qvalue_threshold", "-q", type=float) @click.pass_context -def convert(ctx, folder, dia_params, diann_version, charge, missed_cleavages, qvalue_threshold): +def convert(ctx, folder, exp_design, dia_params, diann_version, charge, missed_cleavages, qvalue_threshold): """ Convert DIA-NN output to MSstats, Triqler or mzTab. The output formats are @@ -39,10 +54,10 @@ def convert(ctx, folder, dia_params, diann_version, charge, missed_cleavages, qv the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence FASTA file, version file of DiaNN and mzml_info TSVs :type folder: str - :param diann_version: Path to a version file of DIA-NN - :type diann_version: str :param dia_params: A list contains DIA parameters :type dia_params: list + :param diann_version: Path to a version file of DIA-NN + :type diann_version: str :param charge: The charge assigned by DIA-NN(max_precursor_charge) :type charge: int :param missed_cleavages: Allowed missed cleavages assigned by DIA-NN @@ -50,191 +65,322 @@ def convert(ctx, folder, dia_params, diann_version, charge, missed_cleavages, qv :param qvalue_threshold: Threshold for filtering q value :type qvalue_threshold: float """ - pathdict = {key: [] for key in ["report", "exp_design", "pg_matrix", "pr_matrix", "fasta", "mzml_info"]} - fileslist = os.listdir(folder) - if not folder.endswith("/"): - folder = folder + "/" - for i in fileslist: - if i.endswith("report.tsv"): - pathdict["report"].append(i) - elif i.endswith("_openms_design.tsv"): - pathdict["exp_design"].append(i) - elif i.endswith("pg_matrix.tsv"): - pathdict["pg_matrix"].append(i) - elif i.endswith("pr_matrix.tsv"): - pathdict["pr_matrix"].append(i) - elif i.endswith(".fasta") or i.endswith(".fa"): - pathdict["fasta"].append(i) - elif i.endswith("mzml_info.tsv"): - pathdict["mzml_info"].append(i) - else: - pass - - for item in pathdict.items(): - if item[0] != "mzml_info" and len(item[1]) > 1: - logging.error(f"{item[0]} is duplicate, check whether the file is redundant or change the file name!") - - diann_report = folder + pathdict["report"][0] - exp_design = folder + pathdict["exp_design"][0] - pg_matrix = folder + pathdict["pg_matrix"][0] - pr_matrix = folder + pathdict["pr_matrix"][0] - fasta = folder + pathdict["fasta"][0] - diann_version_file = diann_version - - with open(diann_version_file) as f: - for line in f: - if "DIA-NN" in line: - diann_version_id = line.rstrip("\n").split(": ")[1] - break + logger.debug(f"Revision {REVISION}") + logger.debug("Reading input files...") + 
diann_directory = DiannDirectory(folder, diann_version_file=diann_version) + report = diann_directory.main_report_df(qvalue_threshold=qvalue_threshold) + s_DataFrame, f_table = get_exp_design_dfs(exp_design) - remain_cols = [ - "File.Name", - "Run", - "Protein.Group", + # Convert to MSstats + msstats_columns_keep = [ "Protein.Names", - "Protein.Ids", - "First.Protein.Description", - "PG.MaxLFQ", - "RT.Start", - "Global.Q.Value", - "Lib.Q.Value", - "PEP", - "Precursor.Normalised", - "Precursor.Id", - "Q.Value", "Modified.Sequence", - "Stripped.Sequence", "Precursor.Charge", "Precursor.Quantity", - "Global.PG.Q.Value", + "File.Name", + "Run", ] - report = pd.read_csv(diann_report, sep="\t", header=0, usecols=remain_cols) - - # filter based on qvalue parameter for downstream analysiss - report = report[report["Q.Value"] < qvalue_threshold] - report["Calculate.Precursor.Mz"] = report.apply( - lambda x: calculate_mz(x["Stripped.Sequence"], x["Precursor.Charge"]), axis=1 - ) - - precursor_list = list(report["Precursor.Id"].unique()) - report["precursor.Index"] = report.apply(lambda x: precursor_list.index(x["Precursor.Id"]), axis=1) - - with open(exp_design, "r") as f: - data = f.readlines() - empty_row = data.index("\n") - f_table = [i.replace("\n", "").split("\t") for i in data[1:empty_row]] - f_header = data[0].replace("\n", "").split("\t") - f_table = pd.DataFrame(f_table, columns=f_header) - f_table.loc[:, "run"] = f_table.apply( - lambda x: os.path.splitext(os.path.basename(x["Spectra_Filepath"]))[0], axis=1 - ) - - s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] - s_header = data[empty_row + 1].replace("\n", "").split("\t") - s_DataFrame = pd.DataFrame(s_table, columns=s_header) - # Convert to MSstats - out_msstats = pd.DataFrame() - out_msstats = report[ - ["Protein.Names", "Modified.Sequence", "Precursor.Charge", "Precursor.Quantity", "File.Name", "Run"] - ] + logger.debug("Converting to MSstats format...") + out_msstats = report[msstats_columns_keep] out_msstats.columns = ["ProteinName", "PeptideSequence", "PrecursorCharge", "Intensity", "Reference", "Run"] out_msstats = out_msstats[out_msstats["Intensity"] != 0] + + # Q: What is this line doing? 
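+    # A (best guess, not verified): it round-trips the DIA-NN modified-sequence string
+    # through pyopenms.AASequence so the peptidoform is re-emitted in OpenMS' canonical
+    # text form, giving the MSstats/Triqler readers a single, consistent notation.
+    # A sketch of the same step with a cached lookup (the uniq_masses/map pattern used
+    # elsewhere in this script) would avoid re-parsing duplicated sequences:
+    #   seq_map = {s: AASequence.fromString(s).toString() for s in out_msstats["PeptideSequence"].unique()}
+    #   out_msstats["PeptideSequence"] = out_msstats["PeptideSequence"].map(seq_map)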
out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply( lambda x: AASequence.fromString(x["PeptideSequence"]).toString(), axis=1 ) - out_msstats.loc[:, "FragmentIon"] = "NA" - out_msstats.loc[:, "ProductCharge"] = "0" - out_msstats.loc[:, "IsotopeLabelType"] = "L" - out_msstats["Reference"] = out_msstats.apply(lambda x: os.path.basename(x["Reference"]), axis=1) - - out_msstats[["Fraction", "BioReplicate", "Condition"]] = out_msstats.apply( - lambda x: query_expdesign_value(x["Run"], f_table, s_DataFrame), axis=1, result_type="expand" + out_msstats["FragmentIon"] = "NA" + out_msstats["ProductCharge"] = "0" + out_msstats["IsotopeLabelType"] = "L" + unique_reference_map = {k: os.path.basename(k) for k in out_msstats["Reference"].unique()} + out_msstats["Reference"] = out_msstats["Reference"].map(unique_reference_map) + del unique_reference_map + + # TODO remove this if not debugging + logger.debug("\n\nReference Column >>>") + logger.debug(out_msstats["Reference"]) + + logger.debug(f"\n\nout_msstats ({out_msstats.shape}) >>>") + logger.debug(out_msstats.head(5)) + + logger.debug(f"\n\nf_table ({f_table.shape})>>>") + logger.debug(f_table.head(5)) + + logger.debug(f"\n\ns_DataFrame ({s_DataFrame.shape})>>>") + logger.debug(s_DataFrame.head(5)) + ## END TODO + + logger.debug("Adding Fraction, BioReplicate, Condition columns") + # Changing implementation from apply to merge went from several minutes to + # ~50ms + tmp = ( + s_DataFrame[["Sample", "MSstats_Condition", "MSstats_BioReplicate"]] + .merge(f_table[["Fraction", "Sample", "run"]], on="Sample") + .rename(columns={"run": "Run", "MSstats_BioReplicate": "BioReplicate", "MSstats_Condition": "Condition"}) + .drop(columns=["Sample"]) + ) + out_msstats = out_msstats.merge( + tmp, + on="Run", + validate="many_to_one", ) - out_msstats.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + "_msstats_in.csv", sep=",", index=False) + del tmp + exp_out_prefix = str(Path(exp_design).stem) + out_msstats.to_csv(exp_out_prefix + "_msstats_in.csv", sep=",", index=False) + logger.info(f"MSstats input file is saved as {exp_out_prefix}_msstats_in.csv") # Convert to Triqler - out_triqler = pd.DataFrame() - out_triqler = out_msstats[["ProteinName", "PeptideSequence", "PrecursorCharge", "Intensity", "Run", "Condition"]] + trinqler_cols = ["ProteinName", "PeptideSequence", "PrecursorCharge", "Intensity", "Run", "Condition"] + out_triqler = out_msstats[trinqler_cols] del out_msstats out_triqler.columns = ["proteins", "peptide", "charge", "intensity", "run", "condition"] out_triqler = out_triqler[out_triqler["intensity"] != 0] out_triqler.loc[:, "searchScore"] = report["Q.Value"] out_triqler.loc[:, "searchScore"] = 1 - out_triqler["searchScore"] - out_triqler.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + "_triqler_in.tsv", sep="\t", index=False) + out_triqler.to_csv(exp_out_prefix + "_triqler_in.tsv", sep="\t", index=False) + logger.info(f"Triqler input file is saved as {exp_out_prefix}_triqler_in.tsv") del out_triqler + mztab_out = f"{str(Path(exp_design).stem)}_out.mzTab" # Convert to mzTab - if diann_version_id == "1.8.1": - fasta_df = pd.DataFrame() + diann_directory.convert_to_mztab( + report=report, + f_table=f_table, + charge=charge, + missed_cleavages=missed_cleavages, + dia_params=dia_params, + out=mztab_out, + ) + + +def _true_stem(x): + """ + Return the true stem of a file name, i.e. the + file name without the extension. 
+ + :param x: The file name + :type x: str + :return: The true stem of the file name + :rtype: str + + Examples: + >>> _true_stem("foo.mzML") + 'foo' + >>> _true_stem("foo.d.tar") + 'foo' + + These examples can be tested with pytest: + $ pytest -v --doctest-modules + """ + split = os.path.basename(x).split(".") + stem = split[0] + + # Should I check here that the extensions are + # allowed? I can see how this would break if the + # file name contains a period. + return stem + + +def get_exp_design_dfs(exp_design_file): + logger.info(f"Reading experimental design file: {exp_design_file}") + with open(exp_design_file, "r") as f: + data = f.readlines() + empty_row = data.index("\n") + f_table = [i.replace("\n", "").split("\t") for i in data[1:empty_row]] + f_header = data[0].replace("\n", "").split("\t") + f_table = pd.DataFrame(f_table, columns=f_header) + f_table.loc[:, "run"] = f_table.apply(lambda x: _true_stem(x["Spectra_Filepath"]), axis=1) + + s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] + s_header = data[empty_row + 1].replace("\n", "").split("\t") + s_DataFrame = pd.DataFrame(s_table, columns=s_header) + + return s_DataFrame, f_table + + +@dataclass +class DiannDirectory: + base_path: os.PathLike + diann_version_file: str + + def __post_init__(self): + self.base_path = Path(self.base_path) + if not self.base_path.exists() and not self.base_path.is_dir(): + raise NotADirectoryError(f"Path {self.base_path} does not exist") + self.diann_version_file = Path(self.diann_version_file) + if not self.diann_version_file.is_file(): + raise FileNotFoundError(f"Path {self.diann_version_file} does not exist") + + def find_suffix_file(self, suffix: str, only_first=True) -> os.PathLike: + """Finds a file with a given suffix in the directory. + + :param suffix: The suffix to search for + :type suffix: str + :param only_first: Whether to return only the first file found, if false returns all, defaults to True + :type only_first: bool, optional + + :raises FileNotFoundError: If no file with the given suffix is found + """ + matching = self.base_path.glob(f"**/*{suffix}") + if only_first: + try: + return next(matching) + except StopIteration: + raise FileNotFoundError(f"Could not find file with suffix {suffix}") + else: + out = list(matching) + if len(out) == 0: + raise FileNotFoundError(f"Could not find file with suffix {suffix}") + else: + return out + + @property + def report(self) -> os.PathLike: + return self.find_suffix_file("report.tsv") + + @property + def pg_matrix(self) -> os.PathLike: + return self.find_suffix_file("pg_matrix.tsv") + + @property + def pr_matrix(self) -> os.PathLike: + return self.find_suffix_file("pr_matrix.tsv") + + @property + def fasta(self) -> os.PathLike: + try: + return self.find_suffix_file(".fasta") + except FileNotFoundError: + return self.find_suffix_file(".fa") + + @property + def mzml_info(self) -> os.PathLike: + return self.find_suffix_file("mzml_info.tsv") + + @property + def validate_diann_version(self) -> str: + logger.debug("Validating DIANN version") + diann_version_id = None + with open(self.diann_version_file) as f: + for line in f: + if "DIA-NN" in line: + logger.debug(f"Found DIA-NN version: {line}") + diann_version_id = line.rstrip("\n").split(": ")[1] + + if diann_version_id is None: + raise ValueError(f"Could not find DIA-NN version in file {self.diann_version_file}") + elif diann_version_id == "1.8.1": + return diann_version_id + else: + # Maybe this error should be detected beforehand to save time ... 
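+            # e.g. (sketch, not part of this change): calling self.validate_diann_version
+            # at the end of __post_init__ would fail fast on an unsupported version,
+            # before the large report TSV is read in main_report_df.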
+ raise ValueError(f"Unsupported DIANN version {self.diann_version}") + + def convert_to_mztab( + self, report, f_table, charge: int, missed_cleavages: int, dia_params: List[Any], out: os.PathLike + ) -> None: + logger.info("Converting to mzTab") + # Convert to mzTab + self.validate_diann_version + + # This could be a branching point if we want to support other versions + # of DIA-NN, maybe something like this: + # if diann_version_id == "1.8.1": + # self.convert_to_mztab_1_8_1(report, f_table, charge, missed_cleavages, dia_params) + # else: + # raise ValueError(f"Unsupported DIANN version {diann_version_id}, supported versions are 1.8.1 ...") + + logger.info(f"Reading fasta file: {self.fasta}") entries = [] f = FASTAFile() - f.load(fasta, entries) - line = 0 - for e in entries: - fasta_df.loc[line, "id"] = e.identifier - fasta_df.loc[line, "seq"] = e.sequence - fasta_df.loc[line, "len"] = len(e.sequence) - line += 1 - - index_ref = f_table - index_ref.loc[:, "ms_run"] = index_ref.apply(lambda x: x["Fraction_Group"], axis=1) - index_ref.loc[:, "study_variable"] = index_ref.apply(lambda x: x["Sample"], axis=1) - index_ref.loc[:, "ms_run"] = index_ref.loc[:, "ms_run"].astype("int") - index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int") - report[["ms_run", "study_variable"]] = report.apply( - lambda x: add_info(x["Run"], index_ref), axis=1, result_type="expand" - ) - - (MTD, database) = mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages) + f.load(str(self.fasta), entries) + fasta_entries = [(e.identifier, e.sequence, len(e.sequence)) for e in entries] + fasta_df = pd.DataFrame(fasta_entries, columns=["id", "seq", "len"]) + + logger.info("Mapping run information to report") + index_ref = f_table.copy() + index_ref.rename(columns={"Fraction_Group": "ms_run", "Sample": "study_variable", "run": "Run"}, inplace=True) + index_ref["ms_run"] = index_ref["ms_run"].astype("int") + index_ref["study_variable"] = index_ref["study_variable"].astype("int") + report = report.merge(index_ref[["ms_run", "Run", "study_variable"]], on="Run", validate="many_to_one") + + (MTD, database) = mztab_MTD(index_ref, dia_params, str(self.fasta), charge, missed_cleavages) pg = pd.read_csv( - pg_matrix, + self.pg_matrix, sep="\t", header=0, ) PRH = mztab_PRH(report, pg, index_ref, database, fasta_df) del pg pr = pd.read_csv( - pr_matrix, + self.pr_matrix, sep="\t", header=0, ) + precursor_list = list(report["Precursor.Id"].unique()) PEH = mztab_PEH(report, pr, precursor_list, index_ref, database) del pr - PSH = mztab_PSH(report, folder, database) + PSH = mztab_PSH(report, str(self.base_path), database) del report MTD.loc["", :] = "" PRH.loc[len(PRH) + 1, :] = "" PEH.loc[len(PEH) + 1, :] = "" - with open(os.path.splitext(os.path.basename(exp_design))[0] + "_out.mzTab", "w", newline="") as f: + with open(out, "w", newline="") as f: MTD.to_csv(f, mode="w", sep="\t", index=False, header=False) PRH.to_csv(f, mode="w", sep="\t", index=False, header=True) PEH.to_csv(f, mode="w", sep="\t", index=False, header=True) PSH.to_csv(f, mode="w", sep="\t", index=False, header=True) + logger.info(f"mzTab file generated successfully! at {out}_out.mzTab") -def query_expdesign_value(reference, f_table, s_table): - """ - By matching the "Run" column in f_table or the "Sample" column in s_table, this function returns a tuple containing Fraction, - BioReplicate and Condition. 
- - :param reference: The value of "Run" column in out_msstats - :type reference: str - :param f_table: A table contains experiment settings(search engine settings etc.) - :type f_table: pandas.core.frame.DataFrame - :param s_table: A table contains experimental design - :type s_table: pandas.core.frame.DataFrame - :return: A tuple contains Fraction, BioReplicate and Condition - :rtype: tuple - """ - query_reference = f_table[f_table["run"] == reference] - Fraction = query_reference["Fraction"].values[0] - row = s_table[s_table["Sample"] == query_reference["Sample"].values[0]] - BioReplicate = row["MSstats_BioReplicate"].values[0] - Condition = row["MSstats_Condition"].values[0] + def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame: + remain_cols = [ + "File.Name", + "Run", + "Protein.Group", + "Protein.Names", + "Protein.Ids", + "First.Protein.Description", + "PG.MaxLFQ", + "RT.Start", + "Global.Q.Value", + "Lib.Q.Value", + "PEP", + "Precursor.Normalised", + "Precursor.Id", + "Q.Value", + "Modified.Sequence", + "Stripped.Sequence", + "Precursor.Charge", + "Precursor.Quantity", + "Global.PG.Q.Value", + ] + report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols) + + # filter based on qvalue parameter for downstream analysiss + logger.debug(f"Filtering report based on qvalue threshold: {qvalue_threshold}, {len(report)} rows") + report = report[report["Q.Value"] < qvalue_threshold] + logger.debug(f"Report filtered, {len(report)} rows remaining") + + logger.debug("Calculating Precursor.Mz") + # Making the map is 10x faster, and includes the mass of + # the modification. with respect to the previous implementation. + uniq_masses = {k: AASequence.fromString(k).getMonoWeight() for k in report["Modified.Sequence"].unique()} + mass_vector = report["Modified.Sequence"].map(uniq_masses) + report["Calculate.Precursor.Mz"] = (mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])) / report[ + "Precursor.Charge" + ] + + logger.debug("Indexing Precursors") + # Making the map is 1500x faster + precursor_index_map = {k: i for i, k in enumerate(report["Precursor.Id"].unique())} + report["precursor.Index"] = report["Precursor.Id"].map(precursor_index_map) - return Fraction, BioReplicate, Condition + logger.debug(f"Shape of main report {report.shape}") + logger.debug(str(report.head())) + + return report def MTD_mod_info(fix_mod, var_mod): @@ -296,6 +442,7 @@ def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages): :return: MTD sub-table :rtype: pandas.core.frame.DataFrame """ + logger.info("Constructing MTD sub-table...") dia_params_list = dia_params.split(";") dia_params_list = ["null" if i == "" else i for i in dia_params_list] FragmentMassTolerance = dia_params_list[0] @@ -402,17 +549,25 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ + logger.info("Constructing PRH sub-table...") + logger.debug( + f"Input report shape: {report.shape}," + f" input pg shape: {pg.shape}," + f" input index_ref shape: {index_ref.shape}," + f" input fasta_df shape: {fasta_df.shape}" + ) file = list(pg.columns[5:]) col = {} for i in file: col[i] = ( - "protein_abundance_assay[" - + str(index_ref[index_ref["run"] == os.path.splitext(os.path.split(i)[1])[0]]["ms_run"].values[0]) - + "]" + "protein_abundance_assay[" + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0]) + "]" ) pg.rename(columns=col, inplace=True) - pg.loc[:, "opt_global_result_type"] = pg.apply(classify_result_type, 
axis=1, result_type="expand") + + logger.debug("Classifying results type ...") + pg["opt_global_result_type"] = "single_protein" + pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = "indistinguishable_protein_group" out_mztab_PRH = pd.DataFrame() out_mztab_PRH = pg.drop(["Protein.Names"], axis=1) @@ -432,6 +587,8 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): ] for i in null_col: out_mztab_PRH.loc[:, i] = "null" + + logger.debug("Extracting accession values (keeping first)...") out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply(lambda x: x["accession"].split(";")[0], axis=1) protein_details_df = out_mztab_PRH[out_mztab_PRH["opt_global_result_type"] == "indistinguishable_protein_group"] @@ -440,46 +597,70 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): protein_details_df = ( protein_details_df.drop("accession", axis=1).join(prh_series).reset_index().drop(columns="index") ) - protein_details_df.loc[:, "opt_global_result_type"] = protein_details_df.apply(lambda x: "protein_details", axis=1) + protein_details_df.loc[:, "col"] = "protein_details" # protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")] out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index(drop=True) - out_mztab_PRH.loc[:, "protein_coverage"] = out_mztab_PRH.apply( - lambda x: calculate_protein_coverage(report, x["accession"], x["Protein.Ids"], fasta_df), - axis=1, - result_type="expand", + logger.debug("Calculating protein coverage (bottleneck)...") + # This is a bottleneck + # reimplementation runs in 67s vs 137s (old) in my data + out_mztab_PRH.loc[:, "protein_coverage"] = calculate_protein_coverages( + report=report, out_mztab_PRH=out_mztab_PRH, fasta_df=fasta_df ) + logger.debug("Getting ambiguity members...") + # IN THEORY this should be the same as + # out_mztab_PRH["ambiguity_members"] = out_mztab_PRH["Protein.Ids"] + # out_mztab_PRH.loc[out_mztab_PRH["opt_global_result_type"] == "single_protein", "ambiguity_members"] = "null" + # or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null" out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply( lambda x: x["Protein.Ids"] if x["opt_global_result_type"] == "indistinguishable_protein_group" else "null", axis=1, ) + logger.debug("Matching PRH to best search engine score...") + score_looker = ModScoreLooker(report) out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = out_mztab_PRH.apply( - lambda x: PRH_match_report(report, x["accession"]), axis=1, result_type="expand" + lambda x: score_looker.get_score(x["Protein.Ids"]), axis=1, result_type="expand" ) + logger.debug("Matching PRH to modifications...") out_mztab_PRH.loc[:, "modifications"] = out_mztab_PRH.apply( lambda x: find_modification(x["modifiedSequence"]), axis=1, result_type="expand" ) + logger.debug("Matching PRH to protein quantification...") ## quantity at protein level: PG.MaxLFQ - max_study_variable = max(index_ref["study_variable"]) - PRH_params = [] - for i in range(1, max_study_variable + 1): - PRH_params.extend( - [ - "protein_abundance_study_variable[" + str(i) + "]", - "protein_abundance_stdev_study_variable[" + str(i) + "]", - "protein_abundance_std_error_study_variable[" + str(i) + "]", - ] - ) - - out_mztab_PRH[PRH_params] = out_mztab_PRH.apply( - lambda x: match_in_report(report, x["accession"], max_study_variable, 1, "protein"), - axis=1, - result_type="expand", + # This used to be a bottleneck in performance + # This 
implementation drops the run time from 57s to 25ms + protein_agg_report = ( + report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]] + .groupby(["study_variable", "Protein.Ids"]) + .agg({"PG.MaxLFQ": ["mean", "std", "sem"]}) + .reset_index() + .pivot(columns=["study_variable"], index="Protein.Ids") + .reset_index() + ) + protein_agg_report.columns = ["::".join([str(s) for s in col]).strip() for col in protein_agg_report.columns.values] + subname_mapper = { + "Protein.Ids::::": "Protein.Ids", + "PG.MaxLFQ::mean": "protein_abundance_study_variable", + "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable", + "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable", + } + name_mapper = name_mapper_builder(subname_mapper) + protein_agg_report.rename(columns=name_mapper, inplace=True) + # out_mztab_PRH has columns accession and Protein.Ids; 'Q9NZJ9', 'A0A024RBG1;Q9NZJ9;Q9NZJ9-2'] + # the report table has 'Protein.Group' and 'Protein.Ids': 'Q9NZJ9', 'A0A024RBG1;Q9NZJ9;Q9NZJ9-2' + # Oddly enough the last implementation mapped the the accession (Q9NZJ9) in the mztab + # to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values. + out_mztab_PRH = out_mztab_PRH.merge( + protein_agg_report, on="Protein.Ids", how="left", validate="many_to_one", copy=True ) + del name_mapper + del subname_mapper + del protein_agg_report + # end of (former) bottleneck out_mztab_PRH.loc[:, "PRH"] = "PRT" index = out_mztab_PRH.loc[:, "PRH"] @@ -491,13 +672,12 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): col for col in out_mztab_PRH.columns if col.startswith("opt_") ] out_mztab_PRH = out_mztab_PRH[new_cols] - - # out_mztab_PRH.to_csv("./out_protein.mztab", sep=",", index=False) - return out_mztab_PRH -def mztab_PEH(report, pr, precursor_list, index_ref, database): +def mztab_PEH( + report: pd.DataFrame, pr: pd.DataFrame, precursor_list: List[str], index_ref: pd.DataFrame, database: os.PathLike +) -> pd.DataFrame: """ Construct PEH sub-table. 
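The protein-level quantification above (and the peptide-level version further below) relies on a groupby -> pivot -> flatten-columns -> rename pattern. The following is a minimal, self-contained sketch with made-up values that mirrors the column handling in this script (with a simplified stand-in for name_mapper_builder), showing how the "::"-joined columns become mzTab-style protein_abundance_study_variable[n] headers:

import re
import pandas as pd

# Toy stand-in for the filtered DIA-NN report (values are made up).
rep = pd.DataFrame(
    {
        "Protein.Ids": ["P1", "P1", "P1", "P1", "P2", "P2", "P2", "P2"],
        "study_variable": [1, 1, 2, 2, 1, 1, 2, 2],
        "PG.MaxLFQ": [10.0, 11.0, 12.0, 13.0, 3.0, 4.0, 5.0, 6.0],
    }
)

agg = (
    rep.groupby(["study_variable", "Protein.Ids"])
    .agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
    .reset_index()
    .pivot(columns=["study_variable"], index="Protein.Ids")
    .reset_index()
)
# Flatten the MultiIndex columns into strings such as "PG.MaxLFQ::mean::1".
agg.columns = ["::".join(str(s) for s in col).strip() for col in agg.columns.values]

# Same renaming idea as name_mapper_builder: substring replace, then "::N" -> "[N]".
subname_mapper = {
    "Protein.Ids::::": "Protein.Ids",
    "PG.MaxLFQ::mean": "protein_abundance_study_variable",
    "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
    "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
}
num_regex = re.compile(r"(.*)::(\d+)$")

def rename_col(name: str) -> str:
    for k, v in subname_mapper.items():
        name = name.replace(k, v)
    return num_regex.sub(r"\1[\2]", name)

agg.rename(columns=rename_col, inplace=True)
print(sorted(agg.columns))
# ['Protein.Ids', 'protein_abundance_std_error_study_variable[1]', ...,
#  'protein_abundance_stdev_study_variable[1]', ..., 'protein_abundance_study_variable[1]', ...]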
@@ -514,6 +694,13 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database): :return: PEH sub-table :rtype: pandas.core.frame.DataFrame """ + logger.info("Constructing PEH sub-table...") + logger.debug( + f"report.shape: {report.shape}, " + f" pr.shape: {pr.shape}," + f" len(precursor_list): {len(precursor_list)}," + f" index_ref.shape: {index_ref.shape}" + ) out_mztab_PEH = pd.DataFrame() out_mztab_PEH = pr.iloc[:, 0:10] out_mztab_PEH.drop( @@ -529,14 +716,17 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database): inplace=True, ) + logger.debug("Finding modifications...") out_mztab_PEH.loc[:, "modifications"] = out_mztab_PEH.apply( lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand" ) + logger.debug("Extracting sequence...") out_mztab_PEH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_PEH.apply( lambda x: AASequence.fromString(x["opt_global_cv_MS:1000889_peptidoform_sequence"]).toString(), axis=1 ) + logger.debug("Checking accession uniqueness...") out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply( lambda x: "0" if ";" in str(x["accession"]) else "1", axis=1, result_type="expand" ) @@ -546,46 +736,78 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database): out_mztab_PEH.loc[:, i] = "null" out_mztab_PEH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" - ## average value of each study_variable - ## quantity at peptide level: Precursor.Normalised - out_mztab_PEH.loc[:, "pr_id"] = out_mztab_PEH.apply( - lambda x: precursor_list.index(x["Precursor.Id"]), axis=1, result_type="expand" + logger.debug("Matching precursor IDs...") + # Pre-calculating the indices and using a lookup table drops run time from + # ~6.5s to 11ms + precursor_indices = {k: i for i, k in enumerate(precursor_list)} + pr_ids = out_mztab_PEH["Precursor.Id"].map(precursor_indices) + out_mztab_PEH["pr_id"] = pr_ids + del precursor_indices + + logger.debug("Getting scores per run") + # This implementation is 422-700x faster than the apply-based one + tmp = ( + report.groupby(["precursor.Index", "ms_run"]) + .agg({"Q.Value": ["min"]}) + .reset_index() + .pivot(columns=["ms_run"], index="precursor.Index") + .reset_index() ) - max_assay = max(index_ref["ms_run"]) - max_study_variable = max(index_ref["study_variable"]) - - ms_run_score = [] - for i in range(1, max_assay + 1): - ms_run_score.append("search_engine_score[1]_ms_run[" + str(i) + "]") - out_mztab_PEH[ms_run_score] = out_mztab_PEH.apply( - lambda x: match_in_report(report, x["pr_id"], max_assay, 0, "pep"), axis=1, result_type="expand" + tmp.columns = ["::".join([str(s) for s in col]).strip() for col in tmp.columns.values] + subname_mapper = { + "precursor.Index::::": "precursor.Index", + "Q.Value::min": "search_engine_score[1]_ms_run", + } + name_mapper = name_mapper_builder(subname_mapper) + tmp.rename(columns=name_mapper, inplace=True) + out_mztab_PEH = out_mztab_PEH.merge( + tmp.rename(columns={"precursor.Index": "pr_id"}), on="pr_id", validate="one_to_one" ) - - PEH_params = [] - for i in range(1, max_study_variable + 1): - PEH_params.extend( - [ - "peptide_abundance_study_variable[" + str(i) + "]", - "peptide_abundance_stdev_study_variable[" + str(i) + "]", - "peptide_abundance_std_error_study_variable[" + str(i) + "]", - "opt_global_mass_to_charge_study_variable[" + str(i) + "]", - "opt_global_retention_time_study_variable[" + str(i) + "]", - ] + del tmp + del subname_mapper + del name_mapper + + logger.debug("Getting peptide abundances per study 
variable") + pep_study_report = per_peptide_study_report(report) + out_mztab_PEH = out_mztab_PEH.merge(pep_study_report, on="pr_id", how="left", validate="one_to_one", copy=True) + del pep_study_report + + logger.debug("Getting peptide properties...") + # Re-implementing this section from apply -> assign to groupby->agg + # speeds up the process from 11s to 25ms in my data (~440x faster) + # Notably, this changes slightly... + # "opt_global_q-value" was the FIRST "Global.Q.Value", now its the min + # "opt_global_SpecEValue_score" was the FIRST "Lib.Q.Value" now its the min + # I believe picking the first is inconsistent because no sorting is checked + # and the first is arbitrary. + + aggtable = ( + report.groupby(["precursor.Index"]) + .agg( + { + "Q.Value": "min", + "RT.Start": "mean", + "Global.Q.Value": "min", + "Lib.Q.Value": "min", + "Calculate.Precursor.Mz": "mean", + } + ) + .reset_index() + .rename( + columns={ + "precursor.Index": "pr_id", + "Q.Value": "best_search_engine_score[1]", + "RT.Start": "retention_time", + "Global.Q.Value": "opt_global_q-value", + "Lib.Q.Value": "opt_global_SpecEValue_score", + "Calculate.Precursor.Mz": "mass_to_charge", + } ) - out_mztab_PEH[PEH_params] = out_mztab_PEH.apply( - lambda x: match_in_report(report, x["pr_id"], max_study_variable, 1, "pep"), axis=1, result_type="expand" ) + del out_mztab_PEH["mass_to_charge"] + out_mztab_PEH = out_mztab_PEH.merge(aggtable, on="pr_id", validate="one_to_one") - out_mztab_PEH[ - [ - "best_search_engine_score[1]", - "retention_time", - "opt_global_q-value", - "opt_global_SpecEValue_score", - "mass_to_charge", - ] - ] = out_mztab_PEH.apply(lambda x: PEH_match_report(report, x["pr_id"]), axis=1, result_type="expand") - + logger.debug("Re-ordering columns...") out_mztab_PEH.loc[:, "PEH"] = "PEP" out_mztab_PEH.loc[:, "database"] = database index = out_mztab_PEH.loc[:, "PEH"] @@ -596,7 +818,6 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database): col for col in out_mztab_PEH.columns if col.startswith("opt_") ] out_mztab_PEH = out_mztab_PEH[new_cols] - # out_mztab_PEH.to_csv("./out_peptide.mztab", sep=",", index=False) return out_mztab_PEH @@ -616,9 +837,29 @@ def mztab_PSH(report, folder, database): :return: PSH sub-table :rtype: pandas.core.frame.DataFrame """ + logger.info("Constructing PSH sub-table") + + def __find_info(dir, n): + # This line matches n="220101_myfile", folder="." to + # "myfolder/220101_myfile_mzml_info.tsv" + files = list(Path(dir).glob(f"*{n}*_info.tsv")) + # Check that it matches one and only one file + if not files: + raise ValueError(f"Could not find {n} info file in {dir}") + if len(files) > 1: + raise ValueError(f"Found multiple {n} info files in {dir}: {files}") + + return files[0] + out_mztab_PSH = pd.DataFrame() for n, group in report.groupby(["Run"]): - file = folder + n + "_mzml_info.tsv" + if isinstance(n, tuple) and len(n) == 1: + # This is here only to support versions of pandas where the groupby + # key is a tuple. 
+ # related: https://github.com/pandas-dev/pandas/pull/51817 + n = n[0] + + file = __find_info(folder, n) target = pd.read_csv(file, sep="\t") group.sort_values(by="RT.Start", inplace=True) target = target[["Retention_Time", "SpectrumID", "Exp_Mass_To_Charge"]] @@ -682,6 +923,7 @@ def mztab_PSH(report, folder, database): for i in null_col: out_mztab_PSH.loc[:, i] = "null" + logger.info("Finding Modifications ...") out_mztab_PSH.loc[:, "modifications"] = out_mztab_PSH.apply( lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand" ) @@ -721,7 +963,7 @@ def add_info(target, index_ref): :return: A tuple contains ms_run and study_variable :rtype: tuple """ - match = index_ref[index_ref["run"] == target] + match = index_ref[index_ref["Run"] == target] ms_run = match["ms_run"].values[0] study_variable = match["study_variable"].values[0] @@ -741,53 +983,6 @@ def classify_result_type(target): return "single_protein" -def calculate_protein_coverage(report, target, reference, fasta_df): - """ - Calculate protein coverage. - - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param target: The value of "accession" column in out_mztab_PRH - :type target: str - :param fasta_df: A dataframe contains protein IDs, sequences and lengths - :type fasta_df: pandas.core.frame.DataFrame - :return: Protein coverage - :rtype: str - """ - peptide_list = report[report["Protein.Ids"] == reference]["Stripped.Sequence"].drop_duplicates().values - unique_peptides = [j for i, j in enumerate(peptide_list) if all(j not in k for k in peptide_list[i + 1 :])] - resultlist = [] - ref = fasta_df[fasta_df["id"].str.contains(target)]["seq"].values[0] - - def findstr(basestr, s, resultlist): - result = re.finditer(s, basestr) - if result: - for i in result: - resultlist.append([i.span()[0], i.span()[1] - 1]) - - return resultlist - - for i in unique_peptides: - resultlist = findstr(ref, i, resultlist) - # Sort and merge the interval list - resultlist.sort() - l, r = 0, 1 - while r < len(resultlist): - x1, y1 = resultlist[l][0], resultlist[l][1] - x2, y2 = resultlist[r][0], resultlist[r][1] - if x2 > y1: - l += 1 - r += 1 - else: - resultlist[l] = [x1, max(y1, y2)] - resultlist.pop(r) - - coverage_length = np.array([i[1] - i[0] + 1 for i in resultlist]).sum() - protein_coverage = format(coverage_length / len(ref), ".3f") - - return protein_coverage - - def match_in_report(report, target, max_, flag, level): """ This function is used to match the columns "ms_run" and "study_variable" from the report and @@ -805,7 +1000,7 @@ def match_in_report(report, target, max_, flag, level): :type level: str :return: A tuple contains multiple messages :rtype: tuple - """ + """ # noqa if flag == 1 and level == "pep": result = report[report["precursor.Index"] == target] PEH_params = [] @@ -834,45 +1029,66 @@ def match_in_report(report, target, max_, flag, level): return tuple(PRH_params) -def PRH_match_report(report, target): +class ModScoreLooker: """ - Returns a tuple contains modified sequences and the score at protein level. + Class used to cache the lookup table of accessions to best scores and their + respective mod sequences. + + Pre-computing the lookup table leverages a lot of speedum and vectortized + operations from pandas, and is much faster than doing the lookup on the fly + in a loop. 
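+    Illustrative usage (values taken from the example rows shown in
+    make_lookup_dict; skipped under doctest because it needs a real report):
+
+    >>> looker = ModScoreLooker(report)                 # doctest: +SKIP
+    >>> looker.get_score("Q8IV63;Q8IV63-2;Q8IV63-3")    # doctest: +SKIP
+    ('LFNEQNFFQR', 0.000252)
+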
:param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame - :param target: The value of "accession" column in report - :type target: str - :return: A tuple contains multiple information to construct PRH sub-table - :rtype: tuple """ - match = report[report["Protein.Ids"] == target] - modSeq = match["Modified.Sequence"].values[0] if match["Modified.Sequence"].values.size > 0 else np.nan - ## Score at protein level: Global.PG.Q.Value (without MBR) - score = match["Global.PG.Q.Value"].min() - return modSeq, score + def __init__(self, report: pd.DataFrame) -> None: + self.lookup_dict = self.make_lookup_dict(report) + def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]: + grouped_df = ( + report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]] + .sort_values("Global.PG.Q.Value", ascending=True) + .groupby(["Protein.Ids"]) + .head(1) + ) + # Modified.Sequence Protein.Ids Global.PG.Q.Value + # 78265 LFNEQNFFQR Q8IV63;Q8IV63-2;Q8IV63-3 0.000252 + # 103585 NPTIVNFPITNVDLR Q53GS9;Q53GS9-2 0.000252 + # 103586 NPTWKPLIR Q7Z4Q2;Q7Z4Q2-2 0.000252 + # 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252 -def PEH_match_report(report, target): - """ - Returns a tuple contains the score at peptide level, retain time, q_score, spec_e and mz. + out = { + row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) for _, row in grouped_df.iterrows() + } + return out - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param target: The value of "pr_id" column in report - :type target: str - :return: A tuple contains multiple information to construct PEH sub-table - :rtype: tuple - """ - match = report[report["precursor.Index"] == target] - ## Score at peptide level: the minimum of the respective precursor q-values (minimum of Q.Value per group) - search_score = match["Q.Value"].min() - time = match["RT.Start"].mean() - q_score = match["Global.Q.Value"].values[0] if match["Global.Q.Value"].values.size > 0 else np.nan - spec_e = match["Lib.Q.Value"].values[0] if match["Lib.Q.Value"].values.size > 0 else np.nan - mz = match["Calculate.Precursor.Mz"].mean() + def get_score(self, protein_id: str) -> float: + """Returns a tuple contains modified sequences and the score at protein level. + + Gets the best score and corresponding peptide for a given protein_id + + Note that protein id can be something like 'Q8IV63;Q8IV63-2;Q8IV63-3' + + Note2: This implementation also fixes a bug where the function would + return the first peptide in the report, not the best one. (but with the + score of the best one for that accession) - return search_score, time, q_score, spec_e, mz + :param protein_id: The value of "accession" column in report + :type target: str + :return: A tuple that contains (best modified sequence, best score) + if the accession is not found, (np.nan, np.nan) is returned. + :rtype: tuple + """ + # Q: in what cases can the accession not exist in the table? + # or an accession not have peptides? 
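+        # A (one plausible case, unverified): PRH rows come from the pg_matrix, while this
+        # lookup is built from the main report after the Q.Value filter in main_report_df,
+        # so a protein group whose precursors were all filtered out has no entry here and
+        # falls back to (np.nan, np.nan).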
+ val = self.lookup_dict.get(protein_id, (np.nan, np.nan)) + return val + + +# Pre-compiling the regex makes the next function 2x faster +# in my benchmarking - JSPP +MODIFICATION_PATTERN = re.compile(r"\((.*?)\)") def find_modification(peptide): @@ -883,12 +1099,17 @@ def find_modification(peptide): :type peptide: str :return: Modification sites :rtype: str + + Examples: + >>> find_modification("PEPM(UNIMOD:35)IDE") + '4-UNIMOD:35' + >>> find_modification("SM(UNIMOD:35)EWEIRDS(UNIMOD:21)EPTIDEK") + '2-UNIMOD:35,9-UNIMOD:21' """ peptide = str(peptide) - pattern = re.compile(r"\((.*?)\)") - original_mods = pattern.findall(peptide) - peptide = re.sub(r"\(.*?\)", ".", peptide) - position = [i.start() for i in re.finditer(r"\.", peptide)] + original_mods = MODIFICATION_PATTERN.findall(peptide) + peptide = MODIFICATION_PATTERN.sub(".", peptide) + position = [i for i, x in enumerate(peptide) if x == "."] for j in range(1, len(position)): position[j] -= j @@ -900,22 +1121,214 @@ def find_modification(peptide): return original_mods -def calculate_mz(seq, charge): +def name_mapper_builder(subname_mapper): + """Returns a function that renames the columns of the grouped table to match the ones + in the final table. + + Examples: + >>> mapping_dict = { + ... "precursor.Index::::": "pr_id", + ... "Precursor.Normalised::mean": "peptide_abundance_study_variable" + ... } + >>> name_mapper = name_mapper_builder(mapping_dict) + >>> name_mapper("precursor.Index::::") + "pr_id" + >>> name_mapper("Precursor.Normalised::mean::1") + "peptide_abundance_study_variable[1]" + """ + num_regex = re.compile(r"(.*)::(\d+)$") + + def name_mapper(x): + """Renames the columns of the grouped table to match the ones + in the final table. + + Examples: + >>> name_mapper("precursor.Index::::") + "pr_id" + >>> name_mapper("Precursor.Normalised::mean::1") + "peptide_abundance_study_variable[1]" + """ + orig_x = x + for k, v in subname_mapper.items(): + if k in x: + x = x.replace(k, v) + out = num_regex.sub(r"\1[\2]", x) + if out == orig_x: + # This should never happen but I am adding it here + # to prevent myself from shoting myself in the foot in the future. + raise ValueError(f"Column name {x} not found in subname_mapper") + return out + + return name_mapper + + +def per_peptide_study_report(report: pd.DataFrame) -> pd.DataFrame: + """Summarizes the report at peptide/study level and flattens the columns. + + This function was implemented to replace an 'apply -> filter' approach. + In my benchmarking it went from 35.23 seconds for 4 samples, 4 conditions to + 0.007 seconds. + + This implementation differs in several aspects in the output values: + 1. in the fact that it actually gets values for the m/z + 2. always returns a float, whilst the apply version returns an 'object' dtype. + 3. The original implementation, missing values had the string 'null', here + they have the value np.nan. + 4. The order of the final output is different; the original orders columns by + study variables > calculated value, this one is calculated value > study variables. + + Calculates the mean, standard deviation and std error of the precursor + abundances, as well as the mean retention time and m/z. + + The names in the end are called "peptide" but thechnically the are at the + precursor level. (peptide+charge combinations). + + The columns will look like this in the end: + [ + 'pr_id', + 'peptide_abundance_study_variable[1]', + ... + 'peptide_abundance_stdev_study_variable[1]', + ... + 'peptide_abundance_std_error_study_variable[1]', + ... 
+ 'opt_global_retention_time_study_variable[1]', + ... + 'opt_global_mass_to_charge_study_variable[1]', + ... + ] """ - Calculate the precursor m/z based on the peptide sequence and charge state. + pep_study_grouped = ( + report.groupby(["study_variable", "precursor.Index"]) + .agg({"Precursor.Normalised": ["mean", "std", "sem"], "RT.Start": ["mean"], "Calculate.Precursor.Mz": ["mean"]}) + .reset_index() + .pivot(columns=["study_variable"], index="precursor.Index") + .reset_index() + ) + pep_study_grouped.columns = ["::".join([str(s) for s in col]).strip() for col in pep_study_grouped.columns.values] + # Columns here would be like: + # [ + # "precursor.Index::::", + # "Precursor.Normalised::mean::1", + # "Precursor.Normalised::mean::2", + # "Precursor.Normalised::std::1", + # "Precursor.Normalised::std::2", + # "Precursor.Normalised::sem::1", + # "Precursor.Normalised::sem::2", + # "RT.Start::mean::1", + # "RT.Start::mean::2", + # ] + # So the right names need to be given and the table can be joined with the other one + subname_mapper = { + "precursor.Index::::": "pr_id", + "Precursor.Normalised::mean": "peptide_abundance_study_variable", + "Precursor.Normalised::std": "peptide_abundance_stdev_study_variable", + "Precursor.Normalised::sem": "peptide_abundance_std_error_study_variable", + "Calculate.Precursor.Mz::mean": "opt_global_mass_to_charge_study_variable", + "RT.Start::mean": "opt_global_retention_time_study_variable", + } + name_mapper = name_mapper_builder(subname_mapper) + + pep_study_grouped.rename( + columns=name_mapper, + inplace=True, + ) - :param seq: Peptide sequence - :type seq: str - :param charge: charge state - :type charge: int - :return: + return pep_study_grouped + + +def calculate_coverage(ref_sequence: str, sequences: Set[str]): """ - ref = "ARNDBCEQZGHILKMFPSTWYV" - seq = "".join([i for i in seq if i in ref]) - if charge == "": - return None - else: - return AASequence.fromString(seq).getMZ(int(charge)) + Calculates the coverage of the reference sequence by the given sequences. + + Examples: + >>> calculate_coverage("WATEROVERTHEDUCKSBACK", {"WATER", "DUCK"}) + 0.45 + >>> calculate_coverage("DUCKDUCKDUCK", {"DUCK"}) + 1.0 + >>> calculate_coverage("WATEROVERTHEDUCK", {"DUCK"}) + 0.25 + """ + starts = [] + lengths = [] + for sequence in sequences: + local_start = 0 + while True: + local_start = ref_sequence.find(sequence, local_start) + if local_start == -1: + break + starts.append(local_start) + lengths.append(len(sequence)) + local_start += 1 + + # merge overlapping intervals + starts, lengths = zip(*sorted(zip(starts, lengths))) + merged_starts = [] + merged_lengths = [] + for start, length in zip(starts, lengths): + if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start: + merged_lengths[-1] = max(merged_starts[-1] + merged_lengths[-1], start + length) - merged_starts[-1] + else: + merged_starts.append(start) + merged_lengths.append(length) + + # calculate coverage + coverage = sum(merged_lengths) / len(ref_sequence) + return coverage + + +def calculate_protein_coverages(report: pd.DataFrame, out_mztab_PRH: pd.DataFrame, fasta_df: pd.DataFrame) -> List[str]: + """Calculates protein coverages for the PRH table. + + The protein coverage is calculated as the fraction of the protein sequence + in the fasta df, covered by the peptides in the report table, for every + protein in the PRH table (defined by accession, not protein.ids). 
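+
+    :param report: Dataframe for the DIA-NN main report; only "Protein.Ids" and "Stripped.Sequence" are used
+    :type report: pandas.core.frame.DataFrame
+    :param out_mztab_PRH: The PRH sub-table under construction; "accession" and "Protein.Ids" are used
+    :type out_mztab_PRH: pandas.core.frame.DataFrame
+    :param fasta_df: A dataframe that contains protein IDs, sequences and lengths
+    :type fasta_df: pandas.core.frame.DataFrame
+    :return: One coverage value per row of out_mztab_PRH, formatted to three decimals, or "null" if no fasta match was found
+    :rtype: list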
+ """ + nested_df = ( + report[["Protein.Ids", "Stripped.Sequence"]] + .groupby("Protein.Ids") + .agg({"Stripped.Sequence": set}) + .reset_index() + ) + # Protein.Ids Stripped.Sequence + # 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR} + # 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER} + # 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,... + ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"])) + acc_to_ids = dict(zip(out_mztab_PRH["accession"], out_mztab_PRH["Protein.Ids"])) + fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"])) + acc_to_fasta_ids = {} + + # Since fasta ids are something like sp|P51451|BLK_HUMAN but + # accessions are something like Q9Y6V7-2, we need to find a + # partial string match between the two (the best one) + for acc in acc_to_ids: + # I am pretty sure this is the slowest part of the code + matches = fasta_df[fasta_df["id"].str.contains(acc)]["id"] + if len(matches) == 0: + logger.warning(f"Could not find fasta id for accession {acc} in the fasta file.") + acc_to_fasta_ids[acc] = None + elif len(matches) == 1: + acc_to_fasta_ids[acc] = matches.iloc[0] + else: + # If multiple, find best match. ej. Pick Q9Y6V7 over Q9Y6V7-2 + # This can be acquired by finding the shortest string, since + # it entails more un-matched characters. + acc_to_fasta_ids[acc] = min(matches, key=len) + + out = [None] * len(out_mztab_PRH["accession"]) + + for i, acc in enumerate(out_mztab_PRH["accession"]): + f_id = acc_to_fasta_ids[acc] + if f_id is None: + out_cov = "null" + else: + cov = calculate_coverage(fasta_id_to_seqs[f_id], ids_to_seqs[acc_to_ids[acc]]) + out_cov = format(cov, ".03f") + + out[i] = out_cov + + return out cli.add_command(convert) diff --git a/bin/dotd_2_mqc.py b/bin/dotd_2_mqc.py new file mode 100755 index 00000000..74dfe93a --- /dev/null +++ b/bin/dotd_2_mqc.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python +GENERAL_HELP = """ +Converts .d files to multiqc compatible files. + +This script can be used either as standalone or as part of a larger workflow +with pmultiqc as a plugin. + +For the standalone usage follow the instructions in the usage section. +If you want to use the output of this script as part of a larger workflow +you will have to modify the `multiqc_config.yaml` file used as te input +for multiqc. Please refer to the multiqc documentation for more information. + +Generates the following files: + - tic_.tsv + - bpc_.tsv + - ms1_peaks_.tsv + - general_stats_.tsv + - dotd_mqc.yml + +Usage: + $ python dotd_2_mqc.py single + $ python dotd_2_mqc.py single + $ python dotd_2_mqc.py aggregate + + # These last steps can also be + $ python dotd_2_mqc.py single + # If the input directory contains multiple .d files. + + $ cd + $ multiqc -c dotd_mqc.yml . +""" + +from typing import List, Tuple # noqa: E402 +import os # noqa: E402 +import sqlite3 # noqa: E402 +import argparse # noqa: E402 +from pathlib import Path # noqa: E402 +from dataclasses import dataclass # noqa: E402 +from logging import getLogger # noqa: E402 +import logging # noqa: E402 + +VERSION = "0.0.2" +logging.basicConfig(level=logging.DEBUG) +logger = getLogger(__name__) + +# The time resulution in seconds. +# Larger values will result in smaller data files as outputs +# and will slightly smooth the data. 5 seconds seems to be +# a good value for qc purposes. +SECOND_RESOLUTION = 5 +# This string is used as a template for the multiqc config file. +# Check the module docstring for more information. 
+MQC_YML = """ +custom_data: + total_ion_chromatograms: + file_format: 'tsv' + section_name: 'MS1 TIC' + description: 'MS1 total ion chromatograms extracted from the .d files' + plot_type: 'linegraph' + pconfig: + id: 'ms1_tic' + title: 'MS1 TIC' + ylab: 'Ion Count' + ymin: 0 + base_peak_chromatograms: + file_format: 'tsv' + section_name: 'MS1 BPC' + description: 'MS1 base peak chromatograms extracted from the .d files' + plot_type: 'linegraph' + pconfig: + id: 'ms1_bpc' + title: 'MS1 BPC' + ylab: 'Ion Count' + ymin: 0 + number_of_peaks: + file_format: 'tsv' + section_name: 'MS1 Peaks' + description: 'MS1 Peaks from the .d files' + plot_type: 'linegraph' + pconfig: + id: 'ms1_peaks' + title: 'MS1 Peaks' + ylab: 'Peak Count' + ymin: 0 + general_stats: + file_format: 'tsv' + section_name: 'General Stats' + description: 'General stats from the .d files' + plot_type: 'table' +sp: + total_ion_chromatograms: + fn: 'tic_*' + base_peak_chromatograms: + fn: 'bpc_*' + number_of_peaks: + fn: 'ms1_peaks_*' + general_stats: + fn: 'general_stats.tsv' +""" + + +@dataclass +class DotDFile: + filepath: os.PathLike + + @property + def sql_filepath(self): + fp = Path(self.filepath) / "analysis.tdf" + return fp + + @property + def basename(self): + return Path(self.filepath).stem + + @property + def ms1_tic(self) -> List[Tuple[float, float]]: + """Gets the MS1 total-ion-chromatogram. + + Returns: + List[Tuple[float, float]]: List of (time, intensity) tuples. + """ + # Note that here I am using min and not mean for purely qc reasons. + # Since the diagnostic aspect here is mainly to see major fluctuations + # in the intensity, and usually these are scans with very low intensity + # due to bubbles or ionization issues, thus the mean would hide that. + cmd = f""" + SELECT MIN(Time), MIN(SummedIntensities) + FROM frames WHERE MsMsType = '0' + GROUP BY CAST(Time / {SECOND_RESOLUTION} AS INTEGER) + ORDER BY Time + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + return out + + @property + def ms1_bpc(self) -> List[Tuple[float, float]]: + """Gets the MS1 base-peak-chromatogram. + + Returns: + List[Tuple[float, float]]: List of (time, intensity) tuples. + """ + cmd = f""" + SELECT MIN(Time), MAX(MaxIntensity) + FROM frames WHERE MsMsType = '0' + GROUP BY CAST(Time / {SECOND_RESOLUTION} AS INTEGER) + ORDER BY Time + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + return out + + @property + def ms1_peaks(self) -> List[Tuple[float, float]]: + """Gets the number of MS1 peaks. + + Returns: + List[Tuple[float, float]]: List of (time, intensity) tuples. + """ + cmd = f""" + SELECT MIN(Time), AVG(NumPeaks) + FROM frames WHERE MsMsType = '0' + GROUP BY CAST(Time / {SECOND_RESOLUTION} AS INTEGER) + ORDER BY Time + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + return out + + def get_acquisition_datetime(self) -> str: + """Gets the acquisition datetime + + Returns + ------- + str + The acquisition datetime in ISO 8601 format. 
+ [('2023-08-06T06:23:19.141-08:00',)] + """ + cmd = "SELECT Value FROM GlobalMetadata WHERE key='AcquisitionDateTime'" + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + if not len(out) == 1: + raise RuntimeError("More than one acquisition datetime found.") + + return out[0][0] + + def get_tot_current(self) -> float: + """Gets the total current from the ms1 scans. + + Returns + ------- + float + The total current. + """ + cmd = """ + SELECT SUM(CAST(SummedIntensities AS FLOAT)) + FROM frames WHERE MsMsType = '0' + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + if not len(out) == 1: + raise RuntimeError("More than one total current found.") + + return out[0][0] + + def get_dia_scan_current(self) -> float: + """Gets the total current from the ms2 scans. + + Returns + ------- + float + The total current. + """ + cmd = """ + SELECT SUM(CAST(SummedIntensities AS FLOAT)) + FROM frames WHERE MsMsType = '9' + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + if not len(out) == 1: + raise RuntimeError("More than one total current found.") + + return out[0][0] + + def get_general_stats(self) -> dict: + """Gets the general stats from the .d file. + + Returns + ------- + dict + A dictionary of general stats. + """ + out = { + "AcquisitionDateTime": self.get_acquisition_datetime(), + "TotalCurrent": self.get_tot_current(), + "DIA_ScanCurrent": self.get_dia_scan_current(), + } + return out + + def write_tables(self, location): + logger.info(f"Writing tables for {self.basename}") + logger.info(f"Writing tables to {location}") + location = Path(location) + location.mkdir(parents=True, exist_ok=True) + tic = self.ms1_tic + bpc = self.ms1_bpc + npeaks = self.ms1_peaks + general_stats = self.get_general_stats() + + tic_path = location / f"tic_{self.basename}.tsv" + bpc_path = location / f"bpc_{self.basename}.tsv" + peaks_location = location / f"ms1_peaks_{self.basename}.tsv" + general_stats_location = location / f"general_stats_{self.basename}.tsv" + + logger.info(f"Writing {tic_path}") + with tic_path.open("w") as f: + for t, i in tic: + f.write(f"{t}\t{i}\n") + + logger.info(f"Writing {bpc_path}") + with bpc_path.open("w") as f: + for t, i in bpc: + f.write(f"{t}\t{i}\n") + + logger.info(f"Writing {peaks_location}") + with peaks_location.open("w") as f: + for t, i in npeaks: + f.write(f"{t}\t{i}\n") + + logger.info(f"Writing {general_stats_location}") + with general_stats_location.open("w") as f: + for k, v in general_stats.items(): + f.write(f"{k}\t{v}\n") + + +def main_single(input_path, output_path): + if input_path.is_dir() and str(input_path).endswith(".d"): + input_files = [input_path] + elif input_path.is_dir(): + input_files = list(input_path.glob("*.d")) + else: + raise RuntimeError(f"Input path {input_path} is not a file or directory.") + + output_path.mkdir(parents=True, exist_ok=True) + + for f in input_files: + d = DotDFile(f) + d.write_tables(output_path) + + logger.info(f"Writing {output_path / 'dotd_mqc.yml'}") + with (output_path / "dotd_mqc.yml").open("w") as f: + f.write(MQC_YML) + + if len(input_files) > 1: + logger.info("Writing aggregate general stats.") + main_aggregate(output_path, output_path) + + logger.info("Done.") + + +def main_aggregate(input_path, output_path): + # Find the general stats files + if not input_path.is_dir(): + logger.error(f"Input path {input_path} is not a 
directory.") + raise ValueError("Input path must be a directory.") + + general_stats_files = list(input_path.glob("general_stats_*.tsv")) + if not general_stats_files: + logger.error(f"No general stats files found in {input_path}.") + raise ValueError("No general stats files found.") + + # Merge them to a single table + # Effectively transposing the columns and adding column called file, + # which contains the file name from which the stats were acquired. + logger.info("Merging general stats files.") + general_stats = [] + for f in general_stats_files: + curr_stats = {"file": f.stem.replace("general_stats_", "")} + with f.open("r") as fh: + for line in fh: + line = line.strip() + if not line: + continue + k, v = line.split("\t") + curr_stats[k] = v + + general_stats.append(curr_stats) + + # Write the general stats file + logger.info("Writing general stats file.") + with (output_path / "general_stats.tsv").open("w") as f: + f.write("\t".join(general_stats[0].keys()) + "\n") + for s in general_stats: + f.write("\t".join(s.values()) + "\n") + + +if __name__ == "__main__": + # create the top-level parser + parser = argparse.ArgumentParser(add_help=True, usage=GENERAL_HELP) + parser.add_argument("--version", action="version", version=f"%(prog)s {VERSION}") + subparsers = parser.add_subparsers(required=True) + + # create the parser for the "single" command + parser_foo = subparsers.add_parser("single") + parser_foo.add_argument("input", help="Input .d file or directory of .d files.") + parser_foo.add_argument("output", help="Output directory.") + parser_foo.set_defaults(func=main_single) + + # create the parser for the "aggregate" command + parser_bar = subparsers.add_parser("aggregate") + parser_bar.add_argument("input", help="Directory that contains the general stats files to aggregate.") + parser_bar.add_argument("output", help="Output directory.") + parser_bar.set_defaults(func=main_aggregate) + + # parse the args and call whatever function was selected + args, unkargs = parser.parse_known_args() + if unkargs: + print(f"Unknown arguments: {unkargs}") + raise RuntimeError("Unknown arguments.") + + input_path = Path(args.input) + output_path = Path(args.output) + + args.func(input_path, output_path) diff --git a/main.nf b/main.nf index 88fd7784..7becd311 100644 --- a/main.nf +++ b/main.nf @@ -23,7 +23,7 @@ include { validateParameters; paramsHelp } from 'plugin/nf-validation' if (params.help) { def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' - def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + def String command = "nextflow run ${workflow.manifest.name} --input input_files.sdrf.tsv --database ~/dbs/human_fasta.fasta -profile docker" log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) System.exit(0) } diff --git a/modules/local/assemble_empirical_library/main.nf b/modules/local/assemble_empirical_library/main.nf index fdb9a467..91c4f0ad 100644 --- a/modules/local/assemble_empirical_library/main.nf +++ b/modules/local/assemble_empirical_library/main.nf @@ -7,7 +7,8 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - path(mzMLs) + // In this step the real files are passed, and not the names + path(ms_files) val(meta) path("quant/*") path(lib) @@ -22,14 +23,25 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { script: def args = task.ext.args ?: '' - mass_acc_ms1 = 
meta.precursor_mass_tolerance_unit == "ppm" ? meta.precursor_mass_tolerance : 5 - mass_acc_ms2 = meta.fragment_mass_tolerance_unit == "ppm" ? meta.fragment_mass_tolerance : 13 + mass_acc_ms1 = meta['precursormasstoleranceunit'].toLowerCase().endsWith('ppm') ? meta['precursormasstolerance'] : 5 + mass_acc_ms2 = meta['fragmentmasstoleranceunit'].toLowerCase().endsWith('ppm') ? meta['fragmentmasstolerance'] : 13 - mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" - scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" + if (params.mass_acc_automatic) { + mass_acc = "--quick-mass-acc --individual-mass-acc" + } else { + mass_acc = "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" + } + scan_window = params.scan_window_automatic ? '--individual-windows' : "--window $params.scan_window" """ - diann --f ${(mzMLs as List).join(' --f ')} \\ + # Precursor Tolerance value was: ${meta['precursormasstolerance']} + # Fragment Tolerance value was: ${meta['fragmentmasstolerance']} + # Precursor Tolerance unit was: ${meta['precursormasstoleranceunit']} + # Fragment Tolerance unit was: ${meta['fragmentmasstoleranceunit']} + + ls -lcth + + diann --f ${(ms_files as List).join(' --f ')} \\ --lib ${lib} \\ --threads ${task.cpus} \\ --out-lib empirical_library.tsv \\ diff --git a/modules/local/decompress_dotd/main.nf b/modules/local/decompress_dotd/main.nf new file mode 100644 index 00000000..f8136c0b --- /dev/null +++ b/modules/local/decompress_dotd/main.nf @@ -0,0 +1,79 @@ + +process DECOMPRESS { + tag "$meta.mzml_id" + label 'process_low' + label 'error_retry' + + conda "conda-forge::gzip=1.12,conda-forge::tar=1.34,conda-forge::bzip2=1.0.8,conda-forge::unzip=6.0,conda-forge::xz=5.2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-796b0610595ad1995b121d0b85375902097b78d4:a3a3220eb9ee55710d743438b2ab9092867c98c6-0' : + 'quay.io/biocontainers/mulled-v2-796b0610595ad1995b121d0b85375902097b78d4:a3a3220eb9ee55710d743438b2ab9092867c98c6-0' }" + + stageInMode { + if (task.attempt == 1) { + if (executor == 'awsbatch') { + 'symlink' + } else { + 'link' + } + } else if (task.attempt == 2) { + if (executor == 'awsbatch') { + 'copy' + } else { + 'symlink' + } + } else { + 'copy' + } + } + + input: + tuple val(meta), path(compressed_file) + + output: + tuple val(meta), path('*.d'), emit: decompressed_files + path 'versions.yml', emit: version + path '*.log', emit: log + + script: + String prefix = task.ext.prefix ?: "${meta.mzml_id}" + + """ + function extract { + if [ -z "\$1" ]; then + echo "Usage: extract ." + else + if [ -f \$1 ]; then + case \$1 in + *.tar.gz) tar xvzf \$1 ;; + *.gz) gunzip \$1 ;; + *.tar) tar xvf \$1 ;; + *.zip) unzip \$1 ;; + *) echo "extract: '\$1' - unknown archive method" ;; + esac + else + echo "\$1 - file does not exist" + fi + fi + } + + tar --help 2>&1 | tee -a ${prefix}_decompression.log + gunzip --help 2>&1 | tee -a ${prefix}_decompression.log + zip --help 2>&1 | tee -a ${prefix}_decompression.log + echo "Unpacking..." 
| tee -a ${compressed_file.baseName}_decompression.log + + extract ${compressed_file} 2>&1 | tee -a ${compressed_file.baseName}_conversion.log + [ -d ${file(compressed_file.baseName).baseName}.d ] && \\ + echo "Found ${file(compressed_file.baseName).baseName}.d" || \\ + mv *.d ${file(compressed_file.baseName).baseName}.d + + ls -l | tee -a ${compressed_file.baseName}_decompression.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(gunzip --help 2>&1 | head -1 | grep -oE "\\d+\\.\\d+(\\.\\d+)?") + tar: \$(tar --help 2>&1 | head -1 | grep -oE "\\d+\\.\\d+(\\.\\d+)?") + zip: \$(zip --help | head -2 | tail -1 | grep -oE "\\d+\\.\\d+") + END_VERSIONS + """ +} diff --git a/modules/local/decompress_dotd/meta.yml b/modules/local/decompress_dotd/meta.yml new file mode 100644 index 00000000..b17737a2 --- /dev/null +++ b/modules/local/decompress_dotd/meta.yml @@ -0,0 +1,45 @@ +name: decompression +description: Decompress .tar/.gz files that contain a .d file/directory +keywords: + - raw + - bruker + - .d +tools: + - tar: + description: | + Generates and extracts archives. + homepage: https://www.gnu.org/software/tar/ + - gunzip: + description: | + Decompresses using zlib. + homepage: https://www.gnu.org/software/gzip/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information + - rawfile: + type: file + description: | + Bruker Raw file archived using tar + pattern: "*.{d.tar,.tar,.gz,.d.tar.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'MD5', enzyme:trypsin ] + - dotd: + type: path + description: Raw Bruker .d file + pattern: "*.d" + - log: + type: file + description: log file + pattern: "*.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@jspaezp" diff --git a/modules/local/diann_preliminary_analysis/main.nf b/modules/local/diann_preliminary_analysis/main.nf index 24441f43..f3844144 100644 --- a/modules/local/diann_preliminary_analysis/main.nf +++ b/modules/local/diann_preliminary_analysis/main.nf @@ -1,5 +1,5 @@ process DIANN_PRELIMINARY_ANALYSIS { - tag "$mzML.baseName" + tag "$ms_file.baseName" label 'process_high' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -7,7 +7,7 @@ process DIANN_PRELIMINARY_ANALYSIS { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - tuple val(meta), file(mzML), file(predict_tsv) + tuple val(meta), path(ms_file), path(predict_library) output: path "*.quant", emit: diann_quant @@ -20,16 +20,27 @@ process DIANN_PRELIMINARY_ANALYSIS { script: def args = task.ext.args ?: '' - mass_acc_ms1 = meta.precursor_mass_tolerance_unit == "ppm" ? meta.precursor_mass_tolerance : 5 - mass_acc_ms2 = meta.fragment_mass_tolerance_unit == "ppm" ? meta.fragment_mass_tolerance : 13 + // I am using here the ["key"] syntax, since the preprocessed meta makes + // was evaluating to null when using the dot notation. + mass_acc_ms1 = meta['precursormasstoleranceunit'].toLowerCase().endsWith('ppm') ? meta['precursormasstolerance'] : 5 + mass_acc_ms2 = meta['fragmentmasstoleranceunit'].toLowerCase().endsWith('ppm') ? meta['fragmentmasstolerance'] : 13 - mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" - scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" - time_corr_only = params.time_corr_only ? 
"--time-corr-only" : "" + if (params.mass_acc_automatic) { + mass_acc = '--quick-mass-acc --individual-mass-acc' + } else { + mass_acc = "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" + } + scan_window = params.scan_window_automatic ? '--individual-windows' : '--window $params.scan_window' + time_corr_only = params.time_corr_only ? '--time-corr-only' : '' """ - diann --lib ${predict_tsv} \\ - --f ${mzML} \\ + # Precursor Tolerance value was: ${meta['precursormasstolerance']} + # Fragment Tolerance value was: ${meta['fragmentmasstolerance']} + # Precursor Tolerance unit was: ${meta['precursormasstoleranceunit']} + # Fragment Tolerance unit was: ${meta['fragmentmasstoleranceunit']} + + diann --lib ${predict_library} \\ + --f ${ms_file} \\ --threads ${task.cpus} \\ --verbose $params.diann_debug \\ ${scan_window} \\ @@ -39,7 +50,7 @@ process DIANN_PRELIMINARY_ANALYSIS { ${mass_acc} \\ ${time_corr_only} \\ $args \\ - 2>&1 | tee ${mzML.baseName}_diann.log + 2>&1 | tee ${ms_file.baseName}_diann.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 36f2ad6f..b9282db5 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -23,6 +23,7 @@ process DIANNCONVERT { path "*msstats_in.csv", emit: out_msstats path "*triqler_in.tsv", emit: out_triqler path "*.mzTab", emit: out_mztab + path "*.log", emit: log path "versions.yml", emit: version exec: @@ -36,6 +37,7 @@ process DIANNCONVERT { """ diann_convert.py convert \\ --folder ./ \\ + --exp_design ${exp_design} \\ --diann_version ./version/versions.yml \\ --dia_params "${dia_params}" \\ --charge $params.max_precursor_charge \\ diff --git a/modules/local/diannsummary/main.nf b/modules/local/diannsummary/main.nf index 438c4414..a4f34981 100644 --- a/modules/local/diannsummary/main.nf +++ b/modules/local/diannsummary/main.nf @@ -7,11 +7,15 @@ process DIANNSUMMARY { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - file(mzMLs) + // Note that the files are passed as names and not paths, this prevents them from being staged + // in the directory + val(ms_files) val(meta) - file(empirical_library) - file("quant/") - file(fasta) + path(empirical_library) + // The quant path is passed, and diann will use the files in the quant directory instead + // of the ones passed in ms_files. + path("quant/") + path(fasta) output: path "diann_report.tsv", emit: main_report @@ -27,17 +31,20 @@ process DIANNSUMMARY { script: def args = task.ext.args ?: '' - mass_acc_ms1 = meta.precursor_mass_tolerance_unit == "ppm" ? meta.precursor_mass_tolerance : 5 - mass_acc_ms2 = meta.fragment_mass_tolerance_unit == "ppm" ? meta.fragment_mass_tolerance : 13 + mass_acc_ms1 = meta["precursormasstoleranceunit"].toLowerCase().endsWith("ppm") ? meta["precursormasstolerance"] : 5 + mass_acc_ms2 = meta["fragmentmasstoleranceunit"].toLowerCase().endsWith("ppm") ? meta["fragmentmasstolerance"] : 13 mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" species_genes = params.species_genes ? "--species-genes": "" """ + # Notes: if .quant files are passed, mzml/.d files are not accessed, so the name needs to be passed but files + # do not need to pe present. 
+ diann --lib ${empirical_library} \\ --fasta ${fasta} \\ - --f ${(mzMLs as List).join(' --f ')} \\ + --f ${(ms_files as List).join(' --f ')} \\ --threads ${task.cpus} \\ --verbose $params.diann_debug \\ ${scan_window} \\ diff --git a/modules/local/dotd_to_mqc/main.nf b/modules/local/dotd_to_mqc/main.nf new file mode 100644 index 00000000..cdf080f7 --- /dev/null +++ b/modules/local/dotd_to_mqc/main.nf @@ -0,0 +1,70 @@ +/* groovylint-disable DuplicateStringLiteral */ +process DOTD2MQC_INDIVIDUAL { + tag "$meta.experiment_id" + label 'process_single' + + conda "base::python=3.10" + conda "conda-forge::python=3.10" + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/python:3.10" + } else { + container "quay.io/biocontainers/python:3.10" + } + + input: + // Note: This step can be optimized by staging only the + // .tdf file inside the .d directory. + // Thus reducing the data transfer of the rest of the .d + // directory. IN PARTICULAR the .tdf.bin + tuple val(meta), path(dot_d_file) + + output: + tuple path("dotd_mqc.yml"), path("*.tsv"), emit: dotd_mqc_data + path "general_stats*.tsv", emit: general_stats + path "versions.yml", emit: version + path "*.log", emit: log + + script: + def prefix = task.ext.prefix ?: "${meta.mzml_id}" + + """ + dotd_2_mqc.py single "${dot_d_file}" \${PWD} \\ + 2>&1 | tee dotd_2_mqc_${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dotd_2_mqc: \$(dotd_2_mqc.py --version | grep -oE "\\d\\.\\d\\.\\d") + dotd_2_mqc_python: \$(python --version | grep -oE "\\d\\.\\d\\.\\d") + END_VERSIONS + """ +} + + +process DOTD2MQC_AGGREGATE { + label 'process_single' + + conda 'base::python=3.10' + container 'continuumio/miniconda3:23.5.2-0-alpine' + + input: + path '*' // tsv files from DOTD2MQC_INDIVIDUAL + + output: + path 'general_stats.tsv', emit: dotd_mqc_data + path 'versions.yml', emit: version + path '*.log', emit: log + + script: + """ + ls -lcth + + dotd_2_mqc.py aggregate \${PWD} \${PWD} \\ + 2>&1 | tee dotd_2_mqc_agg.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dotd_2_mqc: \$(dotd_2_mqc.py --version | grep -oE "\\d\\.\\d\\.\\d") + dotd_2_mqc_python: \$(python --version | grep -oE "\\d\\.\\d\\.\\d") + END_VERSIONS + """ +} diff --git a/modules/local/dotd_to_mqc/meta.yml b/modules/local/dotd_to_mqc/meta.yml new file mode 100644 index 00000000..e69de29b diff --git a/modules/local/generate_diann_cfg/main.nf b/modules/local/generate_diann_cfg/main.nf index e9cb1cde..41df3922 100644 --- a/modules/local/generate_diann_cfg/main.nf +++ b/modules/local/generate_diann_cfg/main.nf @@ -2,9 +2,9 @@ process GENERATE_DIANN_CFG { tag "$meta.experiment_id" label 'process_low' - conda "conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.22" + conda 'conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.22' if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.22--pyhdfd78af_0" + container 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.22--pyhdfd78af_0' } else { container "biocontainers/sdrf-pipelines:0.0.22--pyhdfd78af_0" } @@ -13,9 +13,9 @@ process GENERATE_DIANN_CFG { val(meta) output: - path "diann_config.cfg", emit: diann_cfg - path "versions.yml", emit: version - path "*.log" + path 'diann_config.cfg', emit: diann_cfg + path 'versions.yml', emit: version + path '*.log' script: def args = 
task.ext.args ?: '' diff --git a/modules/local/individual_final_analysis/main.nf b/modules/local/individual_final_analysis/main.nf index ecb9b69f..51eb65bb 100644 --- a/modules/local/individual_final_analysis/main.nf +++ b/modules/local/individual_final_analysis/main.nf @@ -1,5 +1,5 @@ process INDIVIDUAL_FINAL_ANALYSIS { - tag "$mzML.baseName" + tag "$ms_file.baseName" label 'process_high' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -7,7 +7,7 @@ process INDIVIDUAL_FINAL_ANALYSIS { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - tuple val(meta), file(mzML), file(fasta), file(diann_log), file(library) + tuple val(meta), path(ms_file), path(fasta), path(diann_log), path(library) output: path "*.quant", emit: diann_quant @@ -19,8 +19,8 @@ process INDIVIDUAL_FINAL_ANALYSIS { script: def args = task.ext.args ?: '' - mass_acc_ms1 = meta.precursor_mass_tolerance_unit == "ppm" ? meta.precursor_mass_tolerance : 5 - mass_acc_ms2 = meta.fragment_mass_tolerance_unit == "ppm" ? meta.fragment_mass_tolerance : 13 + mass_acc_ms1 = meta["precursormasstoleranceunit"].toLowerCase().endsWith("ppm") ? meta["precursormasstolerance"] : 5 + mass_acc_ms2 = meta["fragmentmasstoleranceunit"].toLowerCase().endsWith("ppm") ? meta["fragmentmasstolerance"] : 13 scan_window = params.scan_window if (params.mass_acc_automatic | params.scan_window_automatic){ @@ -31,20 +31,20 @@ process INDIVIDUAL_FINAL_ANALYSIS { """ diann --lib ${library} \\ - --f ${mzML} \\ + --f ${ms_file} \\ --fasta ${fasta} \\ --threads ${task.cpus} \\ --verbose $params.diann_debug \\ --temp ./ \\ - --mass-acc \$(echo ${mass_acc_ms2}) \\ - --mass-acc-ms1 \$(echo ${mass_acc_ms1}) \\ - --window \$(echo ${scan_window}) \\ + --mass-acc ${mass_acc_ms2} \\ + --mass-acc-ms1 ${mass_acc_ms1} \\ + --window ${scan_window} \\ --no-ifs-removal \\ --no-main-report \\ --relaxed-prot-inf \\ --pg-level $params.pg_level \\ $args \\ - 2>&1 | tee ${mzML.baseName}_final_diann.log + 2>&1 | tee ${ms_file.baseName}_final_diann.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/pmultiqc/main.nf b/modules/local/pmultiqc/main.nf index 9e234214..0d12625e 100644 --- a/modules/local/pmultiqc/main.nf +++ b/modules/local/pmultiqc/main.nf @@ -26,6 +26,16 @@ process PMULTIQC { def disable_idxml_index = (params.enable_pmultiqc) && (params.pmultiqc_idxml_skip) ? 
"--ignored_idxml" : "" """ + set -x + set -e + + # leaving here to ease debugging + ls -lcth * + + echo ">>>>>>>>> Experimental Design <<<<<<<<<" + cat results/*openms_design.tsv + + echo ">>>>>>>>> Running Multiqc <<<<<<<<<" multiqc \\ -f \\ --config ./results/multiqc_config.yml \\ diff --git a/modules/local/sdrfparsing/main.nf b/modules/local/sdrfparsing/main.nf index 0bf4b01d..2f7989bb 100644 --- a/modules/local/sdrfparsing/main.nf +++ b/modules/local/sdrfparsing/main.nf @@ -18,12 +18,23 @@ process SDRFPARSING { script: def args = task.ext.args ?: '' + if (params.convert_dotd) { + extensionconversions = ",.d.gz:.mzML,.d.tar.gz:.mzML,d.tar:.mzML,.d.zip:.mzML,.d:.mzML" + } else { + extensionconversions = ",.gz:,.tar.gz:,.tar:,.zip:" + } """ ## -t2 since the one-table format parser is broken in OpenMS2.5 ## -l for legacy behavior to always add sample columns - parse_sdrf convert-openms -t2 -l --extension_convert raw:mzML -s ${sdrf} 2>&1 | tee ${sdrf.baseName}_parsing.log + parse_sdrf convert-openms \\ + -t2 -l \\ + --extension_convert raw:mzML$extensionconversions \\ + -s ${sdrf} \\ + $args \\ + 2>&1 | tee ${sdrf.baseName}_parsing.log + mv openms.tsv ${sdrf.baseName}_config.tsv mv experimental_design.tsv ${sdrf.baseName}_openms_design.tsv diff --git a/modules/local/tdf2mzml/main.nf b/modules/local/tdf2mzml/main.nf new file mode 100644 index 00000000..cd67a86e --- /dev/null +++ b/modules/local/tdf2mzml/main.nf @@ -0,0 +1,33 @@ + +process TDF2MZML { + tag "$meta.mzml_id" + label 'process_single' + label 'error_retry' + + container 'mfreitas/tdf2mzml:latest' // I don't know which stable tag to use... + + input: + tuple val(meta), path(rawfile) + + output: + tuple val(meta), path("*.mzML"), emit: mzmls_converted + tuple val(meta), path("*.d"), emit: dotd_files + path "versions.yml", emit: version + path "*.log", emit: log + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.mzml_id}" + + """ + echo "Converting..." | tee --append ${rawfile.baseName}_conversion.log + tdf2mzml.py -i *.d 2>&1 | tee --append ${rawfile.baseName}_conversion.log + mv *.mzml ${file(rawfile.baseName).baseName}.mzML + mv *.d ${file(rawfile.baseName).baseName}.d + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tdf2mzml.py: \$(tdf2mzml.py --version) + END_VERSIONS + """ +} diff --git a/modules/local/tdf2mzml/meta.yml b/modules/local/tdf2mzml/meta.yml new file mode 100644 index 00000000..4bbf8e3f --- /dev/null +++ b/modules/local/tdf2mzml/meta.yml @@ -0,0 +1,42 @@ +name: tdf2mzml +description: convert raw bruker files to mzml files +keywords: + - raw + - mzML + - .d +tools: + - tdf2mzml: + description: | + It takes a bruker .d raw file as input and outputs indexed mzML + homepage: https://github.com/mafreitas/tdf2mzml + documentation: https://github.com/mafreitas/tdf2mzml +input: + - meta: + type: map + description: | + Groovy Map containing sample information + - rawfile: + type: file + description: | + Bruker Raw file archived using tar + pattern: "*.d.tar" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'MD5', enzyme:trypsin ] + - mzml: + type: file + description: indexed mzML + pattern: "*.mzML" + - log: + type: file + description: log file + pattern: "*.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@jspaezp" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index b02fa23c..f5690c52 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -58,6 +58,11 @@ def main(): "yaml": yaml.__version__, } + with open("$versions") as f: + # load as text and print for debugging + versions_text = f.read() + print(versions_text) + with open("$versions") as f: versions_by_process = yaml.safe_load(f) | versions_this_module diff --git a/nextflow.config b/nextflow.config index 39eab531..0886ae5f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -156,6 +156,9 @@ params { quantify_decoys = false id_transfer_threshold = 0.50 // only used if targeted_only is set to false (default) + // Bruker data + convert_dotd = false + // DIA-NN diann_debug = 3 scan_window = 8 @@ -166,7 +169,8 @@ params { mass_acc_automatic = true pg_level = 2 species_genes = false - diann_normalize = true + diann_normalize = true + diann_speclib = '' // MSstats general options msstats_remove_one_feat_prot = true diff --git a/nextflow_schema.json b/nextflow_schema.json index e1aed86b..d905cf35 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -152,6 +152,12 @@ "fa_icon": "fas fa-font", "help_text": "Which MS levels to pick as comma separated list, e.g. `--peakpicking_ms_levels 1,2`. Leave empty for auto-detection." 
}, + "convert_dotd": { + "type": "boolean", + "description": "Convert bruker .d files to mzML", + "fa_icon": "far fa-check-square", + "help_text": "Whether to convert raw .d bruker files to .mzML" + }, "reindex_mzml": { "type": "boolean", "default": true, @@ -919,6 +925,13 @@ "fa_icon": "far fa-check-square", "default": false }, + "diann_speclib": { + "type": "string", + "description": "The spectral library to use for DIA-NN", + "fa_icon": "fas fa-file", + "help_text": "If passed, will use that spectral library to carry out the DIA-NN search, instead of predicting one from the fasta file.", + "hidden": false + }, "diann_debug": { "type": "integer", "description": "Debug level", diff --git a/pyproject.toml b/pyproject.toml index 0d62beb6..814dd46f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,3 +8,6 @@ target_version = ["py37", "py38", "py39", "py310"] profile = "black" known_first_party = ["nf_core"] multi_line_output = 3 + +[tool.ruff] +line-length = 120 diff --git a/subworkflows/local/file_preparation.nf b/subworkflows/local/file_preparation.nf index 341ba3ee..b8592031 100644 --- a/subworkflows/local/file_preparation.nf +++ b/subworkflows/local/file_preparation.nf @@ -3,28 +3,46 @@ // include { THERMORAWFILEPARSER } from '../../modules/local/thermorawfileparser/main' +include { TDF2MZML } from '../../modules/local/tdf2mzml/main' +include { DECOMPRESS } from '../../modules/local/decompress_dotd/main' +include { DOTD2MQC_INDIVIDUAL } from '../../modules/local/dotd_to_mqc/main' +include { DOTD2MQC_AGGREGATE } from '../../modules/local/dotd_to_mqc/main' include { MZMLINDEXING } from '../../modules/local/openms/mzmlindexing/main' include { MZMLSTATISTICS } from '../../modules/local/mzmlstatistics/main' include { OPENMSPEAKPICKER } from '../../modules/local/openms/openmspeakpicker/main' workflow FILE_PREPARATION { take: - ch_mzmls // channel: [ val(meta), raw/mzml ] + ch_rawfiles // channel: [ val(meta), raw/mzml/d.tar ] main: ch_versions = Channel.empty() ch_results = Channel.empty() ch_statistics = Channel.empty() + ch_mqc_data = Channel.empty() + + // Divide the compressed files + ch_rawfiles + .branch { + dottar: WorkflowQuantms.hasExtension(it[1], '.tar') + dotzip: WorkflowQuantms.hasExtension(it[1], '.zip') + gz: WorkflowQuantms.hasExtension(it[1], '.gz') + uncompressed: true + }.set { ch_branched_input } + + compressed_files = ch_branched_input.dottar.mix(ch_branched_input.dotzip, ch_branched_input.gz) + DECOMPRESS(compressed_files) + ch_versions = ch_versions.mix(DECOMPRESS.out.version) + ch_rawfiles = ch_branched_input.uncompressed.mix(DECOMPRESS.out.decompressed_files) // // Divide mzml files - // - ch_mzmls + ch_rawfiles .branch { - raw: WorkflowQuantms.hasExtension(it[1], 'raw') - mzML: WorkflowQuantms.hasExtension(it[1], 'mzML') - } - .set { ch_branched_input } + raw: WorkflowQuantms.hasExtension(it[1], '.raw') + mzML: WorkflowQuantms.hasExtension(it[1], '.mzML') + dotd: WorkflowQuantms.hasExtension(it[1], '.d') + }.set { ch_branched_input } // Note: we used to always index mzMLs if not already indexed but due to // either a bug or limitation in nextflow @@ -43,27 +61,53 @@ workflow FILE_PREPARATION { } THERMORAWFILEPARSER( ch_branched_input.raw ) + // Output is + // {'mzmls_converted': Tuple[val(meta), path(mzml)], + // 'version': Path(versions.yml), + // 'log': Path(*.txt)} + + // Where meta is the same as the input meta ch_versions = ch_versions.mix(THERMORAWFILEPARSER.out.version) ch_results = ch_results.mix(THERMORAWFILEPARSER.out.mzmls_converted) - ch_results.map{ 
it -> [it[0], it[1]] }.set{ ch_mzml } + ch_results.map{ it -> [it[0], it[1]] }.set{ indexed_mzml_bundle } + + // Exctract qc data from .d files + DOTD2MQC_INDIVIDUAL(ch_branched_input.dotd) + // The map extracts the tsv files from the tuple, the other elem is the yml config. + ch_mqc_data = ch_mqc_data.mix(DOTD2MQC_INDIVIDUAL.out.dotd_mqc_data.map{ it -> it[1] }.collect()) + DOTD2MQC_AGGREGATE(DOTD2MQC_INDIVIDUAL.out.general_stats.collect()) + ch_mqc_data = ch_mqc_data.mix(DOTD2MQC_AGGREGATE.out.dotd_mqc_data.collect()) + ch_versions = ch_versions.mix(DOTD2MQC_INDIVIDUAL.out.version) + ch_versions = ch_versions.mix(DOTD2MQC_AGGREGATE.out.version) - MZMLSTATISTICS( ch_mzml ) + // Convert .d files to mzML + if (params.convert_dotd) { + TDF2MZML( ch_branched_input.dotd ) + ch_versions = ch_versions.mix(TDF2MZML.out.version) + ch_results = indexed_mzml_bundle.mix(TDF2MZML.out.mzmls_converted) + // indexed_mzml_bundle = indexed_mzml_bundle.mix(TDF2MZML.out.mzmls_converted) + } else{ + ch_results = indexed_mzml_bundle.mix(ch_branched_input.dotd) + } + + MZMLSTATISTICS(indexed_mzml_bundle) ch_statistics = ch_statistics.mix(MZMLSTATISTICS.out.mzml_statistics.collect()) ch_versions = ch_versions.mix(MZMLSTATISTICS.out.version) if (params.openms_peakpicking){ + // If the peak picker is enabled, it will over-write not bypass the .d files OPENMSPEAKPICKER ( - ch_results + indexed_mzml_bundle ) ch_versions = ch_versions.mix(OPENMSPEAKPICKER.out.version) ch_results = OPENMSPEAKPICKER.out.mzmls_picked } - emit: - results = ch_results // channel: [val(mzml_id), indexedmzml] + results = ch_results // channel: [val(mzml_id), indexedmzml|.d.tar] statistics = ch_statistics // channel: [ *_mzml_info.tsv ] + mqc_custom_data = ch_mqc_data // channel: [ *.tsv ] version = ch_versions // channel: [ *.version.txt ] } diff --git a/workflows/dia.nf b/workflows/dia.nf index feb08918..506d646d 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -20,7 +20,6 @@ include { DIANNSUMMARY } from '../modules/local/diannsummary/m // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules // - /* ======================================================================================== RUN MAIN WORKFLOW @@ -39,15 +38,16 @@ workflow DIA { main: ch_software_versions = Channel.empty() - Channel.fromPath(params.database).set{ ch_searchdb } + Channel.fromPath(params.database).set { ch_searchdb } ch_file_preparation_results.multiMap { - meta: preprocessed_meta(it[0]) - mzml: it[1] - } - .set { ch_result } + result -> + meta: preprocessed_meta(result[0]) + ms_file:result[1] + } + .set { ch_result } - meta = ch_result.meta.unique {it[0]} + meta = ch_result.meta.unique { it[0] } DIANNCFG(meta) ch_software_versions = ch_software_versions.mix(DIANNCFG.out.version.ifEmpty(null)) @@ -55,48 +55,75 @@ workflow DIA { // // MODULE: SILICOLIBRARYGENERATION // - SILICOLIBRARYGENERATION(ch_searchdb, DIANNCFG.out.diann_cfg) + if (params.diann_speclib) { + speclib = Channel.fromPath(params.diann_speclib) + } else { + SILICOLIBRARYGENERATION(ch_searchdb, DIANNCFG.out.diann_cfg) + speclib = SILICOLIBRARYGENERATION.out.predict_speclib + } // // MODULE: DIANN_PRELIMINARY_ANALYSIS // - DIANN_PRELIMINARY_ANALYSIS(ch_file_preparation_results.combine(SILICOLIBRARYGENERATION.out.predict_speclib)) + DIANN_PRELIMINARY_ANALYSIS(ch_file_preparation_results.combine(speclib)) ch_software_versions = ch_software_versions.mix(DIANN_PRELIMINARY_ANALYSIS.out.version.ifEmpty(null)) // // MODULE: ASSEMBLE_EMPIRICAL_LIBRARY // - 
ASSEMBLE_EMPIRICAL_LIBRARY(ch_result.mzml.collect(),
-        meta,
-        DIANN_PRELIMINARY_ANALYSIS.out.diann_quant.collect(),
-        SILICOLIBRARYGENERATION.out.predict_speclib
-    )
+    // Order matters in DIANN; this should be sorted for reproducible results.
+    ASSEMBLE_EMPIRICAL_LIBRARY(
+        ch_result.ms_file.collect(),
+        meta,
+        DIANN_PRELIMINARY_ANALYSIS.out.diann_quant.collect(),
+        speclib
+    )
     ch_software_versions = ch_software_versions.mix(ASSEMBLE_EMPIRICAL_LIBRARY.out.version.ifEmpty(null))

     //
     // MODULE: INDIVIDUAL_FINAL_ANALYSIS
     //
-    INDIVIDUAL_FINAL_ANALYSIS(ch_file_preparation_results.combine(ch_searchdb).combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.log).combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library))
+    INDIVIDUAL_FINAL_ANALYSIS(
+        ch_file_preparation_results
+            .combine(ch_searchdb)
+            .combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.log)
+            .combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library)
+    )
     ch_software_versions = ch_software_versions.mix(INDIVIDUAL_FINAL_ANALYSIS.out.version.ifEmpty(null))

     //
     // MODULE: DIANNSUMMARY
     //
-    DIANNSUMMARY(ch_result.mzml.collect(), meta, ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library,
+    // Order matters in DIANN; this should be sorted for reproducible results.
+    // NOTE: I am getting here the names of the ms files, not the paths, since the
+    // next step only needs the name (it uses the cached .quant files).
+    // Also note that I am converting to a file object here because, when executing
+    // locally, every element in ch_result is a string, whilst on cloud it is a path.
+    ch_result
+        .ms_file.map { msfile -> file(msfile).getName() }
+        .collect()
+        .set { ms_file_names }
+    DIANNSUMMARY(ms_file_names, meta, ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library,
         INDIVIDUAL_FINAL_ANALYSIS.out.diann_quant.collect(), ch_searchdb)
     ch_software_versions = ch_software_versions.mix(DIANNSUMMARY.out.version.ifEmpty(null))

     //
     // MODULE: DIANNCONVERT
     //
-    DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, ch_mzml_info,
-        meta, ch_searchdb, DIANNSUMMARY.out.version)
+    DIANNCONVERT(
+        DIANNSUMMARY.out.main_report, ch_expdesign,
+        DIANNSUMMARY.out.pg_matrix,
+        DIANNSUMMARY.out.pr_matrix, ch_mzml_info,
+        meta,
+        ch_searchdb,
+        DIANNSUMMARY.out.version
+    )
     ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null))

     //
     // MODULE: MSSTATS
     ch_msstats_out = Channel.empty()
-    if(!params.skip_post_msstats){
+    if (!params.skip_post_msstats) {
         MSSTATS(DIANNCONVERT.out.out_msstats)
         ch_msstats_out = MSSTATS.out.msstats_csv
         ch_software_versions = ch_software_versions.mix(MSSTATS.out.version.ifEmpty(null))
@@ -108,24 +135,22 @@ workflow DIA {
         msstats_in = DIANNCONVERT.out.out_msstats
         out_triqler = DIANNCONVERT.out.out_triqler
         msstats_out = ch_msstats_out
-
 }
-
 // remove meta.id to make sure cache identical HashCode
-def preprocessed_meta(LinkedHashMap meta){
+def preprocessed_meta(LinkedHashMap meta) {
     def parameters = [:]
-    parameters["experiment_id"] = meta.experiment_id
-    parameters["acquisition_method"] = meta.acquisition_method
-    parameters["dissociationmethod"] = meta.dissociationmethod
-    parameters["labelling_type"] = meta.labelling_type
-    parameters["fixedmodifications"] = meta.fixedmodifications
-    parameters["variablemodifications"] = meta.variablemodifications
-    parameters["precursormasstolerance"] = meta.precursormasstolerance
-    parameters["precursormasstoleranceunit"] = meta.precursormasstoleranceunit
-    parameters["fragmentmasstolerance"] = meta.fragmentmasstolerance
-
parameters["fragmentmasstoleranceunit"] = meta.fragmentmasstoleranceunit - parameters["enzyme"] = meta.enzyme + parameters['experiment_id'] = meta.experiment_id + parameters['acquisition_method'] = meta.acquisition_method + parameters['dissociationmethod'] = meta.dissociationmethod + parameters['labelling_type'] = meta.labelling_type + parameters['fixedmodifications'] = meta.fixedmodifications + parameters['variablemodifications'] = meta.variablemodifications + parameters['precursormasstolerance'] = meta.precursormasstolerance + parameters['precursormasstoleranceunit'] = meta.precursormasstoleranceunit + parameters['fragmentmasstolerance'] = meta.fragmentmasstolerance + parameters['fragmentmasstoleranceunit'] = meta.fragmentmasstoleranceunit + parameters['enzyme'] = meta.enzyme return parameters } diff --git a/workflows/quantms.nf b/workflows/quantms.nf index 1e41ed38..d8d1eb3a 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -187,6 +187,7 @@ workflow QUANTMS { ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(FILE_PREPARATION.out.statistics) + ch_multiqc_files = ch_multiqc_files.mix(FILE_PREPARATION.out.mqc_custom_data) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_quantms_logo = file("$projectDir/assets/nf-core-quantms_logo_light.png")