diff --git a/CHANGELOG.md b/CHANGELOG.md index b478024d..4abac9ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#275 BigBio](https://github.com/bigbio/quantms/pull/275) Added support for bruker data. And speed-up to DIA-NN pipeline. + ### `Changed` ### `Fixed` +- Fixed bug where modification masses were not calculated correctly in DIA-NN conversion. + ### `Dependencies` ### `Parameters` diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index ad670051..783c703e 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -18,6 +18,50 @@ custom_logo: "./nf-core-quantms_logo_light.png" custom_logo_url: "https://github.com/bigbio/quantms" custom_logo_title: "quantms" +custom_data: + total_ion_chromatograms: + file_format: "tsv" + section_name: "MS1 TIC" + description: "MS1 total ion chromatograms extracted from the .d files" + plot_type: "linegraph" + pconfig: + id: "ms1_tic" + title: "MS1 TIC" + ylab: "Ion Count" + ymin: 0 + base_peak_chromatograms: + file_format: "tsv" + section_name: "MS1 BPC" + description: "MS1 base peak chromatograms extracted from the .d files" + plot_type: "linegraph" + pconfig: + id: "ms1_bpc" + title: "MS1 BPC" + ylab: "Ion Count" + ymin: 0 + number_of_peaks: + file_format: "tsv" + section_name: "MS1 Peaks" + description: "MS1 Peaks from the .d files" + plot_type: "linegraph" + pconfig: + id: "ms1_peaks" + title: "MS1 Peaks" + ylab: "Peak Count" + ymin: 0 + general_stats: + file_format: "tsv" + section_name: "General Stats" + description: "General stats from the .d files" + plot_type: "table" sp: + total_ion_chromatograms: + fn: "tic_*" + base_peak_chromatograms: + fn: "bpc_*" + number_of_peaks: + fn: "ms1_peaks_*" + general_stats: + fn: "general_stats.tsv" quantms/exp_design: fn: "*_design.tsv" diff --git a/bin/diann_convert.py b/bin/diann_convert.py index 257d5e01..34e31645 100755 --- a/bin/diann_convert.py +++ b/bin/diann_convert.py @@ -3,17 +3,31 @@ This script converts the output from DIA-NN into three standard formats: MSstats, Triqler and mzTab. License: Apache 2.0 Authors: Hong Wong, Yasset Perez-Riverol +Revisions: + 2023-Aug-05: J. 
Sebastian Paez """ import logging import os import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, List, Tuple, Dict, Set import click import numpy as np import pandas as pd from pyopenms import AASequence, FASTAFile, ModificationsDB +from pyopenms.Constants import PROTON_MASS_U + +pd.set_option("display.max_rows", 500) +pd.set_option("display.max_columns", 500) +pd.set_option("display.width", 1000) CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +REVISION = "0.1.1" + +logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) +logger = logging.getLogger(__name__) @click.group(context_settings=CONTEXT_SETTINGS) @@ -23,13 +37,14 @@ def cli(): @click.command("convert") @click.option("--folder", "-f") +@click.option("--exp_design", "-d") @click.option("--diann_version", "-v") @click.option("--dia_params", "-p") @click.option("--charge", "-c") @click.option("--missed_cleavages", "-m") @click.option("--qvalue_threshold", "-q", type=float) @click.pass_context -def convert(ctx, folder, dia_params, diann_version, charge, missed_cleavages, qvalue_threshold): +def convert(ctx, folder, exp_design, dia_params, diann_version, charge, missed_cleavages, qvalue_threshold): """ Convert DIA-NN output to MSstats, Triqler or mzTab. The output formats are @@ -39,10 +54,10 @@ def convert(ctx, folder, dia_params, diann_version, charge, missed_cleavages, qv the DiaNN main report, protein matrix, precursor matrix, experimental design file, protein sequence FASTA file, version file of DiaNN and mzml_info TSVs :type folder: str - :param diann_version: Path to a version file of DIA-NN - :type diann_version: str :param dia_params: A list contains DIA parameters :type dia_params: list + :param diann_version: Path to a version file of DIA-NN + :type diann_version: str :param charge: The charge assigned by DIA-NN(max_precursor_charge) :type charge: int :param missed_cleavages: Allowed missed cleavages assigned by DIA-NN @@ -50,191 +65,322 @@ def convert(ctx, folder, dia_params, diann_version, charge, missed_cleavages, qv :param qvalue_threshold: Threshold for filtering q value :type qvalue_threshold: float """ - pathdict = {key: [] for key in ["report", "exp_design", "pg_matrix", "pr_matrix", "fasta", "mzml_info"]} - fileslist = os.listdir(folder) - if not folder.endswith("/"): - folder = folder + "/" - for i in fileslist: - if i.endswith("report.tsv"): - pathdict["report"].append(i) - elif i.endswith("_openms_design.tsv"): - pathdict["exp_design"].append(i) - elif i.endswith("pg_matrix.tsv"): - pathdict["pg_matrix"].append(i) - elif i.endswith("pr_matrix.tsv"): - pathdict["pr_matrix"].append(i) - elif i.endswith(".fasta") or i.endswith(".fa"): - pathdict["fasta"].append(i) - elif i.endswith("mzml_info.tsv"): - pathdict["mzml_info"].append(i) - else: - pass - - for item in pathdict.items(): - if item[0] != "mzml_info" and len(item[1]) > 1: - logging.error(f"{item[0]} is duplicate, check whether the file is redundant or change the file name!") - - diann_report = folder + pathdict["report"][0] - exp_design = folder + pathdict["exp_design"][0] - pg_matrix = folder + pathdict["pg_matrix"][0] - pr_matrix = folder + pathdict["pr_matrix"][0] - fasta = folder + pathdict["fasta"][0] - diann_version_file = diann_version - - with open(diann_version_file) as f: - for line in f: - if "DIA-NN" in line: - diann_version_id = line.rstrip("\n").split(": ")[1] - break + logger.debug(f"Revision {REVISION}") + logger.debug("Reading input files...") + 
diann_directory = DiannDirectory(folder, diann_version_file=diann_version) + report = diann_directory.main_report_df(qvalue_threshold=qvalue_threshold) + s_DataFrame, f_table = get_exp_design_dfs(exp_design) - remain_cols = [ - "File.Name", - "Run", - "Protein.Group", + # Convert to MSstats + msstats_columns_keep = [ "Protein.Names", - "Protein.Ids", - "First.Protein.Description", - "PG.MaxLFQ", - "RT.Start", - "Global.Q.Value", - "Lib.Q.Value", - "PEP", - "Precursor.Normalised", - "Precursor.Id", - "Q.Value", "Modified.Sequence", - "Stripped.Sequence", "Precursor.Charge", "Precursor.Quantity", - "Global.PG.Q.Value", + "File.Name", + "Run", ] - report = pd.read_csv(diann_report, sep="\t", header=0, usecols=remain_cols) - - # filter based on qvalue parameter for downstream analysiss - report = report[report["Q.Value"] < qvalue_threshold] - report["Calculate.Precursor.Mz"] = report.apply( - lambda x: calculate_mz(x["Stripped.Sequence"], x["Precursor.Charge"]), axis=1 - ) - - precursor_list = list(report["Precursor.Id"].unique()) - report["precursor.Index"] = report.apply(lambda x: precursor_list.index(x["Precursor.Id"]), axis=1) - - with open(exp_design, "r") as f: - data = f.readlines() - empty_row = data.index("\n") - f_table = [i.replace("\n", "").split("\t") for i in data[1:empty_row]] - f_header = data[0].replace("\n", "").split("\t") - f_table = pd.DataFrame(f_table, columns=f_header) - f_table.loc[:, "run"] = f_table.apply( - lambda x: os.path.splitext(os.path.basename(x["Spectra_Filepath"]))[0], axis=1 - ) - - s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] - s_header = data[empty_row + 1].replace("\n", "").split("\t") - s_DataFrame = pd.DataFrame(s_table, columns=s_header) - # Convert to MSstats - out_msstats = pd.DataFrame() - out_msstats = report[ - ["Protein.Names", "Modified.Sequence", "Precursor.Charge", "Precursor.Quantity", "File.Name", "Run"] - ] + logger.debug("Converting to MSstats format...") + out_msstats = report[msstats_columns_keep] out_msstats.columns = ["ProteinName", "PeptideSequence", "PrecursorCharge", "Intensity", "Reference", "Run"] out_msstats = out_msstats[out_msstats["Intensity"] != 0] + + # Q: What is this line doing? 
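+    # A (best guess, not verified): it round-trips the DIA-NN modified-sequence string
+    # through pyopenms.AASequence so the peptidoform is re-emitted in OpenMS' canonical
+    # text form, giving the MSstats/Triqler readers a single, consistent notation.
+    # A sketch of the same step with a cached lookup (the uniq_masses/map pattern used
+    # elsewhere in this script) would avoid re-parsing duplicated sequences:
+    #   seq_map = {s: AASequence.fromString(s).toString() for s in out_msstats["PeptideSequence"].unique()}
+    #   out_msstats["PeptideSequence"] = out_msstats["PeptideSequence"].map(seq_map)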
out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply( lambda x: AASequence.fromString(x["PeptideSequence"]).toString(), axis=1 ) - out_msstats.loc[:, "FragmentIon"] = "NA" - out_msstats.loc[:, "ProductCharge"] = "0" - out_msstats.loc[:, "IsotopeLabelType"] = "L" - out_msstats["Reference"] = out_msstats.apply(lambda x: os.path.basename(x["Reference"]), axis=1) - - out_msstats[["Fraction", "BioReplicate", "Condition"]] = out_msstats.apply( - lambda x: query_expdesign_value(x["Run"], f_table, s_DataFrame), axis=1, result_type="expand" + out_msstats["FragmentIon"] = "NA" + out_msstats["ProductCharge"] = "0" + out_msstats["IsotopeLabelType"] = "L" + unique_reference_map = {k: os.path.basename(k) for k in out_msstats["Reference"].unique()} + out_msstats["Reference"] = out_msstats["Reference"].map(unique_reference_map) + del unique_reference_map + + # TODO remove this if not debugging + logger.debug("\n\nReference Column >>>") + logger.debug(out_msstats["Reference"]) + + logger.debug(f"\n\nout_msstats ({out_msstats.shape}) >>>") + logger.debug(out_msstats.head(5)) + + logger.debug(f"\n\nf_table ({f_table.shape})>>>") + logger.debug(f_table.head(5)) + + logger.debug(f"\n\ns_DataFrame ({s_DataFrame.shape})>>>") + logger.debug(s_DataFrame.head(5)) + ## END TODO + + logger.debug("Adding Fraction, BioReplicate, Condition columns") + # Changing implementation from apply to merge went from several minutes to + # ~50ms + tmp = ( + s_DataFrame[["Sample", "MSstats_Condition", "MSstats_BioReplicate"]] + .merge(f_table[["Fraction", "Sample", "run"]], on="Sample") + .rename(columns={"run": "Run", "MSstats_BioReplicate": "BioReplicate", "MSstats_Condition": "Condition"}) + .drop(columns=["Sample"]) + ) + out_msstats = out_msstats.merge( + tmp, + on="Run", + validate="many_to_one", ) - out_msstats.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + "_msstats_in.csv", sep=",", index=False) + del tmp + exp_out_prefix = str(Path(exp_design).stem) + out_msstats.to_csv(exp_out_prefix + "_msstats_in.csv", sep=",", index=False) + logger.info(f"MSstats input file is saved as {exp_out_prefix}_msstats_in.csv") # Convert to Triqler - out_triqler = pd.DataFrame() - out_triqler = out_msstats[["ProteinName", "PeptideSequence", "PrecursorCharge", "Intensity", "Run", "Condition"]] + trinqler_cols = ["ProteinName", "PeptideSequence", "PrecursorCharge", "Intensity", "Run", "Condition"] + out_triqler = out_msstats[trinqler_cols] del out_msstats out_triqler.columns = ["proteins", "peptide", "charge", "intensity", "run", "condition"] out_triqler = out_triqler[out_triqler["intensity"] != 0] out_triqler.loc[:, "searchScore"] = report["Q.Value"] out_triqler.loc[:, "searchScore"] = 1 - out_triqler["searchScore"] - out_triqler.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + "_triqler_in.tsv", sep="\t", index=False) + out_triqler.to_csv(exp_out_prefix + "_triqler_in.tsv", sep="\t", index=False) + logger.info(f"Triqler input file is saved as {exp_out_prefix}_triqler_in.tsv") del out_triqler + mztab_out = f"{str(Path(exp_design).stem)}_out.mzTab" # Convert to mzTab - if diann_version_id == "1.8.1": - fasta_df = pd.DataFrame() + diann_directory.convert_to_mztab( + report=report, + f_table=f_table, + charge=charge, + missed_cleavages=missed_cleavages, + dia_params=dia_params, + out=mztab_out, + ) + + +def _true_stem(x): + """ + Return the true stem of a file name, i.e. the + file name without the extension. 
+ + :param x: The file name + :type x: str + :return: The true stem of the file name + :rtype: str + + Examples: + >>> _true_stem("foo.mzML") + 'foo' + >>> _true_stem("foo.d.tar") + 'foo' + + These examples can be tested with pytest: + $ pytest -v --doctest-modules + """ + split = os.path.basename(x).split(".") + stem = split[0] + + # Should I check here that the extensions are + # allowed? I can see how this would break if the + # file name contains a period. + return stem + + +def get_exp_design_dfs(exp_design_file): + logger.info(f"Reading experimental design file: {exp_design_file}") + with open(exp_design_file, "r") as f: + data = f.readlines() + empty_row = data.index("\n") + f_table = [i.replace("\n", "").split("\t") for i in data[1:empty_row]] + f_header = data[0].replace("\n", "").split("\t") + f_table = pd.DataFrame(f_table, columns=f_header) + f_table.loc[:, "run"] = f_table.apply(lambda x: _true_stem(x["Spectra_Filepath"]), axis=1) + + s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] + s_header = data[empty_row + 1].replace("\n", "").split("\t") + s_DataFrame = pd.DataFrame(s_table, columns=s_header) + + return s_DataFrame, f_table + + +@dataclass +class DiannDirectory: + base_path: os.PathLike + diann_version_file: str + + def __post_init__(self): + self.base_path = Path(self.base_path) + if not self.base_path.exists() and not self.base_path.is_dir(): + raise NotADirectoryError(f"Path {self.base_path} does not exist") + self.diann_version_file = Path(self.diann_version_file) + if not self.diann_version_file.is_file(): + raise FileNotFoundError(f"Path {self.diann_version_file} does not exist") + + def find_suffix_file(self, suffix: str, only_first=True) -> os.PathLike: + """Finds a file with a given suffix in the directory. + + :param suffix: The suffix to search for + :type suffix: str + :param only_first: Whether to return only the first file found, if false returns all, defaults to True + :type only_first: bool, optional + + :raises FileNotFoundError: If no file with the given suffix is found + """ + matching = self.base_path.glob(f"**/*{suffix}") + if only_first: + try: + return next(matching) + except StopIteration: + raise FileNotFoundError(f"Could not find file with suffix {suffix}") + else: + out = list(matching) + if len(out) == 0: + raise FileNotFoundError(f"Could not find file with suffix {suffix}") + else: + return out + + @property + def report(self) -> os.PathLike: + return self.find_suffix_file("report.tsv") + + @property + def pg_matrix(self) -> os.PathLike: + return self.find_suffix_file("pg_matrix.tsv") + + @property + def pr_matrix(self) -> os.PathLike: + return self.find_suffix_file("pr_matrix.tsv") + + @property + def fasta(self) -> os.PathLike: + try: + return self.find_suffix_file(".fasta") + except FileNotFoundError: + return self.find_suffix_file(".fa") + + @property + def mzml_info(self) -> os.PathLike: + return self.find_suffix_file("mzml_info.tsv") + + @property + def validate_diann_version(self) -> str: + logger.debug("Validating DIANN version") + diann_version_id = None + with open(self.diann_version_file) as f: + for line in f: + if "DIA-NN" in line: + logger.debug(f"Found DIA-NN version: {line}") + diann_version_id = line.rstrip("\n").split(": ")[1] + + if diann_version_id is None: + raise ValueError(f"Could not find DIA-NN version in file {self.diann_version_file}") + elif diann_version_id == "1.8.1": + return diann_version_id + else: + # Maybe this error should be detected beforehand to save time ... 
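+            # e.g. (sketch, not part of this change): calling self.validate_diann_version
+            # at the end of __post_init__ would fail fast on an unsupported version,
+            # before the large report TSV is read in main_report_df.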
+ raise ValueError(f"Unsupported DIANN version {self.diann_version}") + + def convert_to_mztab( + self, report, f_table, charge: int, missed_cleavages: int, dia_params: List[Any], out: os.PathLike + ) -> None: + logger.info("Converting to mzTab") + # Convert to mzTab + self.validate_diann_version + + # This could be a branching point if we want to support other versions + # of DIA-NN, maybe something like this: + # if diann_version_id == "1.8.1": + # self.convert_to_mztab_1_8_1(report, f_table, charge, missed_cleavages, dia_params) + # else: + # raise ValueError(f"Unsupported DIANN version {diann_version_id}, supported versions are 1.8.1 ...") + + logger.info(f"Reading fasta file: {self.fasta}") entries = [] f = FASTAFile() - f.load(fasta, entries) - line = 0 - for e in entries: - fasta_df.loc[line, "id"] = e.identifier - fasta_df.loc[line, "seq"] = e.sequence - fasta_df.loc[line, "len"] = len(e.sequence) - line += 1 - - index_ref = f_table - index_ref.loc[:, "ms_run"] = index_ref.apply(lambda x: x["Fraction_Group"], axis=1) - index_ref.loc[:, "study_variable"] = index_ref.apply(lambda x: x["Sample"], axis=1) - index_ref.loc[:, "ms_run"] = index_ref.loc[:, "ms_run"].astype("int") - index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int") - report[["ms_run", "study_variable"]] = report.apply( - lambda x: add_info(x["Run"], index_ref), axis=1, result_type="expand" - ) - - (MTD, database) = mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages) + f.load(str(self.fasta), entries) + fasta_entries = [(e.identifier, e.sequence, len(e.sequence)) for e in entries] + fasta_df = pd.DataFrame(fasta_entries, columns=["id", "seq", "len"]) + + logger.info("Mapping run information to report") + index_ref = f_table.copy() + index_ref.rename(columns={"Fraction_Group": "ms_run", "Sample": "study_variable", "run": "Run"}, inplace=True) + index_ref["ms_run"] = index_ref["ms_run"].astype("int") + index_ref["study_variable"] = index_ref["study_variable"].astype("int") + report = report.merge(index_ref[["ms_run", "Run", "study_variable"]], on="Run", validate="many_to_one") + + (MTD, database) = mztab_MTD(index_ref, dia_params, str(self.fasta), charge, missed_cleavages) pg = pd.read_csv( - pg_matrix, + self.pg_matrix, sep="\t", header=0, ) PRH = mztab_PRH(report, pg, index_ref, database, fasta_df) del pg pr = pd.read_csv( - pr_matrix, + self.pr_matrix, sep="\t", header=0, ) + precursor_list = list(report["Precursor.Id"].unique()) PEH = mztab_PEH(report, pr, precursor_list, index_ref, database) del pr - PSH = mztab_PSH(report, folder, database) + PSH = mztab_PSH(report, str(self.base_path), database) del report MTD.loc["", :] = "" PRH.loc[len(PRH) + 1, :] = "" PEH.loc[len(PEH) + 1, :] = "" - with open(os.path.splitext(os.path.basename(exp_design))[0] + "_out.mzTab", "w", newline="") as f: + with open(out, "w", newline="") as f: MTD.to_csv(f, mode="w", sep="\t", index=False, header=False) PRH.to_csv(f, mode="w", sep="\t", index=False, header=True) PEH.to_csv(f, mode="w", sep="\t", index=False, header=True) PSH.to_csv(f, mode="w", sep="\t", index=False, header=True) + logger.info(f"mzTab file generated successfully! at {out}_out.mzTab") -def query_expdesign_value(reference, f_table, s_table): - """ - By matching the "Run" column in f_table or the "Sample" column in s_table, this function returns a tuple containing Fraction, - BioReplicate and Condition. 
- - :param reference: The value of "Run" column in out_msstats - :type reference: str - :param f_table: A table contains experiment settings(search engine settings etc.) - :type f_table: pandas.core.frame.DataFrame - :param s_table: A table contains experimental design - :type s_table: pandas.core.frame.DataFrame - :return: A tuple contains Fraction, BioReplicate and Condition - :rtype: tuple - """ - query_reference = f_table[f_table["run"] == reference] - Fraction = query_reference["Fraction"].values[0] - row = s_table[s_table["Sample"] == query_reference["Sample"].values[0]] - BioReplicate = row["MSstats_BioReplicate"].values[0] - Condition = row["MSstats_Condition"].values[0] + def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame: + remain_cols = [ + "File.Name", + "Run", + "Protein.Group", + "Protein.Names", + "Protein.Ids", + "First.Protein.Description", + "PG.MaxLFQ", + "RT.Start", + "Global.Q.Value", + "Lib.Q.Value", + "PEP", + "Precursor.Normalised", + "Precursor.Id", + "Q.Value", + "Modified.Sequence", + "Stripped.Sequence", + "Precursor.Charge", + "Precursor.Quantity", + "Global.PG.Q.Value", + ] + report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols) + + # filter based on qvalue parameter for downstream analysiss + logger.debug(f"Filtering report based on qvalue threshold: {qvalue_threshold}, {len(report)} rows") + report = report[report["Q.Value"] < qvalue_threshold] + logger.debug(f"Report filtered, {len(report)} rows remaining") + + logger.debug("Calculating Precursor.Mz") + # Making the map is 10x faster, and includes the mass of + # the modification. with respect to the previous implementation. + uniq_masses = {k: AASequence.fromString(k).getMonoWeight() for k in report["Modified.Sequence"].unique()} + mass_vector = report["Modified.Sequence"].map(uniq_masses) + report["Calculate.Precursor.Mz"] = (mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])) / report[ + "Precursor.Charge" + ] + + logger.debug("Indexing Precursors") + # Making the map is 1500x faster + precursor_index_map = {k: i for i, k in enumerate(report["Precursor.Id"].unique())} + report["precursor.Index"] = report["Precursor.Id"].map(precursor_index_map) - return Fraction, BioReplicate, Condition + logger.debug(f"Shape of main report {report.shape}") + logger.debug(str(report.head())) + + return report def MTD_mod_info(fix_mod, var_mod): @@ -296,6 +442,7 @@ def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages): :return: MTD sub-table :rtype: pandas.core.frame.DataFrame """ + logger.info("Constructing MTD sub-table...") dia_params_list = dia_params.split(";") dia_params_list = ["null" if i == "" else i for i in dia_params_list] FragmentMassTolerance = dia_params_list[0] @@ -402,17 +549,25 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ + logger.info("Constructing PRH sub-table...") + logger.debug( + f"Input report shape: {report.shape}," + f" input pg shape: {pg.shape}," + f" input index_ref shape: {index_ref.shape}," + f" input fasta_df shape: {fasta_df.shape}" + ) file = list(pg.columns[5:]) col = {} for i in file: col[i] = ( - "protein_abundance_assay[" - + str(index_ref[index_ref["run"] == os.path.splitext(os.path.split(i)[1])[0]]["ms_run"].values[0]) - + "]" + "protein_abundance_assay[" + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0]) + "]" ) pg.rename(columns=col, inplace=True) - pg.loc[:, "opt_global_result_type"] = pg.apply(classify_result_type, 
axis=1, result_type="expand") + + logger.debug("Classifying results type ...") + pg["opt_global_result_type"] = "single_protein" + pg.loc[pg["Protein.Ids"].str.contains(";"), "opt_global_result_type"] = "indistinguishable_protein_group" out_mztab_PRH = pd.DataFrame() out_mztab_PRH = pg.drop(["Protein.Names"], axis=1) @@ -432,6 +587,8 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): ] for i in null_col: out_mztab_PRH.loc[:, i] = "null" + + logger.debug("Extracting accession values (keeping first)...") out_mztab_PRH.loc[:, "accession"] = out_mztab_PRH.apply(lambda x: x["accession"].split(";")[0], axis=1) protein_details_df = out_mztab_PRH[out_mztab_PRH["opt_global_result_type"] == "indistinguishable_protein_group"] @@ -440,46 +597,70 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): protein_details_df = ( protein_details_df.drop("accession", axis=1).join(prh_series).reset_index().drop(columns="index") ) - protein_details_df.loc[:, "opt_global_result_type"] = protein_details_df.apply(lambda x: "protein_details", axis=1) + protein_details_df.loc[:, "col"] = "protein_details" # protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")] out_mztab_PRH = pd.concat([out_mztab_PRH, protein_details_df]).reset_index(drop=True) - out_mztab_PRH.loc[:, "protein_coverage"] = out_mztab_PRH.apply( - lambda x: calculate_protein_coverage(report, x["accession"], x["Protein.Ids"], fasta_df), - axis=1, - result_type="expand", + logger.debug("Calculating protein coverage (bottleneck)...") + # This is a bottleneck + # reimplementation runs in 67s vs 137s (old) in my data + out_mztab_PRH.loc[:, "protein_coverage"] = calculate_protein_coverages( + report=report, out_mztab_PRH=out_mztab_PRH, fasta_df=fasta_df ) + logger.debug("Getting ambiguity members...") + # IN THEORY this should be the same as + # out_mztab_PRH["ambiguity_members"] = out_mztab_PRH["Protein.Ids"] + # out_mztab_PRH.loc[out_mztab_PRH["opt_global_result_type"] == "single_protein", "ambiguity_members"] = "null" + # or out_mztab_PRH.loc[out_mztab_PRH["Protein.Ids"] == out_mztab_PRH["accession"], "ambiguity_members"] = "null" out_mztab_PRH.loc[:, "ambiguity_members"] = out_mztab_PRH.apply( lambda x: x["Protein.Ids"] if x["opt_global_result_type"] == "indistinguishable_protein_group" else "null", axis=1, ) + logger.debug("Matching PRH to best search engine score...") + score_looker = ModScoreLooker(report) out_mztab_PRH[["modifiedSequence", "best_search_engine_score[1]"]] = out_mztab_PRH.apply( - lambda x: PRH_match_report(report, x["accession"]), axis=1, result_type="expand" + lambda x: score_looker.get_score(x["Protein.Ids"]), axis=1, result_type="expand" ) + logger.debug("Matching PRH to modifications...") out_mztab_PRH.loc[:, "modifications"] = out_mztab_PRH.apply( lambda x: find_modification(x["modifiedSequence"]), axis=1, result_type="expand" ) + logger.debug("Matching PRH to protein quantification...") ## quantity at protein level: PG.MaxLFQ - max_study_variable = max(index_ref["study_variable"]) - PRH_params = [] - for i in range(1, max_study_variable + 1): - PRH_params.extend( - [ - "protein_abundance_study_variable[" + str(i) + "]", - "protein_abundance_stdev_study_variable[" + str(i) + "]", - "protein_abundance_std_error_study_variable[" + str(i) + "]", - ] - ) - - out_mztab_PRH[PRH_params] = out_mztab_PRH.apply( - lambda x: match_in_report(report, x["accession"], max_study_variable, 1, "protein"), - axis=1, - result_type="expand", + # This used to be a bottleneck in performance + # This 
implementation drops the run time from 57s to 25ms + protein_agg_report = ( + report[["PG.MaxLFQ", "Protein.Ids", "study_variable"]] + .groupby(["study_variable", "Protein.Ids"]) + .agg({"PG.MaxLFQ": ["mean", "std", "sem"]}) + .reset_index() + .pivot(columns=["study_variable"], index="Protein.Ids") + .reset_index() + ) + protein_agg_report.columns = ["::".join([str(s) for s in col]).strip() for col in protein_agg_report.columns.values] + subname_mapper = { + "Protein.Ids::::": "Protein.Ids", + "PG.MaxLFQ::mean": "protein_abundance_study_variable", + "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable", + "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable", + } + name_mapper = name_mapper_builder(subname_mapper) + protein_agg_report.rename(columns=name_mapper, inplace=True) + # out_mztab_PRH has columns accession and Protein.Ids; 'Q9NZJ9', 'A0A024RBG1;Q9NZJ9;Q9NZJ9-2'] + # the report table has 'Protein.Group' and 'Protein.Ids': 'Q9NZJ9', 'A0A024RBG1;Q9NZJ9;Q9NZJ9-2' + # Oddly enough the last implementation mapped the the accession (Q9NZJ9) in the mztab + # to the Protein.Ids (A0A024RBG1;Q9NZJ9;Q9NZJ9-2), leading to A LOT of missing values. + out_mztab_PRH = out_mztab_PRH.merge( + protein_agg_report, on="Protein.Ids", how="left", validate="many_to_one", copy=True ) + del name_mapper + del subname_mapper + del protein_agg_report + # end of (former) bottleneck out_mztab_PRH.loc[:, "PRH"] = "PRT" index = out_mztab_PRH.loc[:, "PRH"] @@ -491,13 +672,12 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df): col for col in out_mztab_PRH.columns if col.startswith("opt_") ] out_mztab_PRH = out_mztab_PRH[new_cols] - - # out_mztab_PRH.to_csv("./out_protein.mztab", sep=",", index=False) - return out_mztab_PRH -def mztab_PEH(report, pr, precursor_list, index_ref, database): +def mztab_PEH( + report: pd.DataFrame, pr: pd.DataFrame, precursor_list: List[str], index_ref: pd.DataFrame, database: os.PathLike +) -> pd.DataFrame: """ Construct PEH sub-table. 
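The protein-level quantification above (and the peptide-level version further below) relies on a groupby -> pivot -> flatten-columns -> rename pattern. The following is a minimal, self-contained sketch with made-up values that mirrors the column handling in this script (with a simplified stand-in for name_mapper_builder), showing how the "::"-joined columns become mzTab-style protein_abundance_study_variable[n] headers:

import re
import pandas as pd

# Toy stand-in for the filtered DIA-NN report (values are made up).
rep = pd.DataFrame(
    {
        "Protein.Ids": ["P1", "P1", "P1", "P1", "P2", "P2", "P2", "P2"],
        "study_variable": [1, 1, 2, 2, 1, 1, 2, 2],
        "PG.MaxLFQ": [10.0, 11.0, 12.0, 13.0, 3.0, 4.0, 5.0, 6.0],
    }
)

agg = (
    rep.groupby(["study_variable", "Protein.Ids"])
    .agg({"PG.MaxLFQ": ["mean", "std", "sem"]})
    .reset_index()
    .pivot(columns=["study_variable"], index="Protein.Ids")
    .reset_index()
)
# Flatten the MultiIndex columns into strings such as "PG.MaxLFQ::mean::1".
agg.columns = ["::".join(str(s) for s in col).strip() for col in agg.columns.values]

# Same renaming idea as name_mapper_builder: substring replace, then "::N" -> "[N]".
subname_mapper = {
    "Protein.Ids::::": "Protein.Ids",
    "PG.MaxLFQ::mean": "protein_abundance_study_variable",
    "PG.MaxLFQ::std": "protein_abundance_stdev_study_variable",
    "PG.MaxLFQ::sem": "protein_abundance_std_error_study_variable",
}
num_regex = re.compile(r"(.*)::(\d+)$")

def rename_col(name: str) -> str:
    for k, v in subname_mapper.items():
        name = name.replace(k, v)
    return num_regex.sub(r"\1[\2]", name)

agg.rename(columns=rename_col, inplace=True)
print(sorted(agg.columns))
# ['Protein.Ids', 'protein_abundance_std_error_study_variable[1]', ...,
#  'protein_abundance_stdev_study_variable[1]', ..., 'protein_abundance_study_variable[1]', ...]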
@@ -514,6 +694,13 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database): :return: PEH sub-table :rtype: pandas.core.frame.DataFrame """ + logger.info("Constructing PEH sub-table...") + logger.debug( + f"report.shape: {report.shape}, " + f" pr.shape: {pr.shape}," + f" len(precursor_list): {len(precursor_list)}," + f" index_ref.shape: {index_ref.shape}" + ) out_mztab_PEH = pd.DataFrame() out_mztab_PEH = pr.iloc[:, 0:10] out_mztab_PEH.drop( @@ -529,14 +716,17 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database): inplace=True, ) + logger.debug("Finding modifications...") out_mztab_PEH.loc[:, "modifications"] = out_mztab_PEH.apply( lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand" ) + logger.debug("Extracting sequence...") out_mztab_PEH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_PEH.apply( lambda x: AASequence.fromString(x["opt_global_cv_MS:1000889_peptidoform_sequence"]).toString(), axis=1 ) + logger.debug("Checking accession uniqueness...") out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply( lambda x: "0" if ";" in str(x["accession"]) else "1", axis=1, result_type="expand" ) @@ -546,46 +736,78 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database): out_mztab_PEH.loc[:, i] = "null" out_mztab_PEH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" - ## average value of each study_variable - ## quantity at peptide level: Precursor.Normalised - out_mztab_PEH.loc[:, "pr_id"] = out_mztab_PEH.apply( - lambda x: precursor_list.index(x["Precursor.Id"]), axis=1, result_type="expand" + logger.debug("Matching precursor IDs...") + # Pre-calculating the indices and using a lookup table drops run time from + # ~6.5s to 11ms + precursor_indices = {k: i for i, k in enumerate(precursor_list)} + pr_ids = out_mztab_PEH["Precursor.Id"].map(precursor_indices) + out_mztab_PEH["pr_id"] = pr_ids + del precursor_indices + + logger.debug("Getting scores per run") + # This implementation is 422-700x faster than the apply-based one + tmp = ( + report.groupby(["precursor.Index", "ms_run"]) + .agg({"Q.Value": ["min"]}) + .reset_index() + .pivot(columns=["ms_run"], index="precursor.Index") + .reset_index() ) - max_assay = max(index_ref["ms_run"]) - max_study_variable = max(index_ref["study_variable"]) - - ms_run_score = [] - for i in range(1, max_assay + 1): - ms_run_score.append("search_engine_score[1]_ms_run[" + str(i) + "]") - out_mztab_PEH[ms_run_score] = out_mztab_PEH.apply( - lambda x: match_in_report(report, x["pr_id"], max_assay, 0, "pep"), axis=1, result_type="expand" + tmp.columns = ["::".join([str(s) for s in col]).strip() for col in tmp.columns.values] + subname_mapper = { + "precursor.Index::::": "precursor.Index", + "Q.Value::min": "search_engine_score[1]_ms_run", + } + name_mapper = name_mapper_builder(subname_mapper) + tmp.rename(columns=name_mapper, inplace=True) + out_mztab_PEH = out_mztab_PEH.merge( + tmp.rename(columns={"precursor.Index": "pr_id"}), on="pr_id", validate="one_to_one" ) - - PEH_params = [] - for i in range(1, max_study_variable + 1): - PEH_params.extend( - [ - "peptide_abundance_study_variable[" + str(i) + "]", - "peptide_abundance_stdev_study_variable[" + str(i) + "]", - "peptide_abundance_std_error_study_variable[" + str(i) + "]", - "opt_global_mass_to_charge_study_variable[" + str(i) + "]", - "opt_global_retention_time_study_variable[" + str(i) + "]", - ] + del tmp + del subname_mapper + del name_mapper + + logger.debug("Getting peptide abundances per study 
variable") + pep_study_report = per_peptide_study_report(report) + out_mztab_PEH = out_mztab_PEH.merge(pep_study_report, on="pr_id", how="left", validate="one_to_one", copy=True) + del pep_study_report + + logger.debug("Getting peptide properties...") + # Re-implementing this section from apply -> assign to groupby->agg + # speeds up the process from 11s to 25ms in my data (~440x faster) + # Notably, this changes slightly... + # "opt_global_q-value" was the FIRST "Global.Q.Value", now its the min + # "opt_global_SpecEValue_score" was the FIRST "Lib.Q.Value" now its the min + # I believe picking the first is inconsistent because no sorting is checked + # and the first is arbitrary. + + aggtable = ( + report.groupby(["precursor.Index"]) + .agg( + { + "Q.Value": "min", + "RT.Start": "mean", + "Global.Q.Value": "min", + "Lib.Q.Value": "min", + "Calculate.Precursor.Mz": "mean", + } + ) + .reset_index() + .rename( + columns={ + "precursor.Index": "pr_id", + "Q.Value": "best_search_engine_score[1]", + "RT.Start": "retention_time", + "Global.Q.Value": "opt_global_q-value", + "Lib.Q.Value": "opt_global_SpecEValue_score", + "Calculate.Precursor.Mz": "mass_to_charge", + } ) - out_mztab_PEH[PEH_params] = out_mztab_PEH.apply( - lambda x: match_in_report(report, x["pr_id"], max_study_variable, 1, "pep"), axis=1, result_type="expand" ) + del out_mztab_PEH["mass_to_charge"] + out_mztab_PEH = out_mztab_PEH.merge(aggtable, on="pr_id", validate="one_to_one") - out_mztab_PEH[ - [ - "best_search_engine_score[1]", - "retention_time", - "opt_global_q-value", - "opt_global_SpecEValue_score", - "mass_to_charge", - ] - ] = out_mztab_PEH.apply(lambda x: PEH_match_report(report, x["pr_id"]), axis=1, result_type="expand") - + logger.debug("Re-ordering columns...") out_mztab_PEH.loc[:, "PEH"] = "PEP" out_mztab_PEH.loc[:, "database"] = database index = out_mztab_PEH.loc[:, "PEH"] @@ -596,7 +818,6 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database): col for col in out_mztab_PEH.columns if col.startswith("opt_") ] out_mztab_PEH = out_mztab_PEH[new_cols] - # out_mztab_PEH.to_csv("./out_peptide.mztab", sep=",", index=False) return out_mztab_PEH @@ -616,9 +837,29 @@ def mztab_PSH(report, folder, database): :return: PSH sub-table :rtype: pandas.core.frame.DataFrame """ + logger.info("Constructing PSH sub-table") + + def __find_info(dir, n): + # This line matches n="220101_myfile", folder="." to + # "myfolder/220101_myfile_mzml_info.tsv" + files = list(Path(dir).glob(f"*{n}*_info.tsv")) + # Check that it matches one and only one file + if not files: + raise ValueError(f"Could not find {n} info file in {dir}") + if len(files) > 1: + raise ValueError(f"Found multiple {n} info files in {dir}: {files}") + + return files[0] + out_mztab_PSH = pd.DataFrame() for n, group in report.groupby(["Run"]): - file = folder + n + "_mzml_info.tsv" + if isinstance(n, tuple) and len(n) == 1: + # This is here only to support versions of pandas where the groupby + # key is a tuple. 
+ # related: https://github.com/pandas-dev/pandas/pull/51817 + n = n[0] + + file = __find_info(folder, n) target = pd.read_csv(file, sep="\t") group.sort_values(by="RT.Start", inplace=True) target = target[["Retention_Time", "SpectrumID", "Exp_Mass_To_Charge"]] @@ -682,6 +923,7 @@ def mztab_PSH(report, folder, database): for i in null_col: out_mztab_PSH.loc[:, i] = "null" + logger.info("Finding Modifications ...") out_mztab_PSH.loc[:, "modifications"] = out_mztab_PSH.apply( lambda x: find_modification(x["opt_global_cv_MS:1000889_peptidoform_sequence"]), axis=1, result_type="expand" ) @@ -721,7 +963,7 @@ def add_info(target, index_ref): :return: A tuple contains ms_run and study_variable :rtype: tuple """ - match = index_ref[index_ref["run"] == target] + match = index_ref[index_ref["Run"] == target] ms_run = match["ms_run"].values[0] study_variable = match["study_variable"].values[0] @@ -741,53 +983,6 @@ def classify_result_type(target): return "single_protein" -def calculate_protein_coverage(report, target, reference, fasta_df): - """ - Calculate protein coverage. - - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param target: The value of "accession" column in out_mztab_PRH - :type target: str - :param fasta_df: A dataframe contains protein IDs, sequences and lengths - :type fasta_df: pandas.core.frame.DataFrame - :return: Protein coverage - :rtype: str - """ - peptide_list = report[report["Protein.Ids"] == reference]["Stripped.Sequence"].drop_duplicates().values - unique_peptides = [j for i, j in enumerate(peptide_list) if all(j not in k for k in peptide_list[i + 1 :])] - resultlist = [] - ref = fasta_df[fasta_df["id"].str.contains(target)]["seq"].values[0] - - def findstr(basestr, s, resultlist): - result = re.finditer(s, basestr) - if result: - for i in result: - resultlist.append([i.span()[0], i.span()[1] - 1]) - - return resultlist - - for i in unique_peptides: - resultlist = findstr(ref, i, resultlist) - # Sort and merge the interval list - resultlist.sort() - l, r = 0, 1 - while r < len(resultlist): - x1, y1 = resultlist[l][0], resultlist[l][1] - x2, y2 = resultlist[r][0], resultlist[r][1] - if x2 > y1: - l += 1 - r += 1 - else: - resultlist[l] = [x1, max(y1, y2)] - resultlist.pop(r) - - coverage_length = np.array([i[1] - i[0] + 1 for i in resultlist]).sum() - protein_coverage = format(coverage_length / len(ref), ".3f") - - return protein_coverage - - def match_in_report(report, target, max_, flag, level): """ This function is used to match the columns "ms_run" and "study_variable" from the report and @@ -805,7 +1000,7 @@ def match_in_report(report, target, max_, flag, level): :type level: str :return: A tuple contains multiple messages :rtype: tuple - """ + """ # noqa if flag == 1 and level == "pep": result = report[report["precursor.Index"] == target] PEH_params = [] @@ -834,45 +1029,66 @@ def match_in_report(report, target, max_, flag, level): return tuple(PRH_params) -def PRH_match_report(report, target): +class ModScoreLooker: """ - Returns a tuple contains modified sequences and the score at protein level. + Class used to cache the lookup table of accessions to best scores and their + respective mod sequences. + + Pre-computing the lookup table leverages a lot of speedum and vectortized + operations from pandas, and is much faster than doing the lookup on the fly + in a loop. 
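+    Illustrative usage (values taken from the example rows shown in
+    make_lookup_dict; skipped under doctest because it needs a real report):
+
+    >>> looker = ModScoreLooker(report)                 # doctest: +SKIP
+    >>> looker.get_score("Q8IV63;Q8IV63-2;Q8IV63-3")    # doctest: +SKIP
+    ('LFNEQNFFQR', 0.000252)
+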
:param report: Dataframe for Dia-NN main report :type report: pandas.core.frame.DataFrame - :param target: The value of "accession" column in report - :type target: str - :return: A tuple contains multiple information to construct PRH sub-table - :rtype: tuple """ - match = report[report["Protein.Ids"] == target] - modSeq = match["Modified.Sequence"].values[0] if match["Modified.Sequence"].values.size > 0 else np.nan - ## Score at protein level: Global.PG.Q.Value (without MBR) - score = match["Global.PG.Q.Value"].min() - return modSeq, score + def __init__(self, report: pd.DataFrame) -> None: + self.lookup_dict = self.make_lookup_dict(report) + def make_lookup_dict(self, report) -> Dict[str, Tuple[str, float]]: + grouped_df = ( + report[["Modified.Sequence", "Protein.Ids", "Global.PG.Q.Value"]] + .sort_values("Global.PG.Q.Value", ascending=True) + .groupby(["Protein.Ids"]) + .head(1) + ) + # Modified.Sequence Protein.Ids Global.PG.Q.Value + # 78265 LFNEQNFFQR Q8IV63;Q8IV63-2;Q8IV63-3 0.000252 + # 103585 NPTIVNFPITNVDLR Q53GS9;Q53GS9-2 0.000252 + # 103586 NPTWKPLIR Q7Z4Q2;Q7Z4Q2-2 0.000252 + # 103588 NPVGYPLAWQFLR Q9NZ08;Q9NZ08-2 0.000252 -def PEH_match_report(report, target): - """ - Returns a tuple contains the score at peptide level, retain time, q_score, spec_e and mz. + out = { + row["Protein.Ids"]: (row["Modified.Sequence"], row["Global.PG.Q.Value"]) for _, row in grouped_df.iterrows() + } + return out - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param target: The value of "pr_id" column in report - :type target: str - :return: A tuple contains multiple information to construct PEH sub-table - :rtype: tuple - """ - match = report[report["precursor.Index"] == target] - ## Score at peptide level: the minimum of the respective precursor q-values (minimum of Q.Value per group) - search_score = match["Q.Value"].min() - time = match["RT.Start"].mean() - q_score = match["Global.Q.Value"].values[0] if match["Global.Q.Value"].values.size > 0 else np.nan - spec_e = match["Lib.Q.Value"].values[0] if match["Lib.Q.Value"].values.size > 0 else np.nan - mz = match["Calculate.Precursor.Mz"].mean() + def get_score(self, protein_id: str) -> float: + """Returns a tuple contains modified sequences and the score at protein level. + + Gets the best score and corresponding peptide for a given protein_id + + Note that protein id can be something like 'Q8IV63;Q8IV63-2;Q8IV63-3' + + Note2: This implementation also fixes a bug where the function would + return the first peptide in the report, not the best one. (but with the + score of the best one for that accession) - return search_score, time, q_score, spec_e, mz + :param protein_id: The value of "accession" column in report + :type target: str + :return: A tuple that contains (best modified sequence, best score) + if the accession is not found, (np.nan, np.nan) is returned. + :rtype: tuple + """ + # Q: in what cases can the accession not exist in the table? + # or an accession not have peptides? 
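+        # A (one plausible case, unverified): PRH rows come from the pg_matrix, while this
+        # lookup is built from the main report after the Q.Value filter in main_report_df,
+        # so a protein group whose precursors were all filtered out has no entry here and
+        # falls back to (np.nan, np.nan).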
+ val = self.lookup_dict.get(protein_id, (np.nan, np.nan)) + return val + + +# Pre-compiling the regex makes the next function 2x faster +# in my benchmarking - JSPP +MODIFICATION_PATTERN = re.compile(r"\((.*?)\)") def find_modification(peptide): @@ -883,12 +1099,17 @@ def find_modification(peptide): :type peptide: str :return: Modification sites :rtype: str + + Examples: + >>> find_modification("PEPM(UNIMOD:35)IDE") + '4-UNIMOD:35' + >>> find_modification("SM(UNIMOD:35)EWEIRDS(UNIMOD:21)EPTIDEK") + '2-UNIMOD:35,9-UNIMOD:21' """ peptide = str(peptide) - pattern = re.compile(r"\((.*?)\)") - original_mods = pattern.findall(peptide) - peptide = re.sub(r"\(.*?\)", ".", peptide) - position = [i.start() for i in re.finditer(r"\.", peptide)] + original_mods = MODIFICATION_PATTERN.findall(peptide) + peptide = MODIFICATION_PATTERN.sub(".", peptide) + position = [i for i, x in enumerate(peptide) if x == "."] for j in range(1, len(position)): position[j] -= j @@ -900,22 +1121,214 @@ def find_modification(peptide): return original_mods -def calculate_mz(seq, charge): +def name_mapper_builder(subname_mapper): + """Returns a function that renames the columns of the grouped table to match the ones + in the final table. + + Examples: + >>> mapping_dict = { + ... "precursor.Index::::": "pr_id", + ... "Precursor.Normalised::mean": "peptide_abundance_study_variable" + ... } + >>> name_mapper = name_mapper_builder(mapping_dict) + >>> name_mapper("precursor.Index::::") + "pr_id" + >>> name_mapper("Precursor.Normalised::mean::1") + "peptide_abundance_study_variable[1]" + """ + num_regex = re.compile(r"(.*)::(\d+)$") + + def name_mapper(x): + """Renames the columns of the grouped table to match the ones + in the final table. + + Examples: + >>> name_mapper("precursor.Index::::") + "pr_id" + >>> name_mapper("Precursor.Normalised::mean::1") + "peptide_abundance_study_variable[1]" + """ + orig_x = x + for k, v in subname_mapper.items(): + if k in x: + x = x.replace(k, v) + out = num_regex.sub(r"\1[\2]", x) + if out == orig_x: + # This should never happen but I am adding it here + # to prevent myself from shoting myself in the foot in the future. + raise ValueError(f"Column name {x} not found in subname_mapper") + return out + + return name_mapper + + +def per_peptide_study_report(report: pd.DataFrame) -> pd.DataFrame: + """Summarizes the report at peptide/study level and flattens the columns. + + This function was implemented to replace an 'apply -> filter' approach. + In my benchmarking it went from 35.23 seconds for 4 samples, 4 conditions to + 0.007 seconds. + + This implementation differs in several aspects in the output values: + 1. in the fact that it actually gets values for the m/z + 2. always returns a float, whilst the apply version returns an 'object' dtype. + 3. The original implementation, missing values had the string 'null', here + they have the value np.nan. + 4. The order of the final output is different; the original orders columns by + study variables > calculated value, this one is calculated value > study variables. + + Calculates the mean, standard deviation and std error of the precursor + abundances, as well as the mean retention time and m/z. + + The names in the end are called "peptide" but thechnically the are at the + precursor level. (peptide+charge combinations). + + The columns will look like this in the end: + [ + 'pr_id', + 'peptide_abundance_study_variable[1]', + ... + 'peptide_abundance_stdev_study_variable[1]', + ... + 'peptide_abundance_std_error_study_variable[1]', + ... 
+ 'opt_global_retention_time_study_variable[1]', + ... + 'opt_global_mass_to_charge_study_variable[1]', + ... + ] """ - Calculate the precursor m/z based on the peptide sequence and charge state. + pep_study_grouped = ( + report.groupby(["study_variable", "precursor.Index"]) + .agg({"Precursor.Normalised": ["mean", "std", "sem"], "RT.Start": ["mean"], "Calculate.Precursor.Mz": ["mean"]}) + .reset_index() + .pivot(columns=["study_variable"], index="precursor.Index") + .reset_index() + ) + pep_study_grouped.columns = ["::".join([str(s) for s in col]).strip() for col in pep_study_grouped.columns.values] + # Columns here would be like: + # [ + # "precursor.Index::::", + # "Precursor.Normalised::mean::1", + # "Precursor.Normalised::mean::2", + # "Precursor.Normalised::std::1", + # "Precursor.Normalised::std::2", + # "Precursor.Normalised::sem::1", + # "Precursor.Normalised::sem::2", + # "RT.Start::mean::1", + # "RT.Start::mean::2", + # ] + # So the right names need to be given and the table can be joined with the other one + subname_mapper = { + "precursor.Index::::": "pr_id", + "Precursor.Normalised::mean": "peptide_abundance_study_variable", + "Precursor.Normalised::std": "peptide_abundance_stdev_study_variable", + "Precursor.Normalised::sem": "peptide_abundance_std_error_study_variable", + "Calculate.Precursor.Mz::mean": "opt_global_mass_to_charge_study_variable", + "RT.Start::mean": "opt_global_retention_time_study_variable", + } + name_mapper = name_mapper_builder(subname_mapper) + + pep_study_grouped.rename( + columns=name_mapper, + inplace=True, + ) - :param seq: Peptide sequence - :type seq: str - :param charge: charge state - :type charge: int - :return: + return pep_study_grouped + + +def calculate_coverage(ref_sequence: str, sequences: Set[str]): """ - ref = "ARNDBCEQZGHILKMFPSTWYV" - seq = "".join([i for i in seq if i in ref]) - if charge == "": - return None - else: - return AASequence.fromString(seq).getMZ(int(charge)) + Calculates the coverage of the reference sequence by the given sequences. + + Examples: + >>> calculate_coverage("WATEROVERTHEDUCKSBACK", {"WATER", "DUCK"}) + 0.45 + >>> calculate_coverage("DUCKDUCKDUCK", {"DUCK"}) + 1.0 + >>> calculate_coverage("WATEROVERTHEDUCK", {"DUCK"}) + 0.25 + """ + starts = [] + lengths = [] + for sequence in sequences: + local_start = 0 + while True: + local_start = ref_sequence.find(sequence, local_start) + if local_start == -1: + break + starts.append(local_start) + lengths.append(len(sequence)) + local_start += 1 + + # merge overlapping intervals + starts, lengths = zip(*sorted(zip(starts, lengths))) + merged_starts = [] + merged_lengths = [] + for start, length in zip(starts, lengths): + if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start: + merged_lengths[-1] = max(merged_starts[-1] + merged_lengths[-1], start + length) - merged_starts[-1] + else: + merged_starts.append(start) + merged_lengths.append(length) + + # calculate coverage + coverage = sum(merged_lengths) / len(ref_sequence) + return coverage + + +def calculate_protein_coverages(report: pd.DataFrame, out_mztab_PRH: pd.DataFrame, fasta_df: pd.DataFrame) -> List[str]: + """Calculates protein coverages for the PRH table. + + The protein coverage is calculated as the fraction of the protein sequence + in the fasta df, covered by the peptides in the report table, for every + protein in the PRH table (defined by accession, not protein.ids). 
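+
+    :param report: Dataframe for the DIA-NN main report; only "Protein.Ids" and "Stripped.Sequence" are used
+    :type report: pandas.core.frame.DataFrame
+    :param out_mztab_PRH: The PRH sub-table under construction; "accession" and "Protein.Ids" are used
+    :type out_mztab_PRH: pandas.core.frame.DataFrame
+    :param fasta_df: A dataframe that contains protein IDs, sequences and lengths
+    :type fasta_df: pandas.core.frame.DataFrame
+    :return: One coverage value per row of out_mztab_PRH, formatted to three decimals, or "null" if no fasta match was found
+    :rtype: list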
+ """ + nested_df = ( + report[["Protein.Ids", "Stripped.Sequence"]] + .groupby("Protein.Ids") + .agg({"Stripped.Sequence": set}) + .reset_index() + ) + # Protein.Ids Stripped.Sequence + # 0 A0A024RBG1;Q9NZJ9;Q9NZJ9-2 {SEQEDEVLLVSSSR} + # 1 A0A096LP49;A0A096LP49-2 {SPWAMTERKHSSLER} + # 2 A0AVT1;A0AVT1-2 {EDFTLLDFINAVK, KPDHVPISSEDER, QDVIITALDNVEAR,... + ids_to_seqs = dict(zip(nested_df["Protein.Ids"], nested_df["Stripped.Sequence"])) + acc_to_ids = dict(zip(out_mztab_PRH["accession"], out_mztab_PRH["Protein.Ids"])) + fasta_id_to_seqs = dict(zip(fasta_df["id"], fasta_df["seq"])) + acc_to_fasta_ids = {} + + # Since fasta ids are something like sp|P51451|BLK_HUMAN but + # accessions are something like Q9Y6V7-2, we need to find a + # partial string match between the two (the best one) + for acc in acc_to_ids: + # I am pretty sure this is the slowest part of the code + matches = fasta_df[fasta_df["id"].str.contains(acc)]["id"] + if len(matches) == 0: + logger.warning(f"Could not find fasta id for accession {acc} in the fasta file.") + acc_to_fasta_ids[acc] = None + elif len(matches) == 1: + acc_to_fasta_ids[acc] = matches.iloc[0] + else: + # If multiple, find best match. ej. Pick Q9Y6V7 over Q9Y6V7-2 + # This can be acquired by finding the shortest string, since + # it entails more un-matched characters. + acc_to_fasta_ids[acc] = min(matches, key=len) + + out = [None] * len(out_mztab_PRH["accession"]) + + for i, acc in enumerate(out_mztab_PRH["accession"]): + f_id = acc_to_fasta_ids[acc] + if f_id is None: + out_cov = "null" + else: + cov = calculate_coverage(fasta_id_to_seqs[f_id], ids_to_seqs[acc_to_ids[acc]]) + out_cov = format(cov, ".03f") + + out[i] = out_cov + + return out cli.add_command(convert) diff --git a/bin/dotd_2_mqc.py b/bin/dotd_2_mqc.py new file mode 100755 index 00000000..74dfe93a --- /dev/null +++ b/bin/dotd_2_mqc.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python +GENERAL_HELP = """ +Converts .d files to multiqc compatible files. + +This script can be used either as standalone or as part of a larger workflow +with pmultiqc as a plugin. + +For the standalone usage follow the instructions in the usage section. +If you want to use the output of this script as part of a larger workflow +you will have to modify the `multiqc_config.yaml` file used as te input +for multiqc. Please refer to the multiqc documentation for more information. + +Generates the following files: + - tic_.tsv + - bpc_.tsv + - ms1_peaks_.tsv + - general_stats_.tsv + - dotd_mqc.yml + +Usage: + $ python dotd_2_mqc.py single + $ python dotd_2_mqc.py single + $ python dotd_2_mqc.py aggregate + + # These last steps can also be + $ python dotd_2_mqc.py single + # If the input directory contains multiple .d files. + + $ cd + $ multiqc -c dotd_mqc.yml . +""" + +from typing import List, Tuple # noqa: E402 +import os # noqa: E402 +import sqlite3 # noqa: E402 +import argparse # noqa: E402 +from pathlib import Path # noqa: E402 +from dataclasses import dataclass # noqa: E402 +from logging import getLogger # noqa: E402 +import logging # noqa: E402 + +VERSION = "0.0.2" +logging.basicConfig(level=logging.DEBUG) +logger = getLogger(__name__) + +# The time resulution in seconds. +# Larger values will result in smaller data files as outputs +# and will slightly smooth the data. 5 seconds seems to be +# a good value for qc purposes. +SECOND_RESOLUTION = 5 +# This string is used as a template for the multiqc config file. +# Check the module docstring for more information. 
+MQC_YML = """ +custom_data: + total_ion_chromatograms: + file_format: 'tsv' + section_name: 'MS1 TIC' + description: 'MS1 total ion chromatograms extracted from the .d files' + plot_type: 'linegraph' + pconfig: + id: 'ms1_tic' + title: 'MS1 TIC' + ylab: 'Ion Count' + ymin: 0 + base_peak_chromatograms: + file_format: 'tsv' + section_name: 'MS1 BPC' + description: 'MS1 base peak chromatograms extracted from the .d files' + plot_type: 'linegraph' + pconfig: + id: 'ms1_bpc' + title: 'MS1 BPC' + ylab: 'Ion Count' + ymin: 0 + number_of_peaks: + file_format: 'tsv' + section_name: 'MS1 Peaks' + description: 'MS1 Peaks from the .d files' + plot_type: 'linegraph' + pconfig: + id: 'ms1_peaks' + title: 'MS1 Peaks' + ylab: 'Peak Count' + ymin: 0 + general_stats: + file_format: 'tsv' + section_name: 'General Stats' + description: 'General stats from the .d files' + plot_type: 'table' +sp: + total_ion_chromatograms: + fn: 'tic_*' + base_peak_chromatograms: + fn: 'bpc_*' + number_of_peaks: + fn: 'ms1_peaks_*' + general_stats: + fn: 'general_stats.tsv' +""" + + +@dataclass +class DotDFile: + filepath: os.PathLike + + @property + def sql_filepath(self): + fp = Path(self.filepath) / "analysis.tdf" + return fp + + @property + def basename(self): + return Path(self.filepath).stem + + @property + def ms1_tic(self) -> List[Tuple[float, float]]: + """Gets the MS1 total-ion-chromatogram. + + Returns: + List[Tuple[float, float]]: List of (time, intensity) tuples. + """ + # Note that here I am using min and not mean for purely qc reasons. + # Since the diagnostic aspect here is mainly to see major fluctuations + # in the intensity, and usually these are scans with very low intensity + # due to bubbles or ionization issues, thus the mean would hide that. + cmd = f""" + SELECT MIN(Time), MIN(SummedIntensities) + FROM frames WHERE MsMsType = '0' + GROUP BY CAST(Time / {SECOND_RESOLUTION} AS INTEGER) + ORDER BY Time + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + return out + + @property + def ms1_bpc(self) -> List[Tuple[float, float]]: + """Gets the MS1 base-peak-chromatogram. + + Returns: + List[Tuple[float, float]]: List of (time, intensity) tuples. + """ + cmd = f""" + SELECT MIN(Time), MAX(MaxIntensity) + FROM frames WHERE MsMsType = '0' + GROUP BY CAST(Time / {SECOND_RESOLUTION} AS INTEGER) + ORDER BY Time + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + return out + + @property + def ms1_peaks(self) -> List[Tuple[float, float]]: + """Gets the number of MS1 peaks. + + Returns: + List[Tuple[float, float]]: List of (time, intensity) tuples. + """ + cmd = f""" + SELECT MIN(Time), AVG(NumPeaks) + FROM frames WHERE MsMsType = '0' + GROUP BY CAST(Time / {SECOND_RESOLUTION} AS INTEGER) + ORDER BY Time + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + return out + + def get_acquisition_datetime(self) -> str: + """Gets the acquisition datetime + + Returns + ------- + str + The acquisition datetime in ISO 8601 format. 
+ [('2023-08-06T06:23:19.141-08:00',)] + """ + cmd = "SELECT Value FROM GlobalMetadata WHERE key='AcquisitionDateTime'" + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + if not len(out) == 1: + raise RuntimeError("More than one acquisition datetime found.") + + return out[0][0] + + def get_tot_current(self) -> float: + """Gets the total current from the ms1 scans. + + Returns + ------- + float + The total current. + """ + cmd = """ + SELECT SUM(CAST(SummedIntensities AS FLOAT)) + FROM frames WHERE MsMsType = '0' + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + if not len(out) == 1: + raise RuntimeError("More than one total current found.") + + return out[0][0] + + def get_dia_scan_current(self) -> float: + """Gets the total current from the ms2 scans. + + Returns + ------- + float + The total current. + """ + cmd = """ + SELECT SUM(CAST(SummedIntensities AS FLOAT)) + FROM frames WHERE MsMsType = '9' + """ + conn = sqlite3.connect(self.sql_filepath) + c = conn.cursor() + out = c.execute(cmd).fetchall() + conn.close() + if not len(out) == 1: + raise RuntimeError("More than one total current found.") + + return out[0][0] + + def get_general_stats(self) -> dict: + """Gets the general stats from the .d file. + + Returns + ------- + dict + A dictionary of general stats. + """ + out = { + "AcquisitionDateTime": self.get_acquisition_datetime(), + "TotalCurrent": self.get_tot_current(), + "DIA_ScanCurrent": self.get_dia_scan_current(), + } + return out + + def write_tables(self, location): + logger.info(f"Writing tables for {self.basename}") + logger.info(f"Writing tables to {location}") + location = Path(location) + location.mkdir(parents=True, exist_ok=True) + tic = self.ms1_tic + bpc = self.ms1_bpc + npeaks = self.ms1_peaks + general_stats = self.get_general_stats() + + tic_path = location / f"tic_{self.basename}.tsv" + bpc_path = location / f"bpc_{self.basename}.tsv" + peaks_location = location / f"ms1_peaks_{self.basename}.tsv" + general_stats_location = location / f"general_stats_{self.basename}.tsv" + + logger.info(f"Writing {tic_path}") + with tic_path.open("w") as f: + for t, i in tic: + f.write(f"{t}\t{i}\n") + + logger.info(f"Writing {bpc_path}") + with bpc_path.open("w") as f: + for t, i in bpc: + f.write(f"{t}\t{i}\n") + + logger.info(f"Writing {peaks_location}") + with peaks_location.open("w") as f: + for t, i in npeaks: + f.write(f"{t}\t{i}\n") + + logger.info(f"Writing {general_stats_location}") + with general_stats_location.open("w") as f: + for k, v in general_stats.items(): + f.write(f"{k}\t{v}\n") + + +def main_single(input_path, output_path): + if input_path.is_dir() and str(input_path).endswith(".d"): + input_files = [input_path] + elif input_path.is_dir(): + input_files = list(input_path.glob("*.d")) + else: + raise RuntimeError(f"Input path {input_path} is not a file or directory.") + + output_path.mkdir(parents=True, exist_ok=True) + + for f in input_files: + d = DotDFile(f) + d.write_tables(output_path) + + logger.info(f"Writing {output_path / 'dotd_mqc.yml'}") + with (output_path / "dotd_mqc.yml").open("w") as f: + f.write(MQC_YML) + + if len(input_files) > 1: + logger.info("Writing aggregate general stats.") + main_aggregate(output_path, output_path) + + logger.info("Done.") + + +def main_aggregate(input_path, output_path): + # Find the general stats files + if not input_path.is_dir(): + logger.error(f"Input path {input_path} is not a 
directory.") + raise ValueError("Input path must be a directory.") + + general_stats_files = list(input_path.glob("general_stats_*.tsv")) + if not general_stats_files: + logger.error(f"No general stats files found in {input_path}.") + raise ValueError("No general stats files found.") + + # Merge them to a single table + # Effectively transposing the columns and adding column called file, + # which contains the file name from which the stats were acquired. + logger.info("Merging general stats files.") + general_stats = [] + for f in general_stats_files: + curr_stats = {"file": f.stem.replace("general_stats_", "")} + with f.open("r") as fh: + for line in fh: + line = line.strip() + if not line: + continue + k, v = line.split("\t") + curr_stats[k] = v + + general_stats.append(curr_stats) + + # Write the general stats file + logger.info("Writing general stats file.") + with (output_path / "general_stats.tsv").open("w") as f: + f.write("\t".join(general_stats[0].keys()) + "\n") + for s in general_stats: + f.write("\t".join(s.values()) + "\n") + + +if __name__ == "__main__": + # create the top-level parser + parser = argparse.ArgumentParser(add_help=True, usage=GENERAL_HELP) + parser.add_argument("--version", action="version", version=f"%(prog)s {VERSION}") + subparsers = parser.add_subparsers(required=True) + + # create the parser for the "single" command + parser_foo = subparsers.add_parser("single") + parser_foo.add_argument("input", help="Input .d file or directory of .d files.") + parser_foo.add_argument("output", help="Output directory.") + parser_foo.set_defaults(func=main_single) + + # create the parser for the "aggregate" command + parser_bar = subparsers.add_parser("aggregate") + parser_bar.add_argument("input", help="Directory that contains the general stats files to aggregate.") + parser_bar.add_argument("output", help="Output directory.") + parser_bar.set_defaults(func=main_aggregate) + + # parse the args and call whatever function was selected + args, unkargs = parser.parse_known_args() + if unkargs: + print(f"Unknown arguments: {unkargs}") + raise RuntimeError("Unknown arguments.") + + input_path = Path(args.input) + output_path = Path(args.output) + + args.func(input_path, output_path) diff --git a/main.nf b/main.nf index 88fd7784..7becd311 100644 --- a/main.nf +++ b/main.nf @@ -23,7 +23,7 @@ include { validateParameters; paramsHelp } from 'plugin/nf-validation' if (params.help) { def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' - def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + def String command = "nextflow run ${workflow.manifest.name} --input input_files.sdrf.tsv --database ~/dbs/human_fasta.fasta -profile docker" log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) System.exit(0) } diff --git a/modules/local/assemble_empirical_library/main.nf b/modules/local/assemble_empirical_library/main.nf index fdb9a467..91c4f0ad 100644 --- a/modules/local/assemble_empirical_library/main.nf +++ b/modules/local/assemble_empirical_library/main.nf @@ -7,7 +7,8 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - path(mzMLs) + // In this step the real files are passed, and not the names + path(ms_files) val(meta) path("quant/*") path(lib) @@ -22,14 +23,25 @@ process ASSEMBLE_EMPIRICAL_LIBRARY { script: def args = task.ext.args ?: '' - mass_acc_ms1 = 
meta.precursor_mass_tolerance_unit == "ppm" ? meta.precursor_mass_tolerance : 5 - mass_acc_ms2 = meta.fragment_mass_tolerance_unit == "ppm" ? meta.fragment_mass_tolerance : 13 + mass_acc_ms1 = meta['precursormasstoleranceunit'].toLowerCase().endsWith('ppm') ? meta['precursormasstolerance'] : 5 + mass_acc_ms2 = meta['fragmentmasstoleranceunit'].toLowerCase().endsWith('ppm') ? meta['fragmentmasstolerance'] : 13 - mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" - scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" + if (params.mass_acc_automatic) { + mass_acc = "--quick-mass-acc --individual-mass-acc" + } else { + mass_acc = "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" + } + scan_window = params.scan_window_automatic ? '--individual-windows' : "--window $params.scan_window" """ - diann --f ${(mzMLs as List).join(' --f ')} \\ + # Precursor Tolerance value was: ${meta['precursormasstolerance']} + # Fragment Tolerance value was: ${meta['fragmentmasstolerance']} + # Precursor Tolerance unit was: ${meta['precursormasstoleranceunit']} + # Fragment Tolerance unit was: ${meta['fragmentmasstoleranceunit']} + + ls -lcth + + diann --f ${(ms_files as List).join(' --f ')} \\ --lib ${lib} \\ --threads ${task.cpus} \\ --out-lib empirical_library.tsv \\ diff --git a/modules/local/decompress_dotd/main.nf b/modules/local/decompress_dotd/main.nf new file mode 100644 index 00000000..f8136c0b --- /dev/null +++ b/modules/local/decompress_dotd/main.nf @@ -0,0 +1,79 @@ + +process DECOMPRESS { + tag "$meta.mzml_id" + label 'process_low' + label 'error_retry' + + conda "conda-forge::gzip=1.12,conda-forge::tar=1.34,conda-forge::bzip2=1.0.8,conda-forge::unzip=6.0,conda-forge::xz=5.2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-796b0610595ad1995b121d0b85375902097b78d4:a3a3220eb9ee55710d743438b2ab9092867c98c6-0' : + 'quay.io/biocontainers/mulled-v2-796b0610595ad1995b121d0b85375902097b78d4:a3a3220eb9ee55710d743438b2ab9092867c98c6-0' }" + + stageInMode { + if (task.attempt == 1) { + if (executor == 'awsbatch') { + 'symlink' + } else { + 'link' + } + } else if (task.attempt == 2) { + if (executor == 'awsbatch') { + 'copy' + } else { + 'symlink' + } + } else { + 'copy' + } + } + + input: + tuple val(meta), path(compressed_file) + + output: + tuple val(meta), path('*.d'), emit: decompressed_files + path 'versions.yml', emit: version + path '*.log', emit: log + + script: + String prefix = task.ext.prefix ?: "${meta.mzml_id}" + + """ + function extract { + if [ -z "\$1" ]; then + echo "Usage: extract ." + else + if [ -f \$1 ]; then + case \$1 in + *.tar.gz) tar xvzf \$1 ;; + *.gz) gunzip \$1 ;; + *.tar) tar xvf \$1 ;; + *.zip) unzip \$1 ;; + *) echo "extract: '\$1' - unknown archive method" ;; + esac + else + echo "\$1 - file does not exist" + fi + fi + } + + tar --help 2>&1 | tee -a ${prefix}_decompression.log + gunzip --help 2>&1 | tee -a ${prefix}_decompression.log + zip --help 2>&1 | tee -a ${prefix}_decompression.log + echo "Unpacking..." 
| tee -a ${compressed_file.baseName}_decompression.log + + extract ${compressed_file} 2>&1 | tee -a ${compressed_file.baseName}_conversion.log + [ -d ${file(compressed_file.baseName).baseName}.d ] && \\ + echo "Found ${file(compressed_file.baseName).baseName}.d" || \\ + mv *.d ${file(compressed_file.baseName).baseName}.d + + ls -l | tee -a ${compressed_file.baseName}_decompression.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(gunzip --help 2>&1 | head -1 | grep -oE "\\d+\\.\\d+(\\.\\d+)?") + tar: \$(tar --help 2>&1 | head -1 | grep -oE "\\d+\\.\\d+(\\.\\d+)?") + zip: \$(zip --help | head -2 | tail -1 | grep -oE "\\d+\\.\\d+") + END_VERSIONS + """ +} diff --git a/modules/local/decompress_dotd/meta.yml b/modules/local/decompress_dotd/meta.yml new file mode 100644 index 00000000..b17737a2 --- /dev/null +++ b/modules/local/decompress_dotd/meta.yml @@ -0,0 +1,45 @@ +name: decompression +description: Decompress .tar/.gz files that contain a .d file/directory +keywords: + - raw + - bruker + - .d +tools: + - tar: + description: | + Generates and extracts archives. + homepage: https://www.gnu.org/software/tar/ + - gunzip: + description: | + Decompresses using zlib. + homepage: https://www.gnu.org/software/gzip/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information + - rawfile: + type: file + description: | + Bruker Raw file archived using tar + pattern: "*.{d.tar,.tar,.gz,.d.tar.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'MD5', enzyme:trypsin ] + - dotd: + type: path + description: Raw Bruker .d file + pattern: "*.d" + - log: + type: file + description: log file + pattern: "*.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@jspaezp" diff --git a/modules/local/diann_preliminary_analysis/main.nf b/modules/local/diann_preliminary_analysis/main.nf index 24441f43..f3844144 100644 --- a/modules/local/diann_preliminary_analysis/main.nf +++ b/modules/local/diann_preliminary_analysis/main.nf @@ -1,5 +1,5 @@ process DIANN_PRELIMINARY_ANALYSIS { - tag "$mzML.baseName" + tag "$ms_file.baseName" label 'process_high' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -7,7 +7,7 @@ process DIANN_PRELIMINARY_ANALYSIS { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - tuple val(meta), file(mzML), file(predict_tsv) + tuple val(meta), path(ms_file), path(predict_library) output: path "*.quant", emit: diann_quant @@ -20,16 +20,27 @@ process DIANN_PRELIMINARY_ANALYSIS { script: def args = task.ext.args ?: '' - mass_acc_ms1 = meta.precursor_mass_tolerance_unit == "ppm" ? meta.precursor_mass_tolerance : 5 - mass_acc_ms2 = meta.fragment_mass_tolerance_unit == "ppm" ? meta.fragment_mass_tolerance : 13 + // I am using here the ["key"] syntax, since the preprocessed meta makes + // was evaluating to null when using the dot notation. + mass_acc_ms1 = meta['precursormasstoleranceunit'].toLowerCase().endsWith('ppm') ? meta['precursormasstolerance'] : 5 + mass_acc_ms2 = meta['fragmentmasstoleranceunit'].toLowerCase().endsWith('ppm') ? meta['fragmentmasstolerance'] : 13 - mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" - scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" - time_corr_only = params.time_corr_only ? 
"--time-corr-only" : "" + if (params.mass_acc_automatic) { + mass_acc = '--quick-mass-acc --individual-mass-acc' + } else { + mass_acc = "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" + } + scan_window = params.scan_window_automatic ? '--individual-windows' : '--window $params.scan_window' + time_corr_only = params.time_corr_only ? '--time-corr-only' : '' """ - diann --lib ${predict_tsv} \\ - --f ${mzML} \\ + # Precursor Tolerance value was: ${meta['precursormasstolerance']} + # Fragment Tolerance value was: ${meta['fragmentmasstolerance']} + # Precursor Tolerance unit was: ${meta['precursormasstoleranceunit']} + # Fragment Tolerance unit was: ${meta['fragmentmasstoleranceunit']} + + diann --lib ${predict_library} \\ + --f ${ms_file} \\ --threads ${task.cpus} \\ --verbose $params.diann_debug \\ ${scan_window} \\ @@ -39,7 +50,7 @@ process DIANN_PRELIMINARY_ANALYSIS { ${mass_acc} \\ ${time_corr_only} \\ $args \\ - 2>&1 | tee ${mzML.baseName}_diann.log + 2>&1 | tee ${ms_file.baseName}_diann.log cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/diannconvert/main.nf b/modules/local/diannconvert/main.nf index 36f2ad6f..b9282db5 100644 --- a/modules/local/diannconvert/main.nf +++ b/modules/local/diannconvert/main.nf @@ -23,6 +23,7 @@ process DIANNCONVERT { path "*msstats_in.csv", emit: out_msstats path "*triqler_in.tsv", emit: out_triqler path "*.mzTab", emit: out_mztab + path "*.log", emit: log path "versions.yml", emit: version exec: @@ -36,6 +37,7 @@ process DIANNCONVERT { """ diann_convert.py convert \\ --folder ./ \\ + --exp_design ${exp_design} \\ --diann_version ./version/versions.yml \\ --dia_params "${dia_params}" \\ --charge $params.max_precursor_charge \\ diff --git a/modules/local/diannsummary/main.nf b/modules/local/diannsummary/main.nf index 438c4414..a4f34981 100644 --- a/modules/local/diannsummary/main.nf +++ b/modules/local/diannsummary/main.nf @@ -7,11 +7,15 @@ process DIANNSUMMARY { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - file(mzMLs) + // Note that the files are passed as names and not paths, this prevents them from being staged + // in the directory + val(ms_files) val(meta) - file(empirical_library) - file("quant/") - file(fasta) + path(empirical_library) + // The quant path is passed, and diann will use the files in the quant directory instead + // of the ones passed in ms_files. + path("quant/") + path(fasta) output: path "diann_report.tsv", emit: main_report @@ -27,17 +31,20 @@ process DIANNSUMMARY { script: def args = task.ext.args ?: '' - mass_acc_ms1 = meta.precursor_mass_tolerance_unit == "ppm" ? meta.precursor_mass_tolerance : 5 - mass_acc_ms2 = meta.fragment_mass_tolerance_unit == "ppm" ? meta.fragment_mass_tolerance : 13 + mass_acc_ms1 = meta["precursormasstoleranceunit"].toLowerCase().endsWith("ppm") ? meta["precursormasstolerance"] : 5 + mass_acc_ms2 = meta["fragmentmasstoleranceunit"].toLowerCase().endsWith("ppm") ? meta["fragmentmasstolerance"] : 13 mass_acc = params.mass_acc_automatic ? "--quick-mass-acc --individual-mass-acc" : "--mass-acc $mass_acc_ms2 --mass-acc-ms1 $mass_acc_ms1" scan_window = params.scan_window_automatic ? "--individual-windows" : "--window $params.scan_window" species_genes = params.species_genes ? "--species-genes": "" """ + # Notes: if .quant files are passed, mzml/.d files are not accessed, so the name needs to be passed but files + # do not need to pe present. 
+ diann --lib ${empirical_library} \\ --fasta ${fasta} \\ - --f ${(mzMLs as List).join(' --f ')} \\ + --f ${(ms_files as List).join(' --f ')} \\ --threads ${task.cpus} \\ --verbose $params.diann_debug \\ ${scan_window} \\ diff --git a/modules/local/dotd_to_mqc/main.nf b/modules/local/dotd_to_mqc/main.nf new file mode 100644 index 00000000..cdf080f7 --- /dev/null +++ b/modules/local/dotd_to_mqc/main.nf @@ -0,0 +1,70 @@ +/* groovylint-disable DuplicateStringLiteral */ +process DOTD2MQC_INDIVIDUAL { + tag "$meta.experiment_id" + label 'process_single' + + conda "base::python=3.10" + conda "conda-forge::python=3.10" + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/python:3.10" + } else { + container "quay.io/biocontainers/python:3.10" + } + + input: + // Note: This step can be optimized by staging only the + // .tdf file inside the .d directory. + // Thus reducing the data transfer of the rest of the .d + // directory. IN PARTICULAR the .tdf.bin + tuple val(meta), path(dot_d_file) + + output: + tuple path("dotd_mqc.yml"), path("*.tsv"), emit: dotd_mqc_data + path "general_stats*.tsv", emit: general_stats + path "versions.yml", emit: version + path "*.log", emit: log + + script: + def prefix = task.ext.prefix ?: "${meta.mzml_id}" + + """ + dotd_2_mqc.py single "${dot_d_file}" \${PWD} \\ + 2>&1 | tee dotd_2_mqc_${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dotd_2_mqc: \$(dotd_2_mqc.py --version | grep -oE "\\d\\.\\d\\.\\d") + dotd_2_mqc_python: \$(python --version | grep -oE "\\d\\.\\d\\.\\d") + END_VERSIONS + """ +} + + +process DOTD2MQC_AGGREGATE { + label 'process_single' + + conda 'base::python=3.10' + container 'continuumio/miniconda3:23.5.2-0-alpine' + + input: + path '*' // tsv files from DOTD2MQC_INDIVIDUAL + + output: + path 'general_stats.tsv', emit: dotd_mqc_data + path 'versions.yml', emit: version + path '*.log', emit: log + + script: + """ + ls -lcth + + dotd_2_mqc.py aggregate \${PWD} \${PWD} \\ + 2>&1 | tee dotd_2_mqc_agg.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dotd_2_mqc: \$(dotd_2_mqc.py --version | grep -oE "\\d\\.\\d\\.\\d") + dotd_2_mqc_python: \$(python --version | grep -oE "\\d\\.\\d\\.\\d") + END_VERSIONS + """ +} diff --git a/modules/local/dotd_to_mqc/meta.yml b/modules/local/dotd_to_mqc/meta.yml new file mode 100644 index 00000000..e69de29b diff --git a/modules/local/generate_diann_cfg/main.nf b/modules/local/generate_diann_cfg/main.nf index e9cb1cde..41df3922 100644 --- a/modules/local/generate_diann_cfg/main.nf +++ b/modules/local/generate_diann_cfg/main.nf @@ -2,9 +2,9 @@ process GENERATE_DIANN_CFG { tag "$meta.experiment_id" label 'process_low' - conda "conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.22" + conda 'conda-forge::pandas_schema bioconda::sdrf-pipelines=0.0.22' if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.22--pyhdfd78af_0" + container 'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.22--pyhdfd78af_0' } else { container "biocontainers/sdrf-pipelines:0.0.22--pyhdfd78af_0" } @@ -13,9 +13,9 @@ process GENERATE_DIANN_CFG { val(meta) output: - path "diann_config.cfg", emit: diann_cfg - path "versions.yml", emit: version - path "*.log" + path 'diann_config.cfg', emit: diann_cfg + path 'versions.yml', emit: version + path '*.log' script: def args = 
task.ext.args ?: '' diff --git a/modules/local/individual_final_analysis/main.nf b/modules/local/individual_final_analysis/main.nf index ecb9b69f..51eb65bb 100644 --- a/modules/local/individual_final_analysis/main.nf +++ b/modules/local/individual_final_analysis/main.nf @@ -1,5 +1,5 @@ process INDIVIDUAL_FINAL_ANALYSIS { - tag "$mzML.baseName" + tag "$ms_file.baseName" label 'process_high' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -7,7 +7,7 @@ process INDIVIDUAL_FINAL_ANALYSIS { 'docker.io/biocontainers/diann:v1.8.1_cv1' }" input: - tuple val(meta), file(mzML), file(fasta), file(diann_log), file(library) + tuple val(meta), path(ms_file), path(fasta), path(diann_log), path(library) output: path "*.quant", emit: diann_quant @@ -19,8 +19,8 @@ process INDIVIDUAL_FINAL_ANALYSIS { script: def args = task.ext.args ?: '' - mass_acc_ms1 = meta.precursor_mass_tolerance_unit == "ppm" ? meta.precursor_mass_tolerance : 5 - mass_acc_ms2 = meta.fragment_mass_tolerance_unit == "ppm" ? meta.fragment_mass_tolerance : 13 + mass_acc_ms1 = meta["precursormasstoleranceunit"].toLowerCase().endsWith("ppm") ? meta["precursormasstolerance"] : 5 + mass_acc_ms2 = meta["fragmentmasstoleranceunit"].toLowerCase().endsWith("ppm") ? meta["fragmentmasstolerance"] : 13 scan_window = params.scan_window if (params.mass_acc_automatic | params.scan_window_automatic){ @@ -31,20 +31,20 @@ process INDIVIDUAL_FINAL_ANALYSIS { """ diann --lib ${library} \\ - --f ${mzML} \\ + --f ${ms_file} \\ --fasta ${fasta} \\ --threads ${task.cpus} \\ --verbose $params.diann_debug \\ --temp ./ \\ - --mass-acc \$(echo ${mass_acc_ms2}) \\ - --mass-acc-ms1 \$(echo ${mass_acc_ms1}) \\ - --window \$(echo ${scan_window}) \\ + --mass-acc ${mass_acc_ms2} \\ + --mass-acc-ms1 ${mass_acc_ms1} \\ + --window ${scan_window} \\ --no-ifs-removal \\ --no-main-report \\ --relaxed-prot-inf \\ --pg-level $params.pg_level \\ $args \\ - 2>&1 | tee ${mzML.baseName}_final_diann.log + 2>&1 | tee ${ms_file.baseName}_final_diann.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/pmultiqc/main.nf b/modules/local/pmultiqc/main.nf index 9e234214..0d12625e 100644 --- a/modules/local/pmultiqc/main.nf +++ b/modules/local/pmultiqc/main.nf @@ -26,6 +26,16 @@ process PMULTIQC { def disable_idxml_index = (params.enable_pmultiqc) && (params.pmultiqc_idxml_skip) ? 
"--ignored_idxml" : "" """ + set -x + set -e + + # leaving here to ease debugging + ls -lcth * + + echo ">>>>>>>>> Experimental Design <<<<<<<<<" + cat results/*openms_design.tsv + + echo ">>>>>>>>> Running Multiqc <<<<<<<<<" multiqc \\ -f \\ --config ./results/multiqc_config.yml \\ diff --git a/modules/local/sdrfparsing/main.nf b/modules/local/sdrfparsing/main.nf index 0bf4b01d..2f7989bb 100644 --- a/modules/local/sdrfparsing/main.nf +++ b/modules/local/sdrfparsing/main.nf @@ -18,12 +18,23 @@ process SDRFPARSING { script: def args = task.ext.args ?: '' + if (params.convert_dotd) { + extensionconversions = ",.d.gz:.mzML,.d.tar.gz:.mzML,d.tar:.mzML,.d.zip:.mzML,.d:.mzML" + } else { + extensionconversions = ",.gz:,.tar.gz:,.tar:,.zip:" + } """ ## -t2 since the one-table format parser is broken in OpenMS2.5 ## -l for legacy behavior to always add sample columns - parse_sdrf convert-openms -t2 -l --extension_convert raw:mzML -s ${sdrf} 2>&1 | tee ${sdrf.baseName}_parsing.log + parse_sdrf convert-openms \\ + -t2 -l \\ + --extension_convert raw:mzML$extensionconversions \\ + -s ${sdrf} \\ + $args \\ + 2>&1 | tee ${sdrf.baseName}_parsing.log + mv openms.tsv ${sdrf.baseName}_config.tsv mv experimental_design.tsv ${sdrf.baseName}_openms_design.tsv diff --git a/modules/local/tdf2mzml/main.nf b/modules/local/tdf2mzml/main.nf new file mode 100644 index 00000000..cd67a86e --- /dev/null +++ b/modules/local/tdf2mzml/main.nf @@ -0,0 +1,33 @@ + +process TDF2MZML { + tag "$meta.mzml_id" + label 'process_single' + label 'error_retry' + + container 'mfreitas/tdf2mzml:latest' // I don't know which stable tag to use... + + input: + tuple val(meta), path(rawfile) + + output: + tuple val(meta), path("*.mzML"), emit: mzmls_converted + tuple val(meta), path("*.d"), emit: dotd_files + path "versions.yml", emit: version + path "*.log", emit: log + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.mzml_id}" + + """ + echo "Converting..." | tee --append ${rawfile.baseName}_conversion.log + tdf2mzml.py -i *.d 2>&1 | tee --append ${rawfile.baseName}_conversion.log + mv *.mzml ${file(rawfile.baseName).baseName}.mzML + mv *.d ${file(rawfile.baseName).baseName}.d + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tdf2mzml.py: \$(tdf2mzml.py --version) + END_VERSIONS + """ +} diff --git a/modules/local/tdf2mzml/meta.yml b/modules/local/tdf2mzml/meta.yml new file mode 100644 index 00000000..4bbf8e3f --- /dev/null +++ b/modules/local/tdf2mzml/meta.yml @@ -0,0 +1,42 @@ +name: tdf2mzml +description: convert raw bruker files to mzml files +keywords: + - raw + - mzML + - .d +tools: + - tdf2mzml: + description: | + It takes a bruker .d raw file as input and outputs indexed mzML + homepage: https://github.com/mafreitas/tdf2mzml + documentation: https://github.com/mafreitas/tdf2mzml +input: + - meta: + type: map + description: | + Groovy Map containing sample information + - rawfile: + type: file + description: | + Bruker Raw file archived using tar + pattern: "*.d.tar" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'MD5', enzyme:trypsin ] + - mzml: + type: file + description: indexed mzML + pattern: "*.mzML" + - log: + type: file + description: log file + pattern: "*.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@jspaezp" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index b02fa23c..f5690c52 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -58,6 +58,11 @@ def main(): "yaml": yaml.__version__, } + with open("$versions") as f: + # load as text and print for debugging + versions_text = f.read() + print(versions_text) + with open("$versions") as f: versions_by_process = yaml.safe_load(f) | versions_this_module diff --git a/nextflow.config b/nextflow.config index 39eab531..0886ae5f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -156,6 +156,9 @@ params { quantify_decoys = false id_transfer_threshold = 0.50 // only used if targeted_only is set to false (default) + // Bruker data + convert_dotd = false + // DIA-NN diann_debug = 3 scan_window = 8 @@ -166,7 +169,8 @@ params { mass_acc_automatic = true pg_level = 2 species_genes = false - diann_normalize = true + diann_normalize = true + diann_speclib = '' // MSstats general options msstats_remove_one_feat_prot = true diff --git a/nextflow_schema.json b/nextflow_schema.json index e1aed86b..d905cf35 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -152,6 +152,12 @@ "fa_icon": "fas fa-font", "help_text": "Which MS levels to pick as comma separated list, e.g. `--peakpicking_ms_levels 1,2`. Leave empty for auto-detection." 
}, + "convert_dotd": { + "type": "boolean", + "description": "Convert bruker .d files to mzML", + "fa_icon": "far fa-check-square", + "help_text": "Whether to convert raw .d bruker files to .mzML" + }, "reindex_mzml": { "type": "boolean", "default": true, @@ -919,6 +925,13 @@ "fa_icon": "far fa-check-square", "default": false }, + "diann_speclib": { + "type": "string", + "description": "The spectral library to use for DIA-NN", + "fa_icon": "fas fa-file", + "help_text": "If passed, will use that spectral library to carry out the DIA-NN search, instead of predicting one from the fasta file.", + "hidden": false + }, "diann_debug": { "type": "integer", "description": "Debug level", diff --git a/pyproject.toml b/pyproject.toml index 0d62beb6..814dd46f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,3 +8,6 @@ target_version = ["py37", "py38", "py39", "py310"] profile = "black" known_first_party = ["nf_core"] multi_line_output = 3 + +[tool.ruff] +line-length = 120 diff --git a/subworkflows/local/file_preparation.nf b/subworkflows/local/file_preparation.nf index 341ba3ee..b8592031 100644 --- a/subworkflows/local/file_preparation.nf +++ b/subworkflows/local/file_preparation.nf @@ -3,28 +3,46 @@ // include { THERMORAWFILEPARSER } from '../../modules/local/thermorawfileparser/main' +include { TDF2MZML } from '../../modules/local/tdf2mzml/main' +include { DECOMPRESS } from '../../modules/local/decompress_dotd/main' +include { DOTD2MQC_INDIVIDUAL } from '../../modules/local/dotd_to_mqc/main' +include { DOTD2MQC_AGGREGATE } from '../../modules/local/dotd_to_mqc/main' include { MZMLINDEXING } from '../../modules/local/openms/mzmlindexing/main' include { MZMLSTATISTICS } from '../../modules/local/mzmlstatistics/main' include { OPENMSPEAKPICKER } from '../../modules/local/openms/openmspeakpicker/main' workflow FILE_PREPARATION { take: - ch_mzmls // channel: [ val(meta), raw/mzml ] + ch_rawfiles // channel: [ val(meta), raw/mzml/d.tar ] main: ch_versions = Channel.empty() ch_results = Channel.empty() ch_statistics = Channel.empty() + ch_mqc_data = Channel.empty() + + // Divide the compressed files + ch_rawfiles + .branch { + dottar: WorkflowQuantms.hasExtension(it[1], '.tar') + dotzip: WorkflowQuantms.hasExtension(it[1], '.zip') + gz: WorkflowQuantms.hasExtension(it[1], '.gz') + uncompressed: true + }.set { ch_branched_input } + + compressed_files = ch_branched_input.dottar.mix(ch_branched_input.dotzip, ch_branched_input.gz) + DECOMPRESS(compressed_files) + ch_versions = ch_versions.mix(DECOMPRESS.out.version) + ch_rawfiles = ch_branched_input.uncompressed.mix(DECOMPRESS.out.decompressed_files) // // Divide mzml files - // - ch_mzmls + ch_rawfiles .branch { - raw: WorkflowQuantms.hasExtension(it[1], 'raw') - mzML: WorkflowQuantms.hasExtension(it[1], 'mzML') - } - .set { ch_branched_input } + raw: WorkflowQuantms.hasExtension(it[1], '.raw') + mzML: WorkflowQuantms.hasExtension(it[1], '.mzML') + dotd: WorkflowQuantms.hasExtension(it[1], '.d') + }.set { ch_branched_input } // Note: we used to always index mzMLs if not already indexed but due to // either a bug or limitation in nextflow @@ -43,27 +61,53 @@ workflow FILE_PREPARATION { } THERMORAWFILEPARSER( ch_branched_input.raw ) + // Output is + // {'mzmls_converted': Tuple[val(meta), path(mzml)], + // 'version': Path(versions.yml), + // 'log': Path(*.txt)} + + // Where meta is the same as the input meta ch_versions = ch_versions.mix(THERMORAWFILEPARSER.out.version) ch_results = ch_results.mix(THERMORAWFILEPARSER.out.mzmls_converted) - ch_results.map{ 
it -> [it[0], it[1]] }.set{ ch_mzml } + ch_results.map{ it -> [it[0], it[1]] }.set{ indexed_mzml_bundle } + + // Exctract qc data from .d files + DOTD2MQC_INDIVIDUAL(ch_branched_input.dotd) + // The map extracts the tsv files from the tuple, the other elem is the yml config. + ch_mqc_data = ch_mqc_data.mix(DOTD2MQC_INDIVIDUAL.out.dotd_mqc_data.map{ it -> it[1] }.collect()) + DOTD2MQC_AGGREGATE(DOTD2MQC_INDIVIDUAL.out.general_stats.collect()) + ch_mqc_data = ch_mqc_data.mix(DOTD2MQC_AGGREGATE.out.dotd_mqc_data.collect()) + ch_versions = ch_versions.mix(DOTD2MQC_INDIVIDUAL.out.version) + ch_versions = ch_versions.mix(DOTD2MQC_AGGREGATE.out.version) - MZMLSTATISTICS( ch_mzml ) + // Convert .d files to mzML + if (params.convert_dotd) { + TDF2MZML( ch_branched_input.dotd ) + ch_versions = ch_versions.mix(TDF2MZML.out.version) + ch_results = indexed_mzml_bundle.mix(TDF2MZML.out.mzmls_converted) + // indexed_mzml_bundle = indexed_mzml_bundle.mix(TDF2MZML.out.mzmls_converted) + } else{ + ch_results = indexed_mzml_bundle.mix(ch_branched_input.dotd) + } + + MZMLSTATISTICS(indexed_mzml_bundle) ch_statistics = ch_statistics.mix(MZMLSTATISTICS.out.mzml_statistics.collect()) ch_versions = ch_versions.mix(MZMLSTATISTICS.out.version) if (params.openms_peakpicking){ + // If the peak picker is enabled, it will over-write not bypass the .d files OPENMSPEAKPICKER ( - ch_results + indexed_mzml_bundle ) ch_versions = ch_versions.mix(OPENMSPEAKPICKER.out.version) ch_results = OPENMSPEAKPICKER.out.mzmls_picked } - emit: - results = ch_results // channel: [val(mzml_id), indexedmzml] + results = ch_results // channel: [val(mzml_id), indexedmzml|.d.tar] statistics = ch_statistics // channel: [ *_mzml_info.tsv ] + mqc_custom_data = ch_mqc_data // channel: [ *.tsv ] version = ch_versions // channel: [ *.version.txt ] } diff --git a/workflows/dia.nf b/workflows/dia.nf index feb08918..506d646d 100644 --- a/workflows/dia.nf +++ b/workflows/dia.nf @@ -20,7 +20,6 @@ include { DIANNSUMMARY } from '../modules/local/diannsummary/m // SUBWORKFLOWS: Consisting of a mix of local and nf-core/modules // - /* ======================================================================================== RUN MAIN WORKFLOW @@ -39,15 +38,16 @@ workflow DIA { main: ch_software_versions = Channel.empty() - Channel.fromPath(params.database).set{ ch_searchdb } + Channel.fromPath(params.database).set { ch_searchdb } ch_file_preparation_results.multiMap { - meta: preprocessed_meta(it[0]) - mzml: it[1] - } - .set { ch_result } + result -> + meta: preprocessed_meta(result[0]) + ms_file:result[1] + } + .set { ch_result } - meta = ch_result.meta.unique {it[0]} + meta = ch_result.meta.unique { it[0] } DIANNCFG(meta) ch_software_versions = ch_software_versions.mix(DIANNCFG.out.version.ifEmpty(null)) @@ -55,48 +55,75 @@ workflow DIA { // // MODULE: SILICOLIBRARYGENERATION // - SILICOLIBRARYGENERATION(ch_searchdb, DIANNCFG.out.diann_cfg) + if (params.diann_speclib) { + speclib = Channel.fromPath(params.diann_speclib) + } else { + SILICOLIBRARYGENERATION(ch_searchdb, DIANNCFG.out.diann_cfg) + speclib = SILICOLIBRARYGENERATION.out.predict_speclib + } // // MODULE: DIANN_PRELIMINARY_ANALYSIS // - DIANN_PRELIMINARY_ANALYSIS(ch_file_preparation_results.combine(SILICOLIBRARYGENERATION.out.predict_speclib)) + DIANN_PRELIMINARY_ANALYSIS(ch_file_preparation_results.combine(speclib)) ch_software_versions = ch_software_versions.mix(DIANN_PRELIMINARY_ANALYSIS.out.version.ifEmpty(null)) // // MODULE: ASSEMBLE_EMPIRICAL_LIBRARY // - 
ASSEMBLE_EMPIRICAL_LIBRARY(ch_result.mzml.collect(),
-        meta,
-        DIANN_PRELIMINARY_ANALYSIS.out.diann_quant.collect(),
-        SILICOLIBRARYGENERATION.out.predict_speclib
-    )
+    // Order matters in DIANN; this should be sorted for reproducible results.
+    ASSEMBLE_EMPIRICAL_LIBRARY(
+        ch_result.ms_file.collect(),
+        meta,
+        DIANN_PRELIMINARY_ANALYSIS.out.diann_quant.collect(),
+        speclib
+    )
     ch_software_versions = ch_software_versions.mix(ASSEMBLE_EMPIRICAL_LIBRARY.out.version.ifEmpty(null))

     //
     // MODULE: INDIVIDUAL_FINAL_ANALYSIS
     //
-    INDIVIDUAL_FINAL_ANALYSIS(ch_file_preparation_results.combine(ch_searchdb).combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.log).combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library))
+    INDIVIDUAL_FINAL_ANALYSIS(
+        ch_file_preparation_results
+            .combine(ch_searchdb)
+            .combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.log)
+            .combine(ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library)
+    )
     ch_software_versions = ch_software_versions.mix(INDIVIDUAL_FINAL_ANALYSIS.out.version.ifEmpty(null))

     //
     // MODULE: DIANNSUMMARY
     //
-    DIANNSUMMARY(ch_result.mzml.collect(), meta, ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library,
+    // Order matters in DIANN; this should be sorted for reproducible results.
+    // NOTE: I am getting here the names of the ms files, not the paths, since the
+    // next step only needs the name (it uses the cached .quant files).
+    // Also note that I am converting to a file object here because, when executing
+    // locally, every element in ch_result is a string, whilst on cloud it is a path.
+    ch_result
+        .ms_file.map { msfile -> file(msfile).getName() }
+        .collect()
+        .set { ms_file_names }
+    DIANNSUMMARY(ms_file_names, meta, ASSEMBLE_EMPIRICAL_LIBRARY.out.empirical_library,
         INDIVIDUAL_FINAL_ANALYSIS.out.diann_quant.collect(), ch_searchdb)
     ch_software_versions = ch_software_versions.mix(DIANNSUMMARY.out.version.ifEmpty(null))

     //
     // MODULE: DIANNCONVERT
     //
-    DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, ch_mzml_info,
-        meta, ch_searchdb, DIANNSUMMARY.out.version)
+    DIANNCONVERT(
+        DIANNSUMMARY.out.main_report, ch_expdesign,
+        DIANNSUMMARY.out.pg_matrix,
+        DIANNSUMMARY.out.pr_matrix, ch_mzml_info,
+        meta,
+        ch_searchdb,
+        DIANNSUMMARY.out.version
+    )
     ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null))

     //
     // MODULE: MSSTATS
     ch_msstats_out = Channel.empty()
-    if(!params.skip_post_msstats){
+    if (!params.skip_post_msstats) {
         MSSTATS(DIANNCONVERT.out.out_msstats)
         ch_msstats_out = MSSTATS.out.msstats_csv
         ch_software_versions = ch_software_versions.mix(MSSTATS.out.version.ifEmpty(null))
@@ -108,24 +135,22 @@ workflow DIA {
         msstats_in = DIANNCONVERT.out.out_msstats
         out_triqler = DIANNCONVERT.out.out_triqler
         msstats_out = ch_msstats_out
-
 }
-
 // remove meta.id to make sure cache identical HashCode
-def preprocessed_meta(LinkedHashMap meta){
+def preprocessed_meta(LinkedHashMap meta) {
     def parameters = [:]
-    parameters["experiment_id"] = meta.experiment_id
-    parameters["acquisition_method"] = meta.acquisition_method
-    parameters["dissociationmethod"] = meta.dissociationmethod
-    parameters["labelling_type"] = meta.labelling_type
-    parameters["fixedmodifications"] = meta.fixedmodifications
-    parameters["variablemodifications"] = meta.variablemodifications
-    parameters["precursormasstolerance"] = meta.precursormasstolerance
-    parameters["precursormasstoleranceunit"] = meta.precursormasstoleranceunit
-    parameters["fragmentmasstolerance"] = meta.fragmentmasstolerance
-
parameters["fragmentmasstoleranceunit"] = meta.fragmentmasstoleranceunit - parameters["enzyme"] = meta.enzyme + parameters['experiment_id'] = meta.experiment_id + parameters['acquisition_method'] = meta.acquisition_method + parameters['dissociationmethod'] = meta.dissociationmethod + parameters['labelling_type'] = meta.labelling_type + parameters['fixedmodifications'] = meta.fixedmodifications + parameters['variablemodifications'] = meta.variablemodifications + parameters['precursormasstolerance'] = meta.precursormasstolerance + parameters['precursormasstoleranceunit'] = meta.precursormasstoleranceunit + parameters['fragmentmasstolerance'] = meta.fragmentmasstolerance + parameters['fragmentmasstoleranceunit'] = meta.fragmentmasstoleranceunit + parameters['enzyme'] = meta.enzyme return parameters } diff --git a/workflows/quantms.nf b/workflows/quantms.nf index 1e41ed38..d8d1eb3a 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -187,6 +187,7 @@ workflow QUANTMS { ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(FILE_PREPARATION.out.statistics) + ch_multiqc_files = ch_multiqc_files.mix(FILE_PREPARATION.out.mqc_custom_data) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_quantms_logo = file("$projectDir/assets/nf-core-quantms_logo_light.png")