From 6b8d3d38163595a3e3ebf98e85567a004f4affd3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 15 Oct 2024 16:19:38 +0200 Subject: [PATCH 01/42] add print links method to LinkGraph, improve LinkGraph string representation --- src/nplinker/scoring/link_graph.py | 102 ++++++++++++++++++++------ tests/unit/scoring/test_link_graph.py | 17 +++++ 2 files changed, 97 insertions(+), 22 deletions(-) diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 50151997..90336635 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Sequence from functools import wraps +from os import PathLike from typing import Union from networkx import Graph from tabulate import tabulate @@ -76,17 +77,17 @@ def __init__(self) -> None: Display the empty LinkGraph object: >>> lg - | | Object 1 | Object 2 | Metcalf Score | Rosetta Score | - |----|------------|------------|-----------------|-----------------| + | | Genomic Object Type | Genomic Object ID | Metabolomic Object Type | Metabolomic Object ID | Metcalf Score | Rosetta Score | + |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------| Add a link between a GCF and a Spectrum object: >>> lg.add_link(gcf, spectrum, metcalf=Score("metcalf", 1.0, {"cutoff": 0.5})) Display all links in LinkGraph object: >>> lg - | | Object 1 | Object 2 | Metcalf Score | Rosetta Score | - |----|--------------|------------------------|-----------------|-----------------| - | 1 | GCF(id=gcf1) | Spectrum(id=spectrum1) | 1 | - | + | | Genomic Object Type | Genomic Object ID | Metabolomic Object Type | Metabolomic Object ID | Metcalf Score | Rosetta Score | + |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------| + | 1 | GCF | 1 | Spectrum | 1 
| 1.00 | - | Get all links for a given object: >>> lg[gcf] @@ -285,35 +286,92 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: if link_data is not None: lg.add_link(u, v, **link_data) - def _get_table_repr(self) -> str: - """Generate a table representation of the LinkGraph. + def get_table_data(self, display_limit: int | None = None) -> list[dict]: + """Generate the table data for the LinkGraph. + + This method iterates over the links in the LinkGraph and constructs a table + containing information about genomic and metabolomic objects, as well as their + associated scores. Each row in the table represents a link between a genomic + object and a metabolomic object. - The table is truncated to 60 links. + Args: + display_limit (int | None): The maximum number of rows to include in the + table. If None, all rows are included. + + Returns: + list: A list of dictionaries, where each dictionary contains + the following keys: + - Index (int) + - Genomic Object Type (str) + - Genomic Object ID (str or int) + - Metabolomic Object Type (str) + - Metabolomic Object ID (str or int) + - Metcalf Score (str, formatted to 2 decimal places, or "-") + - Rosetta Score (str, formatted to 2 decimal places, or "-") """ - headers = ["", "Object 1", "Object 2", "Metcalf Score", "Rosetta Score"] + genomic_object_classes = (GCF,) + table_data = [] - display_limit = 60 for index, (u, v, data) in enumerate(self.links, start=1): + genomic_object = u if isinstance(u, genomic_object_classes) else v + metabolomic_object = v if isinstance(u, genomic_object_classes) else u metcalf_score = data.get("metcalf") rosetta_score = data.get("rosetta") - row = [ - index, - str(u if isinstance(u, GCF) else v), - str(v if isinstance(u, GCF) else u), - f"{metcalf_score.value:.2f}" if metcalf_score else "-", - f"{rosetta_score.value:.2f}" if rosetta_score else "-", - ] - table_data.append(row) - - if index == display_limit: + table_data.append( + { + "Index": index, + "Genomic 
Object Type": genomic_object.__class__.__name__, + "Genomic Object ID": genomic_object.id, + "Metabolomic Object Type": metabolomic_object.__class__.__name__, + "Metabolomic Object ID": metabolomic_object.id, + "Metcalf Score": f"{metcalf_score.value:.2f}" if metcalf_score else "-", + "Rosetta Score": f"{rosetta_score.value:.2f}" if rosetta_score else "-", + } + ) + + if display_limit is not None and index == display_limit: break - table = tabulate(table_data, headers=headers, tablefmt="github", stralign="right") + return table_data + + def _get_table_repr(self, display_limit: int | None = 60) -> str: + """Generate a table representation of the LinkGraph. + + Args: + display_limit: The maximum number of links to display in the table. Defaults to 60. + + Returns: + str: A string representation of the table in GitHub-flavored markdown format. If the number of links + exceeds the display limit, the table is truncated and an additional line indicating the total number + of links is appended. + """ + table = tabulate( + self.get_table_data(display_limit), + headers="keys", + tablefmt="github", + stralign="right", + ) if len(self.links) > display_limit: truncated_info = f"...\n[ {len(self.links)} links ]" - return f"{table}\n{truncated_info}" + table += f"\n{truncated_info}" return table + + def print_links(self, file: str | PathLike) -> None: + """Print the links in the LinkGraph to a file. + + Args: + file: the file to write the links to. 
+ + Examples: + >>> lg.print_links("links.tsv") + """ + table_data = self.get_table_data() + headers = table_data[0].keys() + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for row in table_data: + f.write("\t".join(str(row[h]) for h in headers) + "\n") diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index 9f7c9d7d..4745c856 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -112,3 +112,20 @@ def test_filter(gcfs, spectra, score): # test filtering with GCFs and Spectra lg_filtered = lg.filter(u_nodes, v_nodes) assert len(lg_filtered) == 4 + + +def test_get_table_data(lg, gcfs, spectra, score): + table_data = lg.get_table_data() + assert type(table_data) is list + assert type(table_data[0]) is dict + assert table_data == [ + { + "Index": 1, + "Genomic Object Type": gcfs[0].__class__.__name__, + "Genomic Object ID": gcfs[0].id, + "Metabolomic Object Type": spectra[0].__class__.__name__, + "Metabolomic Object ID": spectra[0].id, + "Metcalf Score": f"{score.value:.2f}", + "Rosetta Score": "-", + }, + ] From cdd26c3330c6867c1be09b1a5ed90e3a16088fa3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 17:17:38 +0200 Subject: [PATCH 02/42] feat: add a method to print tabular results files --- src/nplinker/genomics/bgc.py | 21 ++++++++++++ src/nplinker/metabolomics/spectrum.py | 16 +++++++++ src/nplinker/nplinker.py | 49 +++++++++++++++++++++++++++ src/nplinker/scoring/link_graph.py | 14 ++++---- 4 files changed, 93 insertions(+), 7 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 08978587..57161d07 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -192,3 +192,24 @@ def aa_predictions(self) -> list: for p in predict_aa(self.antismash_file): self._aa_predictions[p[0]] = p[1] return [self._aa_predictions] + + def to_dict(self) -> dict: + """Convert the BGC object to a dictionary 
that can be used to export the results. + + Returns: + A dictionary containing relavant information about the BGC object. + """ + gcf_ids = [gcf.id for gcf in self.parents if gcf.id is not None] + gcf_bsc = [gcf.bigscape_class for gcf in self.parents if gcf.bigscape_class is not None] + + return { + "GCF_id": ", ".join(gcf_ids) if gcf_ids else None, + "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else None, + "BGC_name": self.id, + "strain_id": self.strain.id, + "description": self.description, + "antismash_id": self.antismash_id, + "antismash_region": self.antismash_region, + "antismash_cluster_type": ", ".join(self.product_prediction), + "mibig_bgc_class": self.mibig_bgc_class, + } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 61d8d421..a2891a2b 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -97,3 +97,19 @@ def has_strain(self, strain: Strain) -> bool: True when the given strain exist in the spectrum. """ return strain in self.strains + + def to_dict(self) -> dict: + """Convert the Spectrum object to a dictionary that can be used to export the results. + + Returns: + A dictionary containing relavant information about the Spectrum object. + """ + return { + "spectrum_id": self.id, + "num_strains_with_spectrum": len(self.strains), + "precursor_mz": self.precursor_mz, + "rt": self.rt, + "molecular_family": self.family.id if self.family else None, + "gnps_id": self.gnps_id, + "gnps_annotations": self.gnps_annotations, + } diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index a7146dcc..f15ee1dd 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -355,3 +355,52 @@ def save_data( data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links) with open(file, "wb") as f: pickle.dump(data, f) + + def print_bgcs(self, file: str | PathLike) -> None: + """Prints the BGC data to a specified file in tab-separated format. 
+ + Args: + file: The path to the file where the BGC data will be printed. + """ + headers = self.bgcs[0].to_dict().keys() + + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for bgc in self.bgcs: + row_data = bgc.to_dict() + f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + + def print_gcfs(self, file: str | PathLike) -> None: + """Prints the GCF data to a specified file in tab-separated format. + + Args: + file: The path to the file where the GCF data will be printed. + """ + headers = self.gcfs[0].to_dict().keys() + + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for gcf in self.gcfs: + row_data = gcf.to_dict() + f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + + def print_spectra(self, file: str | PathLike) -> None: + """Prints the Spectrum data to a specified file in tab-separated format. + + Args: + file: The path to the file where the Spectrum data will be printed. + """ + headers = self.spectra[0].to_dict().keys() + + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for spectrum in self.spectra: + row_data = spectrum.to_dict() + f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + + def print_results(self, lg: LinkGraph | None = None) -> None: + """Prints the results to the output directory in tab-separated format.""" + self.print_bgcs(self._output_dir / "genomics_data.tsv") + self.print_spectra(self._output_dir / "metabolomics_data.tsv") + if lg is not None: + lg.print_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 90336635..fd1db438 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -321,13 +321,13 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict]: table_data.append( { - "Index": index, - "Genomic Object Type": genomic_object.__class__.__name__, - "Genomic Object ID": genomic_object.id, - "Metabolomic Object Type": 
metabolomic_object.__class__.__name__, - "Metabolomic Object ID": metabolomic_object.id, - "Metcalf Score": f"{metcalf_score.value:.2f}" if metcalf_score else "-", - "Rosetta Score": f"{rosetta_score.value:.2f}" if rosetta_score else "-", + "index": index, + "genomic_object_type": genomic_object.__class__.__name__, + "genomic_object_id": genomic_object.id, + "metabolomic_object_type": metabolomic_object.__class__.__name__, + "metabolomic_object_id": metabolomic_object.id, + "metcalf_score": f"{metcalf_score.value:.2f}" if metcalf_score else "-", + "rosetta_score": f"{rosetta_score.value:.2f}" if rosetta_score else "-", } ) From ec8b8ae0a12885d6ddbc28ebb5b3c90b156e1140 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 17:26:58 +0200 Subject: [PATCH 03/42] improve method names and docstrings, remove unused method to export gcf file --- src/nplinker/nplinker.py | 47 +++++++++++++++--------------- src/nplinker/scoring/link_graph.py | 4 +-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index f15ee1dd..9e87ed92 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -356,11 +356,13 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def print_bgcs(self, file: str | PathLike) -> None: - """Prints the BGC data to a specified file in tab-separated format. + def export_genomics_data(self, file: str | PathLike) -> None: + """Exports the genomics data to a specified file in tab-separated format. + + Each row in the file corresponds to a BGC object. Args: - file: The path to the file where the BGC data will be printed. + file: The path to the file where the genomics data will be printed. 
""" headers = self.bgcs[0].to_dict().keys() @@ -370,25 +372,13 @@ def print_bgcs(self, file: str | PathLike) -> None: row_data = bgc.to_dict() f.write("\t".join(str(row_data[h]) for h in headers) + "\n") - def print_gcfs(self, file: str | PathLike) -> None: - """Prints the GCF data to a specified file in tab-separated format. - - Args: - file: The path to the file where the GCF data will be printed. - """ - headers = self.gcfs[0].to_dict().keys() - - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for gcf in self.gcfs: - row_data = gcf.to_dict() - f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + def export_metabolomics_data(self, file: str | PathLike) -> None: + """Exports the metabolomics data to a specified file in tab-separated format. - def print_spectra(self, file: str | PathLike) -> None: - """Prints the Spectrum data to a specified file in tab-separated format. + Each row in the file corresponds to a Spectrum object. Args: - file: The path to the file where the Spectrum data will be printed. + file: The path to the file where the metabolomics data will be printed. """ headers = self.spectra[0].to_dict().keys() @@ -398,9 +388,18 @@ def print_spectra(self, file: str | PathLike) -> None: row_data = spectrum.to_dict() f.write("\t".join(str(row_data[h]) for h in headers) + "\n") - def print_results(self, lg: LinkGraph | None = None) -> None: - """Prints the results to the output directory in tab-separated format.""" - self.print_bgcs(self._output_dir / "genomics_data.tsv") - self.print_spectra(self._output_dir / "metabolomics_data.tsv") + def export_results(self, lg: LinkGraph | None = None) -> None: + """Exports the results to the output directory in tab-separated format. + + This method exports genomics and metabolomics data to their respective + TSV files in the specified output directory. If a LinkGraph object is + provided, it also exports the links data to a TSV file. 
+ + Args: + lg (LinkGraph | None): An optional LinkGraph object. If provided, + the links data will be exported to 'links.tsv'. + """ + self.export_genomics_data(self._output_dir / "genomics_data.tsv") + self.export_metabolomics_data(self._output_dir / "metabolomics_data.tsv") if lg is not None: - lg.print_links(self._output_dir / "links.tsv") + lg.export_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index fd1db438..86a9ca6e 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -360,8 +360,8 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: return table - def print_links(self, file: str | PathLike) -> None: - """Print the links in the LinkGraph to a file. + def export_links(self, file: str | PathLike) -> None: + """Exports the links in the LinkGraph to a file. Args: file: the file to write the links to. From 2207df1eb1a5cc7a5df8d801a103dfe31a162f68 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:07:22 +0200 Subject: [PATCH 04/42] improve docstring and typing --- src/nplinker/genomics/bgc.py | 26 +++++++++++++++++------- src/nplinker/metabolomics/spectrum.py | 29 ++++++++++++++++++++------- src/nplinker/scoring/link_graph.py | 2 +- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 57161d07..880a3710 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -193,23 +193,35 @@ def aa_predictions(self) -> list: self._aa_predictions[p[0]] = p[1] return [self._aa_predictions] - def to_dict(self) -> dict: + def to_dict(self) -> dict[str, any]: """Convert the BGC object to a dictionary that can be used to export the results. + This method gathers relevant information from the BGC object and formats it into a dictionary + where each key-value pair represents a specific attribute of the BGC. 
+ Returns: - A dictionary containing relavant information about the BGC object. + dict[str, str]: A dictionary containing relevant information about the BGC object, including: + - GCF_id: A comma-separated string of GCF IDs or "-" if none. + - GCF_bigscape_class: A comma-separated string of BiG-SCAPE classes or "-" if none. + - BGC_name: The name of the BGC. + - strain_id: The ID of the strain. + - description: A description of the BGC. + - antismash_id: The antiSMASH ID. + - antismash_region: The antiSMASH region. + - antismash_cluster_type: A comma-separated string of product predictions. + - mibig_bgc_class: The MiBIG BGC class or "-" if none. """ - gcf_ids = [gcf.id for gcf in self.parents if gcf.id is not None] - gcf_bsc = [gcf.bigscape_class for gcf in self.parents if gcf.bigscape_class is not None] + gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None} + gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None} return { - "GCF_id": ", ".join(gcf_ids) if gcf_ids else None, - "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else None, + "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-", + "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else "-", "BGC_name": self.id, "strain_id": self.strain.id, "description": self.description, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, "antismash_cluster_type": ", ".join(self.product_prediction), - "mibig_bgc_class": self.mibig_bgc_class, + "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "-", } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index a2891a2b..20f64c9f 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -98,18 +98,33 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains - def to_dict(self) -> dict: + def to_dict(self) -> dict[str, any]: """Convert the Spectrum object to a dictionary that can be used to export the 
results. + This method gathers relevant information from the Spectrum object and formats it into a dictionary + where each key-value pair represents a specific attribute of the Spectrum. + Returns: - A dictionary containing relavant information about the Spectrum object. + dict[str, str]: A dictionary containing relevant information about the Spectrum object, including: + - "spectrum_id": The unique identifier of the spectrum. + - "num_strains_with_spectrum": The number of strains associated with the spectrum. + - "precursor_mz": The precursor m/z value formatted to four decimal places. + - "rt": The retention time formatted to three decimal places. + - "molecular_family": The identifier of the molecular family, or "-" if not available. + - "gnps_id": The GNPS identifier, or "-" if not available. + - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available. """ + + def format_gnps_annotations(annotations: dict) -> str: + """Format GNPS annotations dictionary into a string.""" + return "; ".join(f"{k}: {v}" for k, v in annotations.items()) + return { "spectrum_id": self.id, "num_strains_with_spectrum": len(self.strains), - "precursor_mz": self.precursor_mz, - "rt": self.rt, - "molecular_family": self.family.id if self.family else None, - "gnps_id": self.gnps_id, - "gnps_annotations": self.gnps_annotations, + "precursor_mz": round(self.precursor_mz, 4), + "rt": round(self.rt, 3), + "molecular_family": self.family.id if self.family else "-", + "gnps_id": self.gnps_id if self.gnps_id else "-", + "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", } diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 86a9ca6e..278f3941 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -286,7 +286,7 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: if link_data is not None: lg.add_link(u, v, **link_data) - def get_table_data(self, 
display_limit: int | None = None) -> list[dict]: + def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]: """Generate the table data for the LinkGraph. This method iterates over the links in the LinkGraph and constructs a table From c6e166a04647876cefd276036f1f9f799e7ecbbb Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:15:04 +0200 Subject: [PATCH 05/42] fix a failing test --- tests/unit/scoring/test_link_graph.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index 4745c856..4c7e68b3 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -118,14 +118,11 @@ def test_get_table_data(lg, gcfs, spectra, score): table_data = lg.get_table_data() assert type(table_data) is list assert type(table_data[0]) is dict - assert table_data == [ - { - "Index": 1, - "Genomic Object Type": gcfs[0].__class__.__name__, - "Genomic Object ID": gcfs[0].id, - "Metabolomic Object Type": spectra[0].__class__.__name__, - "Metabolomic Object ID": spectra[0].id, - "Metcalf Score": f"{score.value:.2f}", - "Rosetta Score": "-", - }, - ] + assert len(table_data) == 1 + assert table_data[0]["index"] == 1 + assert table_data[0]["genomic_object_type"] == gcfs[0].__class__.__name__ + assert table_data[0]["genomic_object_id"] == gcfs[0].id + assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__ + assert table_data[0]["metabolomic_object_id"] == spectra[0].id + assert table_data[0]["metcalf_score"] == f"{score.value:.2f}" + assert table_data[0]["rosetta_score"] == "-" From 32ca3ddd534c23cceede4c6318b82d5bd42c1ba2 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:21:43 +0200 Subject: [PATCH 06/42] refactor a little bit the spectrum method to convert to dict --- src/nplinker/metabolomics/spectrum.py | 13 +++++++------ 1 file changed, 7 insertions(+), 
6 deletions(-) diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 20f64c9f..3dc6b3ed 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -98,6 +98,10 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains + def _formatted_gnps_annotations(self) -> str: + """Format GNPS annotations dictionary into a string.""" + return "; ".join(f"{k}: {v}" for k, v in self.gnps_annotations.items()) + def to_dict(self) -> dict[str, any]: """Convert the Spectrum object to a dictionary that can be used to export the results. @@ -114,11 +118,6 @@ def to_dict(self) -> dict[str, any]: - "gnps_id": The GNPS identifier, or "-" if not available. - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available. """ - - def format_gnps_annotations(annotations: dict) -> str: - """Format GNPS annotations dictionary into a string.""" - return "; ".join(f"{k}: {v}" for k, v in annotations.items()) - return { "spectrum_id": self.id, "num_strains_with_spectrum": len(self.strains), @@ -126,5 +125,7 @@ def format_gnps_annotations(annotations: dict) -> str: "rt": round(self.rt, 3), "molecular_family": self.family.id if self.family else "-", "gnps_id": self.gnps_id if self.gnps_id else "-", - "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", + "gnps_annotations": self._formatted_gnps_annotations() + if self.gnps_annotations + else "-", } From 8e7945d3318a41de213a113ac2fb7c259f1002f5 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:41:57 +0200 Subject: [PATCH 07/42] change the output format for gnps_annotations in metabolomics results file, improve docstrings --- src/nplinker/genomics/bgc.py | 26 +++++++++++------------ src/nplinker/metabolomics/spectrum.py | 30 +++++++++++---------------- src/nplinker/scoring/link_graph.py | 17 +++++++-------- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git 
a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 880a3710..2624cfae 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -194,22 +194,22 @@ def aa_predictions(self) -> list: return [self._aa_predictions] def to_dict(self) -> dict[str, any]: - """Convert the BGC object to a dictionary that can be used to export the results. + """Convert the BGC object to a dictionary for exporting results. - This method gathers relevant information from the BGC object and formats it into a dictionary - where each key-value pair represents a specific attribute of the BGC. + This method compiles relevant information from the BGC object and formats it into a dictionary. + Each key-value pair in the dictionary represents a specific attribute of the BGC. Returns: - dict[str, str]: A dictionary containing relevant information about the BGC object, including: - - GCF_id: A comma-separated string of GCF IDs or "-" if none. - - GCF_bigscape_class: A comma-separated string of BiG-SCAPE classes or "-" if none. - - BGC_name: The name of the BGC. - - strain_id: The ID of the strain. - - description: A description of the BGC. - - antismash_id: The antiSMASH ID. - - antismash_region: The antiSMASH region. - - antismash_cluster_type: A comma-separated string of product predictions. - - mibig_bgc_class: The MiBIG BGC class or "-" if none. + A dictionary containing the following key-value pairs: + - GCF_id (str): A comma-separated string of GCF IDs or "-" if none are available. + - GCF_bigscape_class (str): A comma-separated string of BiG-SCAPE classes or "-" if none are available. + - BGC_name (str): The name of the BGC. + - strain_id (str): The ID of the strain. + - description (str): A description of the BGC. + - antismash_id (str): The antiSMASH ID. + - antismash_region (str): The antiSMASH region. + - antismash_cluster_type (str): A comma-separated string of product predictions. 
+ - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. """ gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None} gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None} diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 3dc6b3ed..5c929a13 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -98,25 +98,21 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains - def _formatted_gnps_annotations(self) -> str: - """Format GNPS annotations dictionary into a string.""" - return "; ".join(f"{k}: {v}" for k, v in self.gnps_annotations.items()) - def to_dict(self) -> dict[str, any]: - """Convert the Spectrum object to a dictionary that can be used to export the results. + """Convert the Spectrum object to a dictionary for exporting results. - This method gathers relevant information from the Spectrum object and formats it into a dictionary - where each key-value pair represents a specific attribute of the Spectrum. + This method compiles relevant information from the Spectrum object into a dictionary format. + Each key-value pair in the dictionary represents a specific attribute of the Spectrum Object. Returns: - dict[str, str]: A dictionary containing relevant information about the Spectrum object, including: - - "spectrum_id": The unique identifier of the spectrum. - - "num_strains_with_spectrum": The number of strains associated with the spectrum. - - "precursor_mz": The precursor m/z value formatted to four decimal places. - - "rt": The retention time formatted to three decimal places. - - "molecular_family": The identifier of the molecular family, or "-" if not available. - - "gnps_id": The GNPS identifier, or "-" if not available. - - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available. 
+ A dictionary containing containing the following key-value pairs: + - "spectrum_id" (str): The unique identifier of the spectrum. + - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum. + - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places. + - "rt" (float): The retention time, rounded to three decimal places. + - "molecular_family" (str): The identifier of the molecular family, or "-" if not available. + - "gnps_id" (str): The GNPS identifier, or "-" if not available. + - "gnps_annotations" (dict | str): A dictionary of GNPS annotations, or "-" if not available. """ return { "spectrum_id": self.id, @@ -125,7 +121,5 @@ def to_dict(self) -> dict[str, any]: "rt": round(self.rt, 3), "molecular_family": self.family.id if self.family else "-", "gnps_id": self.gnps_id if self.gnps_id else "-", - "gnps_annotations": self._formatted_gnps_annotations() - if self.gnps_annotations - else "-", + "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", } diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 278f3941..4f7753b9 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -299,15 +299,14 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any table. If None, all rows are included. Returns: - list: A list of dictionaries, where each dictionary contains - the following keys: - - Index (int) - - Genomic Object Type (str) - - Genomic Object ID (str or int) - - Metabolomic Object Type (str) - - Metabolomic Object ID (str or int) - - Metcalf Score (str, formatted to 2 decimal places, or "-") - - Rosetta Score (str, formatted to 2 decimal places, or "-") + A list of dictionaries, where each dictionary contains + - index (int): The index of the link. + - genomic_object_type (str): The type of the genomic object. + - genomic_object_id (str or int): The ID of the genomic object. 
+ - metabolomic_object_type (str): The type of the metabolomic object. + - metabolomic_object_id (str or int): The ID of the metabolomic object. + - metcalf_score (str): The Metcalf score, formatted to 2 decimal places, or "-". + - rosetta_score (str): The Rosetta score, formatted to 2 decimal places, or "-". """ genomic_object_classes = (GCF,) From 25928100a34bc0e0d49b706895e43fab50b0cee7 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 17 Oct 2024 14:47:36 +0200 Subject: [PATCH 08/42] fix: convert int to str before using join --- src/nplinker/genomics/bgc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 2624cfae..d9787a38 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -211,8 +211,8 @@ def to_dict(self) -> dict[str, any]: - antismash_cluster_type (str): A comma-separated string of product predictions. - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. 
""" - gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None} - gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None} + gcf_ids = {str(gcf.id) for gcf in self.parents if gcf.id is not None} + gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None} return { "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-", From 7f53de8456cd999c456bdd28fad07b2aca541c8a Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 17 Oct 2024 15:40:07 +0200 Subject: [PATCH 09/42] change representation of empty values in output files for improved integration to excel --- src/nplinker/genomics/bgc.py | 6 +++--- src/nplinker/metabolomics/spectrum.py | 6 +++--- src/nplinker/scoring/link_graph.py | 24 ++++++++++++++++++------ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index d9787a38..902ba5f2 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -215,13 +215,13 @@ def to_dict(self) -> dict[str, any]: gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None} return { - "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-", - "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else "-", + "GCF_id": ", ".join(gcf_ids), + "GCF_bigscape_class": ", ".join(gcf_bsc), "BGC_name": self.id, "strain_id": self.strain.id, "description": self.description, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, "antismash_cluster_type": ", ".join(self.product_prediction), - "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "-", + "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "", } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 5c929a13..2b89dddc 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -119,7 +119,7 @@ def to_dict(self) -> dict[str, any]: "num_strains_with_spectrum": len(self.strains), 
"precursor_mz": round(self.precursor_mz, 4), "rt": round(self.rt, 3), - "molecular_family": self.family.id if self.family else "-", - "gnps_id": self.gnps_id if self.gnps_id else "-", - "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", + "molecular_family": self.family.id if self.family else "", + "gnps_id": self.gnps_id if self.gnps_id else "", + "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "", } diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 4f7753b9..bd715723 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -104,6 +104,18 @@ def __init__(self) -> None: Get the link data between two objects: >>> lg.get_link_data(gcf, spectrum) {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} + + Filter the links for `gcf1` and `gcf2`: + >>> new_lg = lg.filter([gcf1, gcf2]) + + Filter the links for `spectrum1` and `spectrum2`: + >>> new_lg = lg.filter([spectrum1, spectrum2]) + + Filter the links between two lists of objects: + >>> new_lg = lg.filter([gcf1, gcf2], [spectrum1, spectrum2]) + + Export the links to a file: + >>> lg.export_links("links.tsv") """ self._g: Graph = Graph() @@ -305,8 +317,8 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any - genomic_object_id (str or int): The ID of the genomic object. - metabolomic_object_type (str): The type of the metabolomic object. - metabolomic_object_id (str or int): The ID of the metabolomic object. - - metcalf_score (str): The Metcalf score, formatted to 2 decimal places, or "-". - - rosetta_score (str): The Rosetta score, formatted to 2 decimal places, or "-". + - metcalf_score (float): The Metcalf score, rounded to 2 decimal places. + - rosetta_score (float): The Rosetta score, rounded to 2 decimal places. 
""" genomic_object_classes = (GCF,) @@ -321,12 +333,12 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any table_data.append( { "index": index, - "genomic_object_type": genomic_object.__class__.__name__, "genomic_object_id": genomic_object.id, - "metabolomic_object_type": metabolomic_object.__class__.__name__, + "genomic_object_type": genomic_object.__class__.__name__, "metabolomic_object_id": metabolomic_object.id, - "metcalf_score": f"{metcalf_score.value:.2f}" if metcalf_score else "-", - "rosetta_score": f"{rosetta_score.value:.2f}" if rosetta_score else "-", + "metabolomic_object_type": metabolomic_object.__class__.__name__, + "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "", + "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } ) From ad049c843384c68dfa24dbee5ab99d00f6726c27 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 17 Oct 2024 17:00:59 +0200 Subject: [PATCH 10/42] refactoring the export methods --- src/nplinker/genomics/bgc.py | 9 ++-- src/nplinker/metabolomics/spectrum.py | 6 +-- src/nplinker/nplinker.py | 55 +++++++++++------------- src/nplinker/scoring/link_graph.py | 62 +++++++++++++++------------ tests/unit/scoring/test_link_graph.py | 4 +- 5 files changed, 68 insertions(+), 68 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 902ba5f2..486b9861 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -211,17 +211,14 @@ def to_dict(self) -> dict[str, any]: - antismash_cluster_type (str): A comma-separated string of product predictions. - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. 
""" - gcf_ids = {str(gcf.id) for gcf in self.parents if gcf.id is not None} - gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None} - return { - "GCF_id": ", ".join(gcf_ids), - "GCF_bigscape_class": ", ".join(gcf_bsc), + "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, + "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, "BGC_name": self.id, "strain_id": self.strain.id, "description": self.description, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, - "antismash_cluster_type": ", ".join(self.product_prediction), + "antismash_cluster_type": self.product_prediction, "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "", } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 2b89dddc..aa008579 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -119,7 +119,7 @@ def to_dict(self) -> dict[str, any]: "num_strains_with_spectrum": len(self.strains), "precursor_mz": round(self.precursor_mz, 4), "rt": round(self.rt, 3), - "molecular_family": self.family.id if self.family else "", - "gnps_id": self.gnps_id if self.gnps_id else "", - "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "", + "molecular_family": self.family.id if self.family else None, + "gnps_id": self.gnps_id, + "gnps_annotations": self.gnps_annotations, } diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 9e87ed92..52599957 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -356,37 +356,34 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def export_genomics_data(self, file: str | PathLike) -> None: - """Exports the genomics data to a specified file in tab-separated format. - - Each row in the file corresponds to a BGC object. 
+ def export_objects(self, objects: BGC | Spectrum, filename: str) -> None: + """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. Args: - file: The path to the file where the genomics data will be printed. + objects (BGC | Spectrum): A list of BGC or Spectrum objects to be exported. + filename (str): The name of the file where the data will be saved. """ - headers = self.bgcs[0].to_dict().keys() - - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for bgc in self.bgcs: - row_data = bgc.to_dict() - f.write("\t".join(str(row_data[h]) for h in headers) + "\n") - - def export_metabolomics_data(self, file: str | PathLike) -> None: - """Exports the metabolomics data to a specified file in tab-separated format. - - Each row in the file corresponds to a Spectrum object. - - Args: - file: The path to the file where the metabolomics data will be printed. - """ - headers = self.spectra[0].to_dict().keys() - - with open(file, "w") as f: + headers = objects[0].to_dict().keys() + with open(self._output_dir / filename, "w") as f: f.write("\t".join(headers) + "\n") - for spectrum in self.spectra: - row_data = spectrum.to_dict() - f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + for obj in objects: + row_data = obj.to_dict() + formatted_row = [] + for header in headers: + item = row_data.get(header, "") + # Convert list, tuple, set to comma-separated string + if isinstance(item, (list, tuple, set)): + formatted_row.append(", ".join(map(str, item))) + # Convert dict to comma-separated string + elif isinstance(item, dict): + formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) + # Convert non-empty value to string + elif item: + formatted_row.append(str(item)) + # Convert empty value to empty string + else: + formatted_row.append("") + f.write("\t".join(formatted_row) + "\n") def export_results(self, lg: LinkGraph | None = None) -> None: """Exports the results to the output directory 
in tab-separated format. @@ -399,7 +396,7 @@ def export_results(self, lg: LinkGraph | None = None) -> None: lg (LinkGraph | None): An optional LinkGraph object. If provided, the links data will be exported to 'links.tsv'. """ - self.export_genomics_data(self._output_dir / "genomics_data.tsv") - self.export_metabolomics_data(self._output_dir / "metabolomics_data.tsv") + self.export_objects(self.bgcs, "genomics_data.tsv") + self.export_objects(self.spectra, "metabolomics_data.tsv") if lg is not None: lg.export_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index bd715723..0d6f4074 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -311,41 +311,47 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any table. If None, all rows are included. Returns: - A list of dictionaries, where each dictionary contains + A list of dictionaries containing the table data. + """ + table_data = [] + for index, link in enumerate(self.links, start=1): + table_data.append(self.link_to_dict(link, index)) + if display_limit is not None and index == display_limit: + break + return table_data + + def link_to_dict(self, link: LINK, index: int) -> dict[str, any]: + """Convert a link to a dictionary representation. + + Args: + link: A tuple containing the link information (u, v, data). + index: The index of the link. + + Returns: + A dictionary containing the link information with the following keys: - index (int): The index of the link. - - genomic_object_type (str): The type of the genomic object. - genomic_object_id (str or int): The ID of the genomic object. - - metabolomic_object_type (str): The type of the metabolomic object. + - genomic_object_type (str): The type of the genomic object. - metabolomic_object_id (str or int): The ID of the metabolomic object. + - metabolomic_object_type (str): The type of the metabolomic object. 
- metcalf_score (float): The Metcalf score, rounded to 2 decimal places. - rosetta_score (float): The Rosetta score, rounded to 2 decimal places. """ + u, v, data = link genomic_object_classes = (GCF,) - - table_data = [] - - for index, (u, v, data) in enumerate(self.links, start=1): - genomic_object = u if isinstance(u, genomic_object_classes) else v - metabolomic_object = v if isinstance(u, genomic_object_classes) else u - metcalf_score = data.get("metcalf") - rosetta_score = data.get("rosetta") - - table_data.append( - { - "index": index, - "genomic_object_id": genomic_object.id, - "genomic_object_type": genomic_object.__class__.__name__, - "metabolomic_object_id": metabolomic_object.id, - "metabolomic_object_type": metabolomic_object.__class__.__name__, - "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "", - "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", - } - ) - - if display_limit is not None and index == display_limit: - break - - return table_data + genomic_object = u if isinstance(u, genomic_object_classes) else v + metabolomic_object = v if isinstance(u, genomic_object_classes) else u + metcalf_score = data.get("metcalf") + rosetta_score = data.get("rosetta") + return { + "index": index, + "genomic_object_id": genomic_object.id, + "genomic_object_type": genomic_object.__class__.__name__, + "metabolomic_object_id": metabolomic_object.id, + "metabolomic_object_type": metabolomic_object.__class__.__name__, + "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "", + "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", + } def _get_table_repr(self, display_limit: int | None = 60) -> str: """Generate a table representation of the LinkGraph. 
diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index 4c7e68b3..5a4e7197 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -124,5 +124,5 @@ def test_get_table_data(lg, gcfs, spectra, score): assert table_data[0]["genomic_object_id"] == gcfs[0].id assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__ assert table_data[0]["metabolomic_object_id"] == spectra[0].id - assert table_data[0]["metcalf_score"] == f"{score.value:.2f}" - assert table_data[0]["rosetta_score"] == "-" + assert table_data[0]["metcalf_score"] == round(score.value, 2) + assert table_data[0]["rosetta_score"] == "" From b220fb024af7be8479fe9facc362c3de83c9520f Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 18 Oct 2024 17:07:21 +0200 Subject: [PATCH 11/42] small refactor: specify staticmethod --- src/nplinker/scoring/link_graph.py | 47 +++++++++++++++--------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 0d6f4074..091474e5 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -298,29 +298,8 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: if link_data is not None: lg.add_link(u, v, **link_data) - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]: - """Generate the table data for the LinkGraph. - - This method iterates over the links in the LinkGraph and constructs a table - containing information about genomic and metabolomic objects, as well as their - associated scores. Each row in the table represents a link between a genomic - object and a metabolomic object. - - Args: - display_limit (int | None): The maximum number of rows to include in the - table. If None, all rows are included. - - Returns: - A list of dictionaries containing the table data. 
- """ - table_data = [] - for index, link in enumerate(self.links, start=1): - table_data.append(self.link_to_dict(link, index)) - if display_limit is not None and index == display_limit: - break - return table_data - - def link_to_dict(self, link: LINK, index: int) -> dict[str, any]: + @staticmethod + def link_to_dict(link: LINK, index: int) -> dict[str, any]: """Convert a link to a dictionary representation. Args: @@ -353,6 +332,28 @@ def link_to_dict(self, link: LINK, index: int) -> dict[str, any]: "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } + def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]: + """Generate the table data for the LinkGraph. + + This method iterates over the links in the LinkGraph and constructs a table + containing information about genomic and metabolomic objects, as well as their + associated scores. Each row in the table represents a link between a genomic + object and a metabolomic object. + + Args: + display_limit (int | None): The maximum number of rows to include in the + table. If None, all rows are included. + + Returns: + A list of dictionaries containing the table data. + """ + table_data = [] + for index, link in enumerate(self.links, start=1): + table_data.append(self.link_to_dict(link, index)) + if display_limit is not None and index == display_limit: + break + return table_data + def _get_table_repr(self, display_limit: int | None = 60) -> str: """Generate a table representation of the LinkGraph. 
From f98fa98097c6785925382a95a4bb43922bec0e71 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 18 Oct 2024 17:10:01 +0200 Subject: [PATCH 12/42] add more tests --- src/nplinker/genomics/bgc.py | 6 ++--- tests/unit/genomics/test_bgc.py | 28 +++++++++++++++++++++ tests/unit/metabolomics/test_spectrum.py | 32 ++++++++++++++++++++++++ tests/unit/scoring/test_link_graph.py | 28 +++++++++++++++------ 4 files changed, 84 insertions(+), 10 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 486b9861..7d606fe0 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -215,10 +215,10 @@ def to_dict(self) -> dict[str, any]: "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, "BGC_name": self.id, - "strain_id": self.strain.id, + "product_prediction": self.product_prediction, + "mibig_bgc_class": self.mibig_bgc_class, "description": self.description, + "strain_id": self.strain.id, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, - "antismash_cluster_type": self.product_prediction, - "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "", } diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py index 1cf3f401..9706e961 100644 --- a/tests/unit/genomics/test_bgc.py +++ b/tests/unit/genomics/test_bgc.py @@ -24,3 +24,31 @@ def test_add_and_detach_parent(): assert bgc.parents == {gcf} bgc.detach_parent(gcf) assert bgc.parents == set() + + +def test_to_dict(): + bgc = BGC("BGC0000001", "Polyketide", "NRP") + bgc.strain = Strain("sample_strain") + bgc.description = "Sample description" + + dict_repr = bgc.to_dict() + assert dict_repr["GCF_id"] == set() + assert dict_repr["GCF_bigscape_class"] == set() + assert dict_repr["BGC_name"] == "BGC0000001" + assert dict_repr["product_prediction"] == ("Polyketide", "NRP") + assert dict_repr["mibig_bgc_class"] is None 
+ assert dict_repr["description"] == "Sample description" + assert dict_repr["strain_id"] == "sample_strain" + assert dict_repr["antismash_id"] is None + assert dict_repr["antismash_region"] is None + + bgc.add_parent(GCF("1")) + bgc.mibig_bgc_class = ("NRP",) + bgc.antismash_id = "ABC_0001" + bgc.antismash_region = 1 + dict_repr = bgc.to_dict() + assert dict_repr["GCF_id"] == {"1"} + assert dict_repr["GCF_bigscape_class"] == set() + assert dict_repr["mibig_bgc_class"] == ("NRP",) + assert dict_repr["antismash_id"] == "ABC_0001" + assert dict_repr["antismash_region"] == 1 diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index e984eaba..d77ea0d4 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -68,3 +68,35 @@ def test_has_strain(): spec.strains.add(strain1) assert spec.has_strain(strain1) assert not spec.has_strain(strain2) + + +def test_to_dict(): + """Test the to_dict method.""" + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"}) + spec.strains.add(Strain("strain1")) + spec.strains.add(Strain("strain2")) + + dict_repr = spec.to_dict() + assert dict_repr["spectrum_id"] == "spec1" + assert dict_repr["num_strains_with_spectrum"] == 2 + assert dict_repr["precursor_mz"] == 150.0 + assert dict_repr["rt"] == 0.0 + assert dict_repr["molecular_family"] is None + assert dict_repr["gnps_id"] is None + assert dict_repr["gnps_annotations"] == dict() + + # Test with gnps information + spec.gnps_id = "GNPS0001" + spec.gnps_annotations = {"annotation1": "value1"} + + # Test with molecular family + class MockMolecularFamily: + def __init__(self, id): + self.id = id + + spec.family = MockMolecularFamily("family1") + + dict_repr = spec.to_dict() + assert dict_repr["molecular_family"] == "family1" + assert dict_repr["gnps_id"] == "GNPS0001" + assert dict_repr["gnps_annotations"] == {"annotation1": "value1"} diff --git a/tests/unit/scoring/test_link_graph.py 
b/tests/unit/scoring/test_link_graph.py index 5a4e7197..f1542338 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -114,15 +114,29 @@ def test_filter(gcfs, spectra, score): assert len(lg_filtered) == 4 +def test_link_to_dict(lg, gcfs, spectra, score): + link = lg.links[0] + index = 1 + dict_repr = lg.link_to_dict(link, index) + assert type(dict_repr) is dict + assert dict_repr["index"] == 1 + assert dict_repr["genomic_object_type"] == gcfs[0].__class__.__name__ + assert dict_repr["genomic_object_id"] == gcfs[0].id + assert dict_repr["metabolomic_object_type"] == spectra[0].__class__.__name__ + assert dict_repr["metabolomic_object_id"] == spectra[0].id + assert dict_repr["metcalf_score"] == round(score.value, 2) + assert dict_repr["rosetta_score"] == "" + + def test_get_table_data(lg, gcfs, spectra, score): + # add a second link + lg.add_link(gcfs[1], spectra[1], metcalf=score) + table_data = lg.get_table_data() assert type(table_data) is list assert type(table_data[0]) is dict + assert len(table_data) == 2 + + display_limit = 1 + table_data = lg.get_table_data(display_limit) assert len(table_data) == 1 - assert table_data[0]["index"] == 1 - assert table_data[0]["genomic_object_type"] == gcfs[0].__class__.__name__ - assert table_data[0]["genomic_object_id"] == gcfs[0].id - assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__ - assert table_data[0]["metabolomic_object_id"] == spectra[0].id - assert table_data[0]["metcalf_score"] == round(score.value, 2) - assert table_data[0]["rosetta_score"] == "" From a8a83290b2a0980ab85cd1655af4f585aa7a2140 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 18 Oct 2024 17:28:46 +0200 Subject: [PATCH 13/42] correct typing in docstrings --- src/nplinker/genomics/bgc.py | 20 ++++++++++---------- src/nplinker/metabolomics/spectrum.py | 6 +++--- src/nplinker/scoring/link_graph.py | 8 ++++---- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git
a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 7d606fe0..2ea0c3b9 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -201,24 +201,24 @@ def to_dict(self) -> dict[str, any]: Returns: A dictionary containing the following key-value pairs: - - GCF_id (str): A comma-separated string of GCF IDs or "-" if none are available. - - GCF_bigscape_class (str): A comma-separated string of BiG-SCAPE classes or "-" if none are available. - - BGC_name (str): The name of the BGC. + - GCF_id (set): A set of GCF IDs. + - GCF_bigscape_class (set): A set of BiG-SCAPE classes. - strain_id (str): The ID of the strain. - - description (str): A description of the BGC. - - antismash_id (str): The antiSMASH ID. - - antismash_region (str): The antiSMASH region. - - antismash_cluster_type (str): A comma-separated string of product predictions. - - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. + - description (str | None): A description of the BGC. + - BGC_name (str): The name of the BGC. + - product_prediction (tuple): (predicted) natural products or product classes of the BGC. + - mibig_bgc_class (tuple[str] | None): MIBiG biosynthetic classes to which the BGC belongs. + - antismash_id (str | None): The antiSMASH ID. + - antismash_region (int | None): The antiSMASH region. 
""" return { "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, + "strain_id": self.strain.id, + "description": self.description, "BGC_name": self.id, "product_prediction": self.product_prediction, "mibig_bgc_class": self.mibig_bgc_class, - "description": self.description, - "strain_id": self.strain.id, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index aa008579..4842f9b0 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -110,9 +110,9 @@ def to_dict(self) -> dict[str, any]: - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum. - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places. - "rt" (float): The retention time, rounded to three decimal places. - - "molecular_family" (str): The identifier of the molecular family, or "-" if not available. - - "gnps_id" (str): The GNPS identifier, or "-" if not available. - - "gnps_annotations" (dict | str): A dictionary of GNPS annotations, or "-" if not available. + - "molecular_family" (str | None ): The identifier of the molecular family. + - "gnps_id" (str | None ): The GNPS identifier. + - "gnps_annotations" (dict): A dictionary of GNPS annotations. """ return { "spectrum_id": self.id, diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 091474e5..4fc5c23b 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -309,12 +309,12 @@ def link_to_dict(link: LINK, index: int) -> dict[str, any]: Returns: A dictionary containing the link information with the following keys: - index (int): The index of the link. - - genomic_object_id (str or int): The ID of the genomic object. 
+ - genomic_object_id (str): The ID of the genomic object. - genomic_object_type (str): The type of the genomic object. - - metabolomic_object_id (str or int): The ID of the metabolomic object. + - metabolomic_object_id (str): The ID of the metabolomic object. - metabolomic_object_type (str): The type of the metabolomic object. - - metcalf_score (float): The Metcalf score, rounded to 2 decimal places. - - rosetta_score (float): The Rosetta score, rounded to 2 decimal places. + - metcalf_score (float | str): The Metcalf score, rounded to 2 decimal places. + - rosetta_score (float | str): The Rosetta score, rounded to 2 decimal places. """ u, v, data = link genomic_object_classes = (GCF,) From c6c33e6d1dbbcdf60bfaeaee3e5b83305ea36382 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 22 Oct 2024 18:25:26 +0200 Subject: [PATCH 14/42] typing: changed typings to pass mypy static typing checks --- .github/workflows/format-typing-check.yml | 2 +- pyproject.toml | 1 + src/nplinker/genomics/bgc.py | 7 ++++--- src/nplinker/metabolomics/spectrum.py | 3 ++- src/nplinker/nplinker.py | 4 ++-- src/nplinker/scoring/link_graph.py | 7 ++++--- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml index a5def2b9..10ea0990 100644 --- a/.github/workflows/format-typing-check.yml +++ b/.github/workflows/format-typing-check.yml @@ -37,7 +37,7 @@ jobs: - name: Install ruff and mypy run: | pip install ruff mypy typing_extensions \ - types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx pandas-stubs + types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx types-tabulate pandas-stubs - name: Get all changed python files id: changed-python-files uses: tj-actions/changed-files@v44 diff --git a/pyproject.toml b/pyproject.toml index 4ab04c75..c627f6ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ dev = [ "types-beautifulsoup4", "types-jsonschema", 
"types-networkx", + "types-tabulate", "pandas-stubs", # docs "black", diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 2ea0c3b9..e59d29ae 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging from typing import TYPE_CHECKING +from typing import Any from deprecated import deprecated from nplinker.strain import Strain from .aa_pred import predict_aa @@ -193,7 +194,7 @@ def aa_predictions(self) -> list: self._aa_predictions[p[0]] = p[1] return [self._aa_predictions] - def to_dict(self) -> dict[str, any]: + def to_dict(self) -> dict[str, Any]: """Convert the BGC object to a dictionary for exporting results. This method compiles relevant information from the BGC object and formats it into a dictionary. @@ -203,7 +204,7 @@ def to_dict(self) -> dict[str, any]: A dictionary containing the following key-value pairs: - GCF_id (set): A set of GCF IDs. - GCF_bigscape_class (set): A set of BiG-SCAPE classes. - - strain_id (str): The ID of the strain. + - strain_id (str | None): The ID of the strain. - description (str | None): A description of the BGC. - BGC_name (str): The name of the BGC. - product_prediction (tuple): (predicted) natural products or product classes of the BGC. 
@@ -214,7 +215,7 @@ def to_dict(self) -> dict[str, any]: return { "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, - "strain_id": self.strain.id, + "strain_id": self.strain.id if self.strain is not None else None, "description": self.description, "BGC_name": self.id, "product_prediction": self.product_prediction, diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 4842f9b0..e0e10e6d 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import cached_property from typing import TYPE_CHECKING +from typing import Any import numpy as np from nplinker.strain import Strain from nplinker.strain import StrainCollection @@ -98,7 +99,7 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains - def to_dict(self) -> dict[str, any]: + def to_dict(self) -> dict[str, Any]: """Convert the Spectrum object to a dictionary for exporting results. This method compiles relevant information from the Spectrum object into a dictionary format. diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 52599957..99e139bf 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -356,11 +356,11 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def export_objects(self, objects: BGC | Spectrum, filename: str) -> None: + def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. Args: - objects (BGC | Spectrum): A list of BGC or Spectrum objects to be exported. + objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. filename (str): The name of the file where the data will be saved. 
""" headers = objects[0].to_dict().keys() diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 4fc5c23b..8da29912 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -2,6 +2,7 @@ from collections.abc import Sequence from functools import wraps from os import PathLike +from typing import Any from typing import Union from networkx import Graph from tabulate import tabulate @@ -299,7 +300,7 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: lg.add_link(u, v, **link_data) @staticmethod - def link_to_dict(link: LINK, index: int) -> dict[str, any]: + def link_to_dict(link: LINK, index: int) -> dict[str, Any]: """Convert a link to a dictionary representation. Args: @@ -332,7 +333,7 @@ def link_to_dict(link: LINK, index: int) -> dict[str, any]: "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]: + def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: """Generate the table data for the LinkGraph. 
This method iterates over the links in the LinkGraph and constructs a table @@ -372,7 +373,7 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: stralign="right", ) - if len(self.links) > display_limit: + if display_limit is not None and len(self.links) > display_limit: truncated_info = f"...\n[ {len(self.links)} links ]" table += f"\n{truncated_info}" From a2603381a6161574751cce26f3cebade34ce530b Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 22 Oct 2024 18:46:48 +0200 Subject: [PATCH 15/42] refactor: change the order of methods/functions --- src/nplinker/genomics/bgc.py | 64 +++++------ src/nplinker/nplinker.py | 138 ++++++++++++------------ src/nplinker/scoring/link_graph.py | 168 ++++++++++++++--------------- 3 files changed, 185 insertions(+), 185 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index e59d29ae..6dfd6c66 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -117,18 +117,6 @@ def __reduce__(self) -> tuple: """Reduce function for pickling.""" return (self.__class__, (self.id, *self.product_prediction), self.__dict__) - def add_parent(self, gcf: GCF) -> None: - """Add a parent GCF to the BGC. - - Args: - gcf: gene cluster family - """ - gcf.add_bgc(self) - - def detach_parent(self, gcf: GCF) -> None: - """Remove a parent GCF.""" - gcf.detach_bgc(self) - @property def strain(self) -> Strain | None: """Get the strain of the BGC.""" @@ -162,6 +150,18 @@ def bigscape_classes(self) -> set[str | None]: """ return {p.bigscape_class for p in self.parents} + def add_parent(self, gcf: GCF) -> None: + """Add a parent GCF to the BGC. + + Args: + gcf: gene cluster family + """ + gcf.add_bgc(self) + + def detach_parent(self, gcf: GCF) -> None: + """Remove a parent GCF.""" + gcf.detach_bgc(self) + def is_mibig(self) -> bool: """Check if the BGC is a MIBiG reference BGC or not. 
@@ -174,26 +174,6 @@ def is_mibig(self) -> bool: """ return self.id.startswith("BGC") - # CG: why not providing whole product but only amino acid as product monomer? - # this property is not used in NPLinker core business. - @property - @deprecated(version="2.0.0", reason="This method will be removed soon") - def aa_predictions(self) -> list: - """Amino acids as predicted monomers of product. - - Returns: - list of dicts with key as amino acid and value as prediction - probability. - """ - # Load aa predictions and cache them - self._aa_predictions = None - if self._aa_predictions is None: - self._aa_predictions = {} - if self.antismash_file is not None: - for p in predict_aa(self.antismash_file): - self._aa_predictions[p[0]] = p[1] - return [self._aa_predictions] - def to_dict(self) -> dict[str, Any]: """Convert the BGC object to a dictionary for exporting results. @@ -223,3 +203,23 @@ def to_dict(self) -> dict[str, Any]: "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, } + + # CG: why not providing whole product but only amino acid as product monomer? + # this property is not used in NPLinker core business. + @property + @deprecated(version="2.0.0", reason="This method will be removed soon") + def aa_predictions(self) -> list: + """Amino acids as predicted monomers of product. + + Returns: + list of dicts with key as amino acid and value as prediction + probability. 
+ """ + # Load aa predictions and cache them + self._aa_predictions = None + if self._aa_predictions is None: + self._aa_predictions = {} + if self.antismash_file is not None: + for p in predict_aa(self.antismash_file): + self._aa_predictions[p[0]] = p[1] + return [self._aa_predictions] diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 99e139bf..79dffcbe 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -168,34 +168,50 @@ def scoring_methods(self) -> list[str]: """Get names of all valid scoring methods.""" return list(self._valid_scoring_methods.keys()) - def load_data(self): - """Load all data from files into memory. - - This method is a convenience function that calls the - [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files - (download, generate and/or validate data) in the [correct directory structure][working-directory-structure], - and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data - from the files into memory. + def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: + """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. - The loaded data is stored in various data containers for easy access, e.g. - [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects, - [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc. + Args: + objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. + filename (str): The name of the file where the data will be saved. 
""" - arranger = DatasetArranger(self.config) - arranger.arrange() - loader = DatasetLoader(self.config) - loader.load() + headers = objects[0].to_dict().keys() + with open(self._output_dir / filename, "w") as f: + f.write("\t".join(headers) + "\n") + for obj in objects: + row_data = obj.to_dict() + formatted_row = [] + for header in headers: + item = row_data.get(header, "") + # Convert list, tuple, set to comma-separated string + if isinstance(item, (list, tuple, set)): + formatted_row.append(", ".join(map(str, item))) + # Convert dict to comma-separated string + elif isinstance(item, dict): + formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) + # Convert non-empty value to string + elif item: + formatted_row.append(str(item)) + # Convert empty value to empty string + else: + formatted_row.append("") + f.write("\t".join(formatted_row) + "\n") - self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs} - self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs} - self._spec_dict = {spec.id: spec for spec in loader.spectra} - self._mf_dict = {mf.id: mf for mf in loader.mfs} + def export_results(self, lg: LinkGraph | None = None) -> None: + """Exports the results to the output directory in tab-separated format. - self._mibig_bgcs = loader.mibig_bgcs - self._strains = loader.strains - self._product_types = loader.product_types - self._chem_classes = loader.chem_classes - self._class_matches = loader.class_matches + This method exports genomics and metabolomics data to their respective + TSV files in the specified output directory. If a LinkGraph object is + provided, it also exports the links data to a TSV file. + + Args: + lg (LinkGraph | None): An optional LinkGraph object. If provided, + the links data will be exported to 'links.tsv'. 
+ """ + self.export_objects(self.bgcs, "genomics_data.tsv") + self.export_objects(self.spectra, "metabolomics_data.tsv") + if lg is not None: + lg.export_links(self._output_dir / "links.tsv") @overload def get_links( @@ -281,6 +297,35 @@ def get_links( return scoring.get_links(*objects, **scoring_params) + def load_data(self): + """Load all data from files into memory. + + This method is a convenience function that calls the + [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files + (download, generate and/or validate data) in the [correct directory structure][working-directory-structure], + and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data + from the files into memory. + + The loaded data is stored in various data containers for easy access, e.g. + [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects, + [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc. + """ + arranger = DatasetArranger(self.config) + arranger.arrange() + loader = DatasetLoader(self.config) + loader.load() + + self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs} + self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs} + self._spec_dict = {spec.id: spec for spec in loader.spectra} + self._mf_dict = {mf.id: mf for mf in loader.mfs} + + self._mibig_bgcs = loader.mibig_bgcs + self._strains = loader.strains + self._product_types = loader.product_types + self._chem_classes = loader.chem_classes + self._class_matches = loader.class_matches + def lookup_bgc(self, id: str) -> BGC | None: """Get the BGC object with the given ID. @@ -355,48 +400,3 @@ def save_data( data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links) with open(file, "wb") as f: pickle.dump(data, f) - - def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: - """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. 
- - Args: - objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. - filename (str): The name of the file where the data will be saved. - """ - headers = objects[0].to_dict().keys() - with open(self._output_dir / filename, "w") as f: - f.write("\t".join(headers) + "\n") - for obj in objects: - row_data = obj.to_dict() - formatted_row = [] - for header in headers: - item = row_data.get(header, "") - # Convert list, tuple, set to comma-separated string - if isinstance(item, (list, tuple, set)): - formatted_row.append(", ".join(map(str, item))) - # Convert dict to comma-separated string - elif isinstance(item, dict): - formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) - # Convert non-empty value to string - elif item: - formatted_row.append(str(item)) - # Convert empty value to empty string - else: - formatted_row.append("") - f.write("\t".join(formatted_row) + "\n") - - def export_results(self, lg: LinkGraph | None = None) -> None: - """Exports the results to the output directory in tab-separated format. - - This method exports genomics and metabolomics data to their respective - TSV files in the specified output directory. If a LinkGraph object is - provided, it also exports the links data to a TSV file. - - Args: - lg (LinkGraph | None): An optional LinkGraph object. If provided, - the links data will be exported to 'links.tsv'. - """ - self.export_objects(self.bgcs, "genomics_data.tsv") - self.export_objects(self.spectra, "metabolomics_data.tsv") - if lg is not None: - lg.export_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 8da29912..e01dbc59 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -198,44 +198,21 @@ def add_link( self._g.add_edge(u, v, **data) - @validate_uv - def has_link(self, u: Entity, v: Entity) -> bool: - """Check if there is a link between two objects. 
- - Args: - u: the first object, either a GCF, Spectrum, or MolecularFamily - v: the second object, either a GCF, Spectrum, or MolecularFamily - - Returns: - True if there is a link between the two objects, False otherwise - - Examples: - >>> lg.has_link(gcf, spectrum) - True - """ - return self._g.has_edge(u, v) - - @validate_uv - def get_link_data( - self, - u: Entity, - v: Entity, - ) -> LINK_DATA | None: - """Get the data for a link between two objects. + def export_links(self, file: str | PathLike) -> None: + """Exports the links in the LinkGraph to a file. Args: - u: the first object, either a GCF, Spectrum, or MolecularFamily - v: the second object, either a GCF, Spectrum, or MolecularFamily - - Returns: - A dictionary of scoring methods and their data for the link between the two objects, or - None if there is no link between the two objects. + file: the file to write the links to. Examples: - >>> lg.get_link_data(gcf, spectrum) - {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} + >>> lg.print_links("links.tsv") """ - return self._g.get_edge_data(u, v) # type: ignore + table_data = self.get_table_data() + headers = table_data[0].keys() + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for row in table_data: + f.write("\t".join(str(row[h]) for h in headers) + "\n") def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -> LinkGraph: """Return a new LinkGraph object with the filtered links between the given objects. 
@@ -281,23 +258,66 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) - return lg - @validate_u - def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None: - """Filter the links for a given object and add them to the new LinkGraph object.""" - try: - links = self[u] - except KeyError: - pass - else: - for node2, value in links.items(): - lg.add_link(u, node2, **value) + @validate_uv + def get_link_data( + self, + u: Entity, + v: Entity, + ) -> LINK_DATA | None: + """Get the data for a link between two objects. + + Args: + u: the first object, either a GCF, Spectrum, or MolecularFamily + v: the second object, either a GCF, Spectrum, or MolecularFamily + + Returns: + A dictionary of scoring methods and their data for the link between the two objects, or + None if there is no link between the two objects. + + Examples: + >>> lg.get_link_data(gcf, spectrum) + {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} + """ + return self._g.get_edge_data(u, v) # type: ignore + + def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: + """Generate the table data for the LinkGraph. + + This method iterates over the links in the LinkGraph and constructs a table + containing information about genomic and metabolomic objects, as well as their + associated scores. Each row in the table represents a link between a genomic + object and a metabolomic object. + + Args: + display_limit (int | None): The maximum number of rows to include in the + table. If None, all rows are included. + + Returns: + A list of dictionaries containing the table data. 
+ """ + table_data = [] + for index, link in enumerate(self.links, start=1): + table_data.append(self.link_to_dict(link, index)) + if display_limit is not None and index == display_limit: + break + return table_data @validate_uv - def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: - """Filter the links between two objects and add them to the new LinkGraph object.""" - link_data = self.get_link_data(u, v) - if link_data is not None: - lg.add_link(u, v, **link_data) + def has_link(self, u: Entity, v: Entity) -> bool: + """Check if there is a link between two objects. + + Args: + u: the first object, either a GCF, Spectrum, or MolecularFamily + v: the second object, either a GCF, Spectrum, or MolecularFamily + + Returns: + True if there is a link between the two objects, False otherwise + + Examples: + >>> lg.has_link(gcf, spectrum) + True + """ + return self._g.has_edge(u, v) @staticmethod def link_to_dict(link: LINK, index: int) -> dict[str, Any]: @@ -333,27 +353,23 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]: "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: - """Generate the table data for the LinkGraph. - - This method iterates over the links in the LinkGraph and constructs a table - containing information about genomic and metabolomic objects, as well as their - associated scores. Each row in the table represents a link between a genomic - object and a metabolomic object. - - Args: - display_limit (int | None): The maximum number of rows to include in the - table. If None, all rows are included. 
+ @validate_u + def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None: + """Filter the links for a given object and add them to the new LinkGraph object.""" + try: + links = self[u] + except KeyError: + pass + else: + for node2, value in links.items(): + lg.add_link(u, node2, **value) - Returns: - A list of dictionaries containing the table data. - """ - table_data = [] - for index, link in enumerate(self.links, start=1): - table_data.append(self.link_to_dict(link, index)) - if display_limit is not None and index == display_limit: - break - return table_data + @validate_uv + def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: + """Filter the links between two objects and add them to the new LinkGraph object.""" + link_data = self.get_link_data(u, v) + if link_data is not None: + lg.add_link(u, v, **link_data) def _get_table_repr(self, display_limit: int | None = 60) -> str: """Generate a table representation of the LinkGraph. @@ -378,19 +394,3 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: table += f"\n{truncated_info}" return table - - def export_links(self, file: str | PathLike) -> None: - """Exports the links in the LinkGraph to a file. - - Args: - file: the file to write the links to. 
- - Examples: - >>> lg.print_links("links.tsv") - """ - table_data = self.get_table_data() - headers = table_data[0].keys() - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for row in table_data: - f.write("\t".join(str(row[h]) for h in headers) + "\n") From 328968358e4c12ceb38b1d8fdbbd3b699857144d Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 19:22:41 +0100 Subject: [PATCH 16/42] restore the order of already existing functions and methods --- src/nplinker/genomics/bgc.py | 24 ++--- src/nplinker/nplinker.py | 138 ++++++++++++++--------------- src/nplinker/scoring/link_graph.py | 78 ++++++++-------- 3 files changed, 120 insertions(+), 120 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 6dfd6c66..9b544160 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -117,6 +117,18 @@ def __reduce__(self) -> tuple: """Reduce function for pickling.""" return (self.__class__, (self.id, *self.product_prediction), self.__dict__) + def add_parent(self, gcf: GCF) -> None: + """Add a parent GCF to the BGC. + + Args: + gcf: gene cluster family + """ + gcf.add_bgc(self) + + def detach_parent(self, gcf: GCF) -> None: + """Remove a parent GCF.""" + gcf.detach_bgc(self) + @property def strain(self) -> Strain | None: """Get the strain of the BGC.""" @@ -150,18 +162,6 @@ def bigscape_classes(self) -> set[str | None]: """ return {p.bigscape_class for p in self.parents} - def add_parent(self, gcf: GCF) -> None: - """Add a parent GCF to the BGC. - - Args: - gcf: gene cluster family - """ - gcf.add_bgc(self) - - def detach_parent(self, gcf: GCF) -> None: - """Remove a parent GCF.""" - gcf.detach_bgc(self) - def is_mibig(self) -> bool: """Check if the BGC is a MIBiG reference BGC or not. 
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 79dffcbe..99e139bf 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -168,50 +168,34 @@ def scoring_methods(self) -> list[str]: """Get names of all valid scoring methods.""" return list(self._valid_scoring_methods.keys()) - def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: - """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. + def load_data(self): + """Load all data from files into memory. - Args: - objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. - filename (str): The name of the file where the data will be saved. - """ - headers = objects[0].to_dict().keys() - with open(self._output_dir / filename, "w") as f: - f.write("\t".join(headers) + "\n") - for obj in objects: - row_data = obj.to_dict() - formatted_row = [] - for header in headers: - item = row_data.get(header, "") - # Convert list, tuple, set to comma-separated string - if isinstance(item, (list, tuple, set)): - formatted_row.append(", ".join(map(str, item))) - # Convert dict to comma-separated string - elif isinstance(item, dict): - formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) - # Convert non-empty value to string - elif item: - formatted_row.append(str(item)) - # Convert empty value to empty string - else: - formatted_row.append("") - f.write("\t".join(formatted_row) + "\n") + This method is a convenience function that calls the + [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files + (download, generate and/or validate data) in the [correct directory structure][working-directory-structure], + and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data + from the files into memory. 
- def export_results(self, lg: LinkGraph | None = None) -> None: - """Exports the results to the output directory in tab-separated format. + The loaded data is stored in various data containers for easy access, e.g. + [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects, + [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc. + """ + arranger = DatasetArranger(self.config) + arranger.arrange() + loader = DatasetLoader(self.config) + loader.load() - This method exports genomics and metabolomics data to their respective - TSV files in the specified output directory. If a LinkGraph object is - provided, it also exports the links data to a TSV file. + self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs} + self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs} + self._spec_dict = {spec.id: spec for spec in loader.spectra} + self._mf_dict = {mf.id: mf for mf in loader.mfs} - Args: - lg (LinkGraph | None): An optional LinkGraph object. If provided, - the links data will be exported to 'links.tsv'. - """ - self.export_objects(self.bgcs, "genomics_data.tsv") - self.export_objects(self.spectra, "metabolomics_data.tsv") - if lg is not None: - lg.export_links(self._output_dir / "links.tsv") + self._mibig_bgcs = loader.mibig_bgcs + self._strains = loader.strains + self._product_types = loader.product_types + self._chem_classes = loader.chem_classes + self._class_matches = loader.class_matches @overload def get_links( @@ -297,35 +281,6 @@ def get_links( return scoring.get_links(*objects, **scoring_params) - def load_data(self): - """Load all data from files into memory. - - This method is a convenience function that calls the - [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files - (download, generate and/or validate data) in the [correct directory structure][working-directory-structure], - and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data - from the files into memory. 
- - The loaded data is stored in various data containers for easy access, e.g. - [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects, - [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc. - """ - arranger = DatasetArranger(self.config) - arranger.arrange() - loader = DatasetLoader(self.config) - loader.load() - - self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs} - self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs} - self._spec_dict = {spec.id: spec for spec in loader.spectra} - self._mf_dict = {mf.id: mf for mf in loader.mfs} - - self._mibig_bgcs = loader.mibig_bgcs - self._strains = loader.strains - self._product_types = loader.product_types - self._chem_classes = loader.chem_classes - self._class_matches = loader.class_matches - def lookup_bgc(self, id: str) -> BGC | None: """Get the BGC object with the given ID. @@ -400,3 +355,48 @@ def save_data( data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links) with open(file, "wb") as f: pickle.dump(data, f) + + def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: + """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. + + Args: + objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. + filename (str): The name of the file where the data will be saved. 
+ """ + headers = objects[0].to_dict().keys() + with open(self._output_dir / filename, "w") as f: + f.write("\t".join(headers) + "\n") + for obj in objects: + row_data = obj.to_dict() + formatted_row = [] + for header in headers: + item = row_data.get(header, "") + # Convert list, tuple, set to comma-separated string + if isinstance(item, (list, tuple, set)): + formatted_row.append(", ".join(map(str, item))) + # Convert dict to comma-separated string + elif isinstance(item, dict): + formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) + # Convert non-empty value to string + elif item: + formatted_row.append(str(item)) + # Convert empty value to empty string + else: + formatted_row.append("") + f.write("\t".join(formatted_row) + "\n") + + def export_results(self, lg: LinkGraph | None = None) -> None: + """Exports the results to the output directory in tab-separated format. + + This method exports genomics and metabolomics data to their respective + TSV files in the specified output directory. If a LinkGraph object is + provided, it also exports the links data to a TSV file. + + Args: + lg (LinkGraph | None): An optional LinkGraph object. If provided, + the links data will be exported to 'links.tsv'. + """ + self.export_objects(self.bgcs, "genomics_data.tsv") + self.export_objects(self.spectra, "metabolomics_data.tsv") + if lg is not None: + lg.export_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index e01dbc59..f7690013 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -198,6 +198,45 @@ def add_link( self._g.add_edge(u, v, **data) + @validate_uv + def has_link(self, u: Entity, v: Entity) -> bool: + """Check if there is a link between two objects. 
+ + Args: + u: the first object, either a GCF, Spectrum, or MolecularFamily + v: the second object, either a GCF, Spectrum, or MolecularFamily + + Returns: + True if there is a link between the two objects, False otherwise + + Examples: + >>> lg.has_link(gcf, spectrum) + True + """ + return self._g.has_edge(u, v) + + @validate_uv + def get_link_data( + self, + u: Entity, + v: Entity, + ) -> LINK_DATA | None: + """Get the data for a link between two objects. + + Args: + u: the first object, either a GCF, Spectrum, or MolecularFamily + v: the second object, either a GCF, Spectrum, or MolecularFamily + + Returns: + A dictionary of scoring methods and their data for the link between the two objects, or + None if there is no link between the two objects. + + Examples: + >>> lg.get_link_data(gcf, spectrum) + {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} + """ + return self._g.get_edge_data(u, v) # type: ignore + def export_links(self, file: str | PathLike) -> None: """Exports the links in the LinkGraph to a file. @@ -258,28 +297,6 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) - return lg - @validate_uv - def get_link_data( - self, - u: Entity, - v: Entity, - ) -> LINK_DATA | None: - """Get the data for a link between two objects. - - Args: - u: the first object, either a GCF, Spectrum, or MolecularFamily - v: the second object, either a GCF, Spectrum, or MolecularFamily - - Returns: - A dictionary of scoring methods and their data for the link between the two objects, or - None if there is no link between the two objects. - - Examples: - >>> lg.get_link_data(gcf, spectrum) - {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} - """ - return self._g.get_edge_data(u, v) # type: ignore - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: """Generate the table data for the LinkGraph. 
@@ -302,23 +319,6 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any break return table_data - @validate_uv - def has_link(self, u: Entity, v: Entity) -> bool: - """Check if there is a link between two objects. - - Args: - u: the first object, either a GCF, Spectrum, or MolecularFamily - v: the second object, either a GCF, Spectrum, or MolecularFamily - - Returns: - True if there is a link between the two objects, False otherwise - - Examples: - >>> lg.has_link(gcf, spectrum) - True - """ - return self._g.has_edge(u, v) - @staticmethod def link_to_dict(link: LINK, index: int) -> dict[str, Any]: """Convert a link to a dictionary representation. From d2272e2ffe7ad32738fc04affd4612536364efc5 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 20:10:01 +0100 Subject: [PATCH 17/42] make dicts json compatible --- src/nplinker/genomics/bgc.py | 18 +++++++++--------- src/nplinker/metabolomics/spectrum.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 9b544160..c61d7942 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -177,28 +177,28 @@ def is_mibig(self) -> bool: def to_dict(self) -> dict[str, Any]: """Convert the BGC object to a dictionary for exporting results. - This method compiles relevant information from the BGC object and formats it into a dictionary. + Compiles relevant information from the BGC object and formats it into a dictionary. Each key-value pair in the dictionary represents a specific attribute of the BGC. Returns: A dictionary containing the following key-value pairs: - - GCF_id (set): A set of GCF IDs. - - GCF_bigscape_class (set): A set of BiG-SCAPE classes. + - GCF_id (list[str]): A list of GCF IDs. + - GCF_bigscape_class (list[str | None]): A list of BiG-SCAPE classes. - strain_id (str | None): The ID of the strain. - description (str | None): A description of the BGC. 
- BGC_name (str): The name of the BGC. - - product_prediction (tuple): (predicted) natural products or product classes of the BGC. - - mibig_bgc_class (tuple[str] | None): MIBiG biosynthetic classes to which the BGC belongs. + - product_prediction (list[str]): (predicted) products or product classes of the BGC. + - mibig_bgc_class (list[str] | None): MIBiG biosynthetic classes. - antismash_id (str | None): The antiSMASH ID. - - antismash_region (int | None): The antiSMASH region. + - antismash_region (int | None): The antiSMASH region number. """ return { - "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, - "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, + "GCF_id": [gcf.id for gcf in self.parents if gcf.id is not None], + "GCF_bigscape_class": [bsc for bsc in self.bigscape_classes if bsc is not None], "strain_id": self.strain.id if self.strain is not None else None, "description": self.description, "BGC_name": self.id, - "product_prediction": self.product_prediction, + "product_prediction": list(self.product_prediction), "mibig_bgc_class": self.mibig_bgc_class, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index e0e10e6d..6fccf47b 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -113,7 +113,7 @@ def to_dict(self) -> dict[str, Any]: - "rt" (float): The retention time, rounded to three decimal places. - "molecular_family" (str | None ): The identifier of the molecular family. - "gnps_id" (str | None ): The GNPS identifier. - - "gnps_annotations" (dict): A dictionary of GNPS annotations. + - "gnps_annotations" (dict[str, str]): A dictionary of GNPS annotations. 
""" return { "spectrum_id": self.id, From cb49209bbba18e47c1d05b5a65b2a325de29404c Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 20:16:57 +0100 Subject: [PATCH 18/42] rename functions and variables --- src/nplinker/nplinker.py | 12 ++--- src/nplinker/scoring/link_graph.py | 84 +++++++++++++++--------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 99e139bf..bc03fde7 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -356,8 +356,8 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: - """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. + def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[BGC], filename: str) -> None: + """Exports a list of BGC or Spectrum objects to a specified file in tab-separated format. Args: objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. @@ -385,7 +385,7 @@ def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> No formatted_row.append("") f.write("\t".join(formatted_row) + "\n") - def export_results(self, lg: LinkGraph | None = None) -> None: + def to_tsv(self, lg: LinkGraph | None = None) -> None: """Exports the results to the output directory in tab-separated format. This method exports genomics and metabolomics data to their respective @@ -396,7 +396,7 @@ def export_results(self, lg: LinkGraph | None = None) -> None: lg (LinkGraph | None): An optional LinkGraph object. If provided, the links data will be exported to 'links.tsv'. 
""" - self.export_objects(self.bgcs, "genomics_data.tsv") - self.export_objects(self.spectra, "metabolomics_data.tsv") + self.objects_to_tsv(self.bgcs, "genomics_data.tsv") + self.objects_to_tsv(self.spectra, "metabolomics_data.tsv") if lg is not None: - lg.export_links(self._output_dir / "links.tsv") + lg.to_tsv(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index f7690013..32ed290a 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -237,22 +237,6 @@ def get_link_data( """ return self._g.get_edge_data(u, v) # type: ignore - def export_links(self, file: str | PathLike) -> None: - """Exports the links in the LinkGraph to a file. - - Args: - file: the file to write the links to. - - Examples: - >>> lg.print_links("links.tsv") - """ - table_data = self.get_table_data() - headers = table_data[0].keys() - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for row in table_data: - f.write("\t".join(str(row[h]) for h in headers) + "\n") - def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -> LinkGraph: """Return a new LinkGraph object with the filtered links between the given objects. @@ -297,28 +281,6 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) - return lg - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: - """Generate the table data for the LinkGraph. - - This method iterates over the links in the LinkGraph and constructs a table - containing information about genomic and metabolomic objects, as well as their - associated scores. Each row in the table represents a link between a genomic - object and a metabolomic object. - - Args: - display_limit (int | None): The maximum number of rows to include in the - table. If None, all rows are included. - - Returns: - A list of dictionaries containing the table data. 
- """ - table_data = [] - for index, link in enumerate(self.links, start=1): - table_data.append(self.link_to_dict(link, index)) - if display_limit is not None and index == display_limit: - break - return table_data - @staticmethod def link_to_dict(link: LINK, index: int) -> dict[str, Any]: """Convert a link to a dictionary representation. @@ -338,9 +300,9 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]: - rosetta_score (float | str): The Rosetta score, rounded to 2 decimal places. """ u, v, data = link - genomic_object_classes = (GCF,) - genomic_object = u if isinstance(u, genomic_object_classes) else v - metabolomic_object = v if isinstance(u, genomic_object_classes) else u + genomic_types = (GCF,) + genomic_object = u if isinstance(u, genomic_types) else v + metabolomic_object = v if isinstance(u, genomic_types) else u metcalf_score = data.get("metcalf") rosetta_score = data.get("rosetta") return { @@ -353,6 +315,22 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]: "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } + def to_tsv(self, file: str | PathLike) -> None: + """Exports the links in the LinkGraph to a file in tab-separated format. + + Args: + file: the file to write the links to. + + Examples: + >>> lg.print_links("links.tsv") + """ + table_data = self._links_to_dicts() + headers = table_data[0].keys() + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for row in table_data: + f.write("\t".join(str(row[h]) for h in headers) + "\n") + @validate_u def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None: """Filter the links for a given object and add them to the new LinkGraph object.""" @@ -383,7 +361,7 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: of links is appended. 
""" table = tabulate( - self.get_table_data(display_limit), + self._links_to_dicts(display_limit), headers="keys", tablefmt="github", stralign="right", @@ -394,3 +372,25 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: table += f"\n{truncated_info}" return table + + def _links_to_dicts(self, display_limit: int | None = None) -> list[dict[str, Any]]: + """Generate the table data for the LinkGraph. + + This method iterates over the links in the LinkGraph and constructs a table + containing information about genomic and metabolomic objects, as well as their + associated scores. Each row in the table represents a link between a genomic + object and a metabolomic object. + + Args: + display_limit (int | None): The maximum number of rows to include in the + table. If None, all rows are included. + + Returns: + A list of dictionaries containing the table data. + """ + link_dicts = [] + for index, link in enumerate(self.links, start=1): + link_dicts.append(self.link_to_dict(link, index)) + if display_limit is not None and index == display_limit: + break + return link_dicts From 6a4da5f0761a8b4388d9b4ae5e62a8af9b7c79d2 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 20:31:36 +0100 Subject: [PATCH 19/42] refactor: changed the place when the index is added to the link dict --- src/nplinker/scoring/link_graph.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 32ed290a..45ba5b30 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -282,16 +282,14 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) - return lg @staticmethod - def link_to_dict(link: LINK, index: int) -> dict[str, Any]: + def link_to_dict(link: LINK) -> dict[str, Any]: """Convert a link to a dictionary representation. Args: link: A tuple containing the link information (u, v, data). 
- index: The index of the link. Returns: A dictionary containing the link information with the following keys: - - index (int): The index of the link. - genomic_object_id (str): The ID of the genomic object. - genomic_object_type (str): The type of the genomic object. - metabolomic_object_id (str): The ID of the metabolomic object. @@ -306,7 +304,6 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]: metcalf_score = data.get("metcalf") rosetta_score = data.get("rosetta") return { - "index": index, "genomic_object_id": genomic_object.id, "genomic_object_type": genomic_object.__class__.__name__, "metabolomic_object_id": metabolomic_object.id, @@ -388,9 +385,8 @@ def _links_to_dicts(self, display_limit: int | None = None) -> list[dict[str, An Returns: A list of dictionaries containing the table data. """ + links = self.links[:display_limit] if display_limit else self.links link_dicts = [] - for index, link in enumerate(self.links, start=1): - link_dicts.append(self.link_to_dict(link, index)) - if display_limit is not None and index == display_limit: - break + for idx, link in enumerate(links): + link_dicts.append({"index": idx + 1, **self.link_to_dict(link)}) return link_dicts From edcc7db0d7b97be459d14b77f2768191db54a9cc Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:08:04 +0100 Subject: [PATCH 20/42] use csv package to write the tabular output files --- src/nplinker/nplinker.py | 37 ++++++++++++++++-------------- src/nplinker/scoring/link_graph.py | 13 ++++++----- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index bc03fde7..16713f40 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -1,4 +1,5 @@ from __future__ import annotations +import csv import logging import pickle from collections.abc import Sequence @@ -356,34 +357,36 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def objects_to_tsv(self, objects: Sequence[BGC] | 
Sequence[BGC], filename: str) -> None: + def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: str) -> None: """Exports a list of BGC or Spectrum objects to a specified file in tab-separated format. Args: - objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. + objects (list): A list of BGC or a list of Spectrum objects to be exported. filename (str): The name of the file where the data will be saved. """ + if not objects: + raise ValueError("No objects provided to export") + headers = objects[0].to_dict().keys() - with open(self._output_dir / filename, "w") as f: - f.write("\t".join(headers) + "\n") + with open(self._output_dir / filename, "w", newline="") as outfile: + writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t") + writer.writeheader() for obj in objects: - row_data = obj.to_dict() - formatted_row = [] + row = obj.to_dict() for header in headers: - item = row_data.get(header, "") + value = row[header] # Convert list, tuple, set to comma-separated string - if isinstance(item, (list, tuple, set)): - formatted_row.append(", ".join(map(str, item))) + if isinstance(value, (list, tuple, set)): + row[header] = ", ".join(map(str, value)) # Convert dict to comma-separated string - elif isinstance(item, dict): - formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) - # Convert non-empty value to string - elif item: - formatted_row.append(str(item)) - # Convert empty value to empty string + elif isinstance(value, dict): + row[header] = ", ".join([f"{k}:{v}" for k, v in value.items()]) + # Convert anything else to string else: - formatted_row.append("") - f.write("\t".join(formatted_row) + "\n") + row[header] = str(value) if value else "" + # Replace tabs with 4 spaces + row[header] = row[header].replace("\t", " ") + writer.writerow(row) def to_tsv(self, lg: LinkGraph | None = None) -> None: """Exports the results to the output directory in tab-separated format. 
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 45ba5b30..d1f3cf4b 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -1,4 +1,5 @@ from __future__ import annotations +import csv from collections.abc import Sequence from functools import wraps from os import PathLike @@ -313,20 +314,20 @@ def link_to_dict(link: LINK) -> dict[str, Any]: } def to_tsv(self, file: str | PathLike) -> None: - """Exports the links in the LinkGraph to a file in tab-separated format. + """Exports the links in the LinkGraph to a file in tab-separated format. Args: file: the file to write the links to. Examples: - >>> lg.print_links("links.tsv") + >>> lg.to_tsv("links.tsv") """ table_data = self._links_to_dicts() headers = table_data[0].keys() - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for row in table_data: - f.write("\t".join(str(row[h]) for h in headers) + "\n") + with open(file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=headers, delimiter="\t") + writer.writeheader() + writer.writerows(table_data) @validate_u def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None: From 05f9f76ef26847b54554fd582ca27caa7c424245 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:12:54 +0100 Subject: [PATCH 21/42] make sure all elements of the input list have the same type of data. 
--- src/nplinker/nplinker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 16713f40..55450a24 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -367,6 +367,11 @@ def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: if not objects: raise ValueError("No objects provided to export") + # Ensure all elements in the list are of the same type + obj_type = type(objects[0]) + if not all(isinstance(obj, obj_type) for obj in objects): + raise TypeError("All objects in the list must be of the same type") + headers = objects[0].to_dict().keys() with open(self._output_dir / filename, "w", newline="") as outfile: writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t") From bff7731c7ae1593091897d6590b4789bdaec9067 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:20:01 +0100 Subject: [PATCH 22/42] shorten to long doc string lines, correct some doc strings --- src/nplinker/scoring/link_graph.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index d1f3cf4b..5ee61aa7 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -79,17 +79,17 @@ def __init__(self) -> None: Display the empty LinkGraph object: >>> lg - | | Genomic Object Type | Genomic Object ID | Metabolomic Object Type | Metabolomic Object ID | Metcalf Score | Rosetta Score | - |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------| + | index | genomic_object_id | genomic_object_type | metabolomic_object_id | metabolomic_object_type | metcalf_score | rosetta_score | + |---------|---------------------|-----------------------|-------------------------|---------------------------|-----------------|-----------------| Add a link between a GCF and a 
Spectrum object: >>> lg.add_link(gcf, spectrum, metcalf=Score("metcalf", 1.0, {"cutoff": 0.5})) Display all links in LinkGraph object: >>> lg - | | Genomic Object Type | Genomic Object ID | Metabolomic Object Type | Metabolomic Object ID | Metcalf Score | Rosetta Score | - |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------| - | 1 | GCF | 1 | Spectrum | 1 | 1.00 | - | + | index | genomic_object_id | genomic_object_type | metabolomic_object_id | metabolomic_object_type | metcalf_score | rosetta_score | + |---------|---------------------|-----------------------|-------------------------|---------------------------|-----------------|-----------------| + | 1 | 1 | GCF | 1 | Spectrum | 1.00 | | Get all links for a given object: >>> lg[gcf] @@ -117,7 +117,7 @@ def __init__(self) -> None: >>> new_lg = lg.filter([gcf1, gcf2], [spectrum1, spectrum2]) Export the links to a file: - >>> lg.export_links("links.tsv") + >>> lg.to_tsv("links.tsv") """ self._g: Graph = Graph() @@ -354,9 +354,9 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: display_limit: The maximum number of links to display in the table. Defaults to 60. Returns: - str: A string representation of the table in GitHub-flavored markdown format. If the number of links - exceeds the display limit, the table is truncated and an additional line indicating the total number - of links is appended. + str: A string representation of the table in GitHub-flavored markdown format. If the + number of links exceeds the display limit, the table is truncated and an additional + line indicating the total number of links is appended. 
""" table = tabulate( self._links_to_dicts(display_limit), From d4bf9fb2d277424faa52181db9750a85d000c322 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:32:05 +0100 Subject: [PATCH 23/42] tests: adapted the test to the changes --- tests/unit/data/justafile.ipynb | 131 ++++++++++++++++++++++++++ tests/unit/genomics/test_bgc.py | 20 ++-- tests/unit/scoring/test_link_graph.py | 12 +-- 3 files changed, 150 insertions(+), 13 deletions(-) create mode 100644 tests/unit/data/justafile.ipynb diff --git a/tests/unit/data/justafile.ipynb b/tests/unit/data/justafile.ipynb new file mode 100644 index 00000000..43a5453b --- /dev/null +++ b/tests/unit/data/justafile.ipynb @@ -0,0 +1,131 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from nplinker.genomics.antismash import AntismashBGCLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "loader = AntismashBGCLoader(\"antismash\")\n", + "mapping = loader.get_genome_bgcs_mapping()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "isinstance(mapping, dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mapping[\"GCF_000514515.1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/plain": [ + "'NZ_AZWB01000006.region001'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mapping[\"GCF_000514515.1\"][-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "npl_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py index 9706e961..71f173ba 100644 --- a/tests/unit/genomics/test_bgc.py +++ b/tests/unit/genomics/test_bgc.py @@ -32,10 +32,10 @@ def test_to_dict(): bgc.description = "Sample description" dict_repr = bgc.to_dict() - assert dict_repr["GCF_id"] == set() - assert dict_repr["GCF_bigscape_class"] == set() + assert dict_repr["GCF_id"] == list() + assert dict_repr["GCF_bigscape_class"] == list() assert dict_repr["BGC_name"] == "BGC0000001" - assert dict_repr["product_prediction"] == ("Polyketide", "NRP") + assert dict_repr["product_prediction"] == ["Polyketide", "NRP"] assert dict_repr["mibig_bgc_class"] is None assert dict_repr["description"] == "Sample description" assert dict_repr["strain_id"] == "sample_strain" @@ -43,12 +43,18 @@ def test_to_dict(): assert dict_repr["antismash_region"] is None bgc.add_parent(GCF("1")) - bgc.mibig_bgc_class = ("NRP",) + bgc.mibig_bgc_class = [ + "NRP", + ] bgc.antismash_id = "ABC_0001" bgc.antismash_region = 1 dict_repr = bgc.to_dict() - assert dict_repr["GCF_id"] == {"1"} - assert dict_repr["GCF_bigscape_class"] == set() - assert dict_repr["mibig_bgc_class"] == ("NRP",) + assert dict_repr["GCF_id"] == [ + "1", + ] + 
assert dict_repr["GCF_bigscape_class"] == list() + assert dict_repr["mibig_bgc_class"] == [ + "NRP", + ] assert dict_repr["antismash_id"] == "ABC_0001" assert dict_repr["antismash_region"] == 1 diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index f1542338..32e73f7f 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -116,10 +116,8 @@ def test_filter(gcfs, spectra, score): def test_link_to_dict(lg, gcfs, spectra, score): link = lg.links[0] - index = 1 - dict_repr = lg.link_to_dict(link, index) + dict_repr = lg.link_to_dict(link) assert type(dict_repr) is dict - assert dict_repr["index"] == 1 assert dict_repr["genomic_object_type"] == gcfs[0].__class__.__name__ assert dict_repr["genomic_object_id"] == gcfs[0].id assert dict_repr["metabolomic_object_type"] == spectra[0].__class__.__name__ @@ -128,15 +126,17 @@ def test_link_to_dict(lg, gcfs, spectra, score): assert dict_repr["rosetta_score"] == "" -def test_get_table_data(lg, gcfs, spectra, score): +def test__links_to_dicts(lg, gcfs, spectra, score): # add a second link lg.add_link(gcfs[1], spectra[1], metcalf=score) - table_data = lg.get_table_data() + table_data = lg._links_to_dicts() assert type(table_data) is list assert type(table_data[0]) is dict assert len(table_data) == 2 + assert table_data[0]["index"] == 1 + assert table_data[1]["index"] == 2 display_limit = 1 - table_data = lg.get_table_data(display_limit) + table_data = lg._links_to_dicts(display_limit) assert len(table_data) == 1 From 2c05efbbc0b7c511beec865241e7cfe8024c5cab Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:38:40 +0100 Subject: [PATCH 24/42] remove a file that was committed by accident --- tests/unit/data/justafile.ipynb | 131 -------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 tests/unit/data/justafile.ipynb diff --git a/tests/unit/data/justafile.ipynb b/tests/unit/data/justafile.ipynb deleted file 
mode 100644 index 43a5453b..00000000 --- a/tests/unit/data/justafile.ipynb +++ /dev/null @@ -1,131 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from nplinker.genomics.antismash import AntismashBGCLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "loader = AntismashBGCLoader(\"antismash\")\n", - "mapping = loader.get_genome_bgcs_mapping()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "isinstance(mapping, dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(mapping)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "20" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(mapping[\"GCF_000514515.1\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'NZ_AZWB01000006.region001'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping[\"GCF_000514515.1\"][-1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "npl_dev", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - 
"name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 32d78c3820d291b99bae29d0a04689b40d8d183f Mon Sep 17 00:00:00 2001 From: Annette Lien <70581832+liannette@users.noreply.github.com> Date: Tue, 19 Nov 2024 19:37:47 +0100 Subject: [PATCH 25/42] Improve docstrings Apply suggestions from code review Co-authored-by: Cunliang Geng --- src/nplinker/genomics/bgc.py | 9 +++------ src/nplinker/scoring/link_graph.py | 7 ++++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index c61d7942..59835df6 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -175,15 +175,12 @@ def is_mibig(self) -> bool: return self.id.startswith("BGC") def to_dict(self) -> dict[str, Any]: - """Convert the BGC object to a dictionary for exporting results. - - Compiles relevant information from the BGC object and formats it into a dictionary. - Each key-value pair in the dictionary represents a specific attribute of the BGC. - + """Convert the BGC object to a dictionary for exporting purpose. Returns: A dictionary containing the following key-value pairs: + - GCF_id (list[str]): A list of GCF IDs. - - GCF_bigscape_class (list[str | None]): A list of BiG-SCAPE classes. + - GCF_bigscape_class (list[str]): A list of BiG-SCAPE classes. - strain_id (str | None): The ID of the strain. - description (str | None): A description of the BGC. - BGC_name (str): The name of the BGC. diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 5ee61aa7..7dddf5fd 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -291,6 +291,7 @@ def link_to_dict(link: LINK) -> dict[str, Any]: Returns: A dictionary containing the link information with the following keys: + - genomic_object_id (str): The ID of the genomic object. 
- genomic_object_type (str): The type of the genomic object. - metabolomic_object_id (str): The ID of the metabolomic object. @@ -314,10 +315,10 @@ def link_to_dict(link: LINK) -> dict[str, Any]: } def to_tsv(self, file: str | PathLike) -> None: - """Exports the links in the LinkGraph to a file in tab-separated format. + """Exports the links in the LinkGraph to a TSV file. Args: - file: the file to write the links to. + file: the path to the output TSV file. Examples: >>> lg.to_tsv("links.tsv") @@ -354,7 +355,7 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: display_limit: The maximum number of links to display in the table. Defaults to 60. Returns: - str: A string representation of the table in GitHub-flavored markdown format. If the + A string representation of the table in GitHub-flavored markdown format. If the number of links exceeds the display limit, the table is truncated and an additional line indicating the total number of links is appended. """ From b04226bddf61d9e28ac722b2d298b3f2e365e9e0 Mon Sep 17 00:00:00 2001 From: Annette Lien <70581832+liannette@users.noreply.github.com> Date: Tue, 19 Nov 2024 19:46:16 +0100 Subject: [PATCH 26/42] Improve docstrings Apply suggestions from code review Co-authored-by: Cunliang Geng --- src/nplinker/metabolomics/spectrum.py | 6 ++---- src/nplinker/nplinker.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 6fccf47b..93b9be87 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -100,13 +100,11 @@ def has_strain(self, strain: Strain) -> bool: return strain in self.strains def to_dict(self) -> dict[str, Any]: - """Convert the Spectrum object to a dictionary for exporting results. - - This method compiles relevant information from the Spectrum object into a dictionary format. 
- Each key-value pair in the dictionary represents a specific attribute of the Spectrum Object. + """Convert the Spectrum object to a dictionary for exporting purpose. Returns: A dictionary containing containing the following key-value pairs: + - "spectrum_id" (str): The unique identifier of the spectrum. - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum. - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places. diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 55450a24..956ec5db 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -358,11 +358,11 @@ def save_data( pickle.dump(data, f) def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: str) -> None: - """Exports a list of BGC or Spectrum objects to a specified file in tab-separated format. + """Exports a list of BGC or Spectrum objects to a tsv file. Args: objects (list): A list of BGC or a list of Spectrum objects to be exported. - filename (str): The name of the file where the data will be saved. + filename (str): The name of the output file. """ if not objects: raise ValueError("No objects provided to export") @@ -394,11 +394,13 @@ def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: writer.writerow(row) def to_tsv(self, lg: LinkGraph | None = None) -> None: - """Exports the results to the output directory in tab-separated format. + """Export data to tsv files. - This method exports genomics and metabolomics data to their respective - TSV files in the specified output directory. If a LinkGraph object is - provided, it also exports the links data to a TSV file. + This method exports following data to seperated TSV files: + + - BGC objects: `genomics_data.tsv` + - Spectrum objects: `metabolomics_data.tsv` + - LinkGraph object (if given): `links.tsv` Args: lg (LinkGraph | None): An optional LinkGraph object. 
If provided, From 5fd41085d37594b908dee9144d6dd4c4b37090ef Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 19 Nov 2024 20:14:27 +0100 Subject: [PATCH 27/42] refactor: add method to convert a value to string for tabular output --- src/nplinker/nplinker.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 55450a24..55cee15f 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -372,27 +372,38 @@ def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: if not all(isinstance(obj, obj_type) for obj in objects): raise TypeError("All objects in the list must be of the same type") - headers = objects[0].to_dict().keys() with open(self._output_dir / filename, "w", newline="") as outfile: + headers = objects[0].to_dict().keys() writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t") writer.writeheader() for obj in objects: row = obj.to_dict() - for header in headers: - value = row[header] - # Convert list, tuple, set to comma-separated string - if isinstance(value, (list, tuple, set)): - row[header] = ", ".join(map(str, value)) - # Convert dict to comma-separated string - elif isinstance(value, dict): - row[header] = ", ".join([f"{k}:{v}" for k, v in value.items()]) - # Convert anything else to string - else: - row[header] = str(value) if value else "" - # Replace tabs with 4 spaces - row[header] = row[header].replace("\t", " ") + for key, value in row.items(): + row[key] = self.to_string(value).replace("\t", " ") writer.writerow(row) + @staticmethod + def to_string(value: Any) -> str: + """Convert various types of values to a string. + + Args: + value: The value to be converted to a string. + Can be a list, tuple, set, dict, or any other type. + + Returns: + A string representation of the input value. 
+ """ + # Convert list, tuple, set to comma-separated string + if isinstance(value, (list, tuple, set)): + value = ", ".join(map(str, value)) + # Convert dict to comma-separated string + elif isinstance(value, dict): + value = ", ".join([f"{k}:{v}" for k, v in value.items()]) + # Convert anything else to string + else: + value = str(value) if value else "" + return value + def to_tsv(self, lg: LinkGraph | None = None) -> None: """Exports the results to the output directory in tab-separated format. From 940eb191062fb3f8254b3dffacfb34e2f7d43530 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 20 Nov 2024 15:13:21 +0100 Subject: [PATCH 28/42] improve doctring, add a comment about key order of bgc dict representation --- src/nplinker/genomics/bgc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 59835df6..a3b2e831 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -176,6 +176,7 @@ def is_mibig(self) -> bool: def to_dict(self) -> dict[str, Any]: """Convert the BGC object to a dictionary for exporting purpose. + Returns: A dictionary containing the following key-value pairs: @@ -189,6 +190,7 @@ def to_dict(self) -> dict[str, Any]: - antismash_id (str | None): The antiSMASH ID. - antismash_region (int | None): The antiSMASH region number. 
""" + # Keys are ordered to make the output easier to analyze return { "GCF_id": [gcf.id for gcf in self.parents if gcf.id is not None], "GCF_bigscape_class": [bsc for bsc in self.bigscape_classes if bsc is not None], From e551dcc22b90c6748095549e9de031da8005b072 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 20 Nov 2024 16:11:19 +0100 Subject: [PATCH 29/42] move to_string method to the BGC/Spectrum class, add a to_tabular method --- src/nplinker/genomics/bgc.py | 40 +++++++++++++++++++++++++++ src/nplinker/metabolomics/spectrum.py | 40 +++++++++++++++++++++++++++ src/nplinker/nplinker.py | 32 +++------------------ 3 files changed, 84 insertions(+), 28 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index a3b2e831..3134cfef 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -203,6 +203,46 @@ def to_dict(self) -> dict[str, Any]: "antismash_region": self.antismash_region, } + @staticmethod + def to_string(value: Any) -> str: + """Convert various types of values to a string. + + Args: + value: The value to be converted to a string. + Can be a list, tuple, set, dict, or any other type. + + Returns: + A string representation of the input value. + """ + # Convert list, tuple, set to comma-separated string + if isinstance(value, (list, tuple, set)): + value = ", ".join(map(str, value)) + # Convert dict to comma-separated string + elif isinstance(value, dict): + value = ", ".join([f"{k}:{v}" for k, v in value.items()]) + # Convert anything else to string + else: + value = str(value) if value else "" + return value + + def to_tabular(self, delimiter: str = "\t") -> str: + """Convert the BGC object to a tabular string format. + + Args: + delimiter: The delimiter to use for separating values. Default is tab. + + Returns: + A string representation of the BGC object in tabular format. 
+ """ + values = [self.to_string(value) for value in self.to_dict().values()] + if delimiter == "\t": + values = [value.replace("\t", " ") for value in values] + elif delimiter == ",": + values = [value.replace(",", ";") for value in values] + elif delimiter == ";": + values = [value.replace(";", ":") for value in values] + return delimiter.join(values) + # CG: why not providing whole product but only amino acid as product monomer? # this property is not used in NPLinker core business. @property diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 93b9be87..43861eb9 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -122,3 +122,43 @@ def to_dict(self) -> dict[str, Any]: "gnps_id": self.gnps_id, "gnps_annotations": self.gnps_annotations, } + + @staticmethod + def to_string(value: Any) -> str: + """Convert various types of values to a string. + + Args: + value: The value to be converted to a string. + Can be a list, tuple, set, dict, or any other type. + + Returns: + A string representation of the input value. + """ + # Convert list, tuple, set to comma-separated string + if isinstance(value, (list, tuple, set)): + value = ", ".join(map(str, value)) + # Convert dict to comma-separated string + elif isinstance(value, dict): + value = ", ".join([f"{k}:{v}" for k, v in value.items()]) + # Convert anything else to string + else: + value = str(value) if value else "" + return value + + def to_tabular(self, delimiter: str = "\t") -> str: + """Convert the Spectrum object to a tabular string format. + + Args: + delimiter: The delimiter to use for separating values. Default is tab. + + Returns: + A string representation of the Spectrum object in tabular format. 
+ """ + values = [self.to_string(value) for value in self.to_dict().values()] + if delimiter == "\t": + values = [value.replace("\t", " ") for value in values] + elif delimiter == ",": + values = [value.replace(",", ";") for value in values] + elif delimiter == ";": + values = [value.replace(";", ":") for value in values] + return delimiter.join(values) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 41920cc9..397a5d94 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -374,41 +374,17 @@ def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: with open(self._output_dir / filename, "w", newline="") as outfile: headers = objects[0].to_dict().keys() - writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t") - writer.writeheader() + writer = csv.writer(outfile, delimiter="\t") + writer.writerow(headers) for obj in objects: - row = obj.to_dict() - for key, value in row.items(): - row[key] = self.to_string(value).replace("\t", " ") + row = obj.to_tabular(delimiter="\t").split("\t") writer.writerow(row) - @staticmethod - def to_string(value: Any) -> str: - """Convert various types of values to a string. - - Args: - value: The value to be converted to a string. - Can be a list, tuple, set, dict, or any other type. - - Returns: - A string representation of the input value. - """ - # Convert list, tuple, set to comma-separated string - if isinstance(value, (list, tuple, set)): - value = ", ".join(map(str, value)) - # Convert dict to comma-separated string - elif isinstance(value, dict): - value = ", ".join([f"{k}:{v}" for k, v in value.items()]) - # Convert anything else to string - else: - value = str(value) if value else "" - return value - def to_tsv(self, lg: LinkGraph | None = None) -> None: """Export data to tsv files. 
This method exports following data to seperated TSV files: - + - BGC objects: `genomics_data.tsv` - Spectrum objects: `metabolomics_data.tsv` - LinkGraph object (if given): `links.tsv` From f9ae9f228090dba056466b4e4a57d8d377e9bd87 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 20 Nov 2024 16:11:46 +0100 Subject: [PATCH 30/42] add tests for the to_string method --- tests/unit/genomics/test_bgc.py | 9 +++++++++ tests/unit/metabolomics/test_spectrum.py | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py index 71f173ba..dbbe322c 100644 --- a/tests/unit/genomics/test_bgc.py +++ b/tests/unit/genomics/test_bgc.py @@ -58,3 +58,12 @@ def test_to_dict(): ] assert dict_repr["antismash_id"] == "ABC_0001" assert dict_repr["antismash_region"] == 1 + + +def test_to_string(): + assert BGC.to_string([1, "a"]) == "1, a" + assert BGC.to_string((1, "a")) == "1, a" + assert BGC.to_string({1, "a"}) in ["1, a", "a, 1"] + assert BGC.to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" + assert BGC.to_string(100.2) == "100.2" + assert BGC.to_string(None) == "" diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index d77ea0d4..13eb8c38 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -100,3 +100,13 @@ def __init__(self, id): assert dict_repr["molecular_family"] == "family1" assert dict_repr["gnps_id"] == "GNPS0001" assert dict_repr["gnps_annotations"] == {"annotation1": "value1"} + + +def test_to_string(): + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150) + assert spec.to_string([1, "a"]) == "1, a" + assert spec.to_string((1, "a")) == "1, a" + assert spec.to_string({1, "a"}) in ["1, a", "a, 1"] + assert spec.to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" + assert spec.to_string(100.2) == "100.2" + assert spec.to_string(None) == "" From 
1b00262b18245fc037229f63e59ad5c5705c0e53 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 20 Nov 2024 16:44:00 +0100 Subject: [PATCH 31/42] change to_tabular so it returns a list and not a string --- src/nplinker/genomics/bgc.py | 18 ++++-------------- src/nplinker/metabolomics/spectrum.py | 18 ++++-------------- src/nplinker/nplinker.py | 2 +- 3 files changed, 9 insertions(+), 29 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 3134cfef..31514dd1 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -225,23 +225,13 @@ def to_string(value: Any) -> str: value = str(value) if value else "" return value - def to_tabular(self, delimiter: str = "\t") -> str: - """Convert the BGC object to a tabular string format. - - Args: - delimiter: The delimiter to use for separating values. Default is tab. + def to_tabular(self) -> list: + """Convert the BGC object to a tabular format. Returns: - A string representation of the BGC object in tabular format. + list: A list of strings representing the BGC object in tabular format. """ - values = [self.to_string(value) for value in self.to_dict().values()] - if delimiter == "\t": - values = [value.replace("\t", " ") for value in values] - elif delimiter == ",": - values = [value.replace(",", ";") for value in values] - elif delimiter == ";": - values = [value.replace(";", ":") for value in values] - return delimiter.join(values) + return [self.to_string(value) for value in self.to_dict().values()] # CG: why not providing whole product but only amino acid as product monomer? # this property is not used in NPLinker core business. 
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 43861eb9..312195c2 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -145,20 +145,10 @@ def to_string(value: Any) -> str: value = str(value) if value else "" return value - def to_tabular(self, delimiter: str = "\t") -> str: - """Convert the Spectrum object to a tabular string format. - - Args: - delimiter: The delimiter to use for separating values. Default is tab. + def to_tabular(self) -> list: + """Convert the Spectrum object to a tabular format. Returns: - A string representation of the Spectrum object in tabular format. + list: A list of strings representing the Spectrum object in tabular format. """ - values = [self.to_string(value) for value in self.to_dict().values()] - if delimiter == "\t": - values = [value.replace("\t", " ") for value in values] - elif delimiter == ",": - values = [value.replace(",", ";") for value in values] - elif delimiter == ";": - values = [value.replace(";", ":") for value in values] - return delimiter.join(values) + return [self.to_string(value) for value in self.to_dict().values()] diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 397a5d94..a3d75d21 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -377,7 +377,7 @@ def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: writer = csv.writer(outfile, delimiter="\t") writer.writerow(headers) for obj in objects: - row = obj.to_tabular(delimiter="\t").split("\t") + row = [item.replace("\t", " ") for item in obj.to_tabular()] writer.writerow(row) def to_tsv(self, lg: LinkGraph | None = None) -> None: From 0d6bec396a37ad04d9ee9c25077d740949b2e622 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 25 Nov 2024 18:24:26 +0100 Subject: [PATCH 32/42] refactor: to_tabular returns dict, to_string turned into private func, tabs are replaced in to_tabular --- 
src/nplinker/genomics/bgc.py | 15 +++++++++------ src/nplinker/metabolomics/spectrum.py | 15 +++++++++------ src/nplinker/nplinker.py | 9 ++++----- tests/unit/genomics/test_bgc.py | 10 ++++------ tests/unit/metabolomics/test_spectrum.py | 10 ++++------ 5 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 31514dd1..efca901a 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -204,18 +204,18 @@ def to_dict(self) -> dict[str, Any]: } @staticmethod - def to_string(value: Any) -> str: + def _to_string(value: Any) -> str: """Convert various types of values to a string. Args: value: The value to be converted to a string. - Can be a list, tuple, set, dict, or any other type. + Can be a list, dict, or any other type. Returns: A string representation of the input value. """ # Convert list, tuple, set to comma-separated string - if isinstance(value, (list, tuple, set)): + if isinstance(value, list): value = ", ".join(map(str, value)) # Convert dict to comma-separated string elif isinstance(value, dict): @@ -225,13 +225,16 @@ def to_string(value: Any) -> str: value = str(value) if value else "" return value - def to_tabular(self) -> list: + def to_tabular(self) -> dict[str, Any]: """Convert the BGC object to a tabular format. Returns: - list: A list of strings representing the BGC object in tabular format. + dict: A dictionary representing the BGC object in tabular format. """ - return [self.to_string(value) for value in self.to_dict().values()] + return { + key: self._to_string(value).replace("\t", " ") + for key, value in self.to_dict().items() + } # CG: why not providing whole product but only amino acid as product monomer? # this property is not used in NPLinker core business. 
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 312195c2..d898fd90 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -124,18 +124,18 @@ def to_dict(self) -> dict[str, Any]: } @staticmethod - def to_string(value: Any) -> str: + def _to_string(value: Any) -> str: """Convert various types of values to a string. Args: value: The value to be converted to a string. - Can be a list, tuple, set, dict, or any other type. + Can be a list, dict, or any other type. Returns: A string representation of the input value. """ # Convert list, tuple, set to comma-separated string - if isinstance(value, (list, tuple, set)): + if isinstance(value, list): value = ", ".join(map(str, value)) # Convert dict to comma-separated string elif isinstance(value, dict): @@ -145,10 +145,13 @@ def to_string(value: Any) -> str: value = str(value) if value else "" return value - def to_tabular(self) -> list: + def to_tabular(self) -> dict[str, Any]: """Convert the Spectrum object to a tabular format. Returns: - list: A list of strings representing the Spectrum object in tabular format. + dict: A dictionary representing the Spectrum object in tabular format. 
""" - return [self.to_string(value) for value in self.to_dict().values()] + return { + key: self._to_string(value).replace("\t", " ") + for key, value in self.to_dict().items() + } diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index a3d75d21..1a42d7a1 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -373,12 +373,11 @@ def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: raise TypeError("All objects in the list must be of the same type") with open(self._output_dir / filename, "w", newline="") as outfile: - headers = objects[0].to_dict().keys() - writer = csv.writer(outfile, delimiter="\t") - writer.writerow(headers) + headers = objects[0].to_tabular().keys() + writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t") + writer.writeheader() for obj in objects: - row = [item.replace("\t", " ") for item in obj.to_tabular()] - writer.writerow(row) + writer.writerow(obj.to_tabular()) def to_tsv(self, lg: LinkGraph | None = None) -> None: """Export data to tsv files. 
diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py index dbbe322c..1ce40fd0 100644 --- a/tests/unit/genomics/test_bgc.py +++ b/tests/unit/genomics/test_bgc.py @@ -61,9 +61,7 @@ def test_to_dict(): def test_to_string(): - assert BGC.to_string([1, "a"]) == "1, a" - assert BGC.to_string((1, "a")) == "1, a" - assert BGC.to_string({1, "a"}) in ["1, a", "a, 1"] - assert BGC.to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" - assert BGC.to_string(100.2) == "100.2" - assert BGC.to_string(None) == "" + assert BGC._to_string([1, "a"]) == "1, a" + assert BGC._to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" + assert BGC._to_string(100.2) == "100.2" + assert BGC._to_string(None) == "" diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index 13eb8c38..3ad10ece 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -104,9 +104,7 @@ def __init__(self, id): def test_to_string(): spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150) - assert spec.to_string([1, "a"]) == "1, a" - assert spec.to_string((1, "a")) == "1, a" - assert spec.to_string({1, "a"}) in ["1, a", "a, 1"] - assert spec.to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" - assert spec.to_string(100.2) == "100.2" - assert spec.to_string(None) == "" + assert spec._to_string([1, "a"]) == "1, a" + assert spec._to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" + assert spec._to_string(100.2) == "100.2" + assert spec._to_string(None) == "" From 41757c7fd424ddf08f1f3636c3123bff79b03387 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 15:57:34 +0100 Subject: [PATCH 33/42] fix typing in to_tabular methods --- src/nplinker/genomics/bgc.py | 2 +- src/nplinker/metabolomics/spectrum.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 
efca901a..a3035597 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -225,7 +225,7 @@ def _to_string(value: Any) -> str: value = str(value) if value else "" return value - def to_tabular(self) -> dict[str, Any]: + def to_tabular(self) -> dict[str, str]: """Convert the BGC object to a tabular format. Returns: diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index d898fd90..2a47383f 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -145,7 +145,7 @@ def _to_string(value: Any) -> str: value = str(value) if value else "" return value - def to_tabular(self) -> dict[str, Any]: + def to_tabular(self) -> dict[str, str]: """Convert the Spectrum object to a tabular format. Returns: From b94eddf3f7fca08c3e56401fc07f37b1dd39b794 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 16:01:38 +0100 Subject: [PATCH 34/42] update docstrings and comments --- src/nplinker/genomics/bgc.py | 6 ++++-- src/nplinker/metabolomics/spectrum.py | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index a3035597..4ebac52d 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -209,12 +209,12 @@ def _to_string(value: Any) -> str: Args: value: The value to be converted to a string. - Can be a list, dict, or any other type. + Can be a list, dict, or any other JSON-compatible type. Returns: A string representation of the input value. """ - # Convert list, tuple, set to comma-separated string + # Convert list to comma-separated string if isinstance(value, list): value = ", ".join(map(str, value)) # Convert dict to comma-separated string @@ -230,6 +230,8 @@ def to_tabular(self) -> dict[str, str]: Returns: dict: A dictionary representing the BGC object in tabular format. + The keys can be treated as headers and values are strings in which tabs are removed. 
+ This dict can be exported as a TSV file. """ return { key: self._to_string(value).replace("\t", " ") diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 2a47383f..182227ae 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -129,12 +129,12 @@ def _to_string(value: Any) -> str: Args: value: The value to be converted to a string. - Can be a list, dict, or any other type. + Can be a list, dict, or any other JSON-compatible type. Returns: A string representation of the input value. """ - # Convert list, tuple, set to comma-separated string + # Convert list to comma-separated string if isinstance(value, list): value = ", ".join(map(str, value)) # Convert dict to comma-separated string @@ -149,7 +149,9 @@ def to_tabular(self) -> dict[str, str]: """Convert the Spectrum object to a tabular format. Returns: - dict: A dictionary representing the Spectrum object in tabular format. + dict: A dictionary representing the BGC object in tabular format. + The keys can be treated as headers and values are strings in which tabs are removed. + This dict can be exported as a TSV file. 
""" return { key: self._to_string(value).replace("\t", " ") From 94bcb670597474bfebc945741af09d8e8569ef0a Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 16:35:47 +0100 Subject: [PATCH 35/42] ensure 0 and 0.0 are correctly converted to strings, and not to empty strings --- src/nplinker/genomics/bgc.py | 5 ++++- src/nplinker/metabolomics/spectrum.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 4ebac52d..996e9826 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -220,9 +220,12 @@ def _to_string(value: Any) -> str: # Convert dict to comma-separated string elif isinstance(value, dict): value = ", ".join([f"{k}:{v}" for k, v in value.items()]) + # Convert None to empty string + elif value is None: + value = "" # Convert anything else to string else: - value = str(value) if value else "" + value = str(value) return value def to_tabular(self) -> dict[str, str]: diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 182227ae..d82ae4c2 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -140,9 +140,12 @@ def _to_string(value: Any) -> str: # Convert dict to comma-separated string elif isinstance(value, dict): value = ", ".join([f"{k}:{v}" for k, v in value.items()]) + # Convert None to empty string + elif value is None: + value = "" # Convert anything else to string else: - value = str(value) if value else "" + value = str(value) return value def to_tabular(self) -> dict[str, str]: From 16a56c759a9b2d654bb3fbe0390f6ad98cb626ce Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 16:43:28 +0100 Subject: [PATCH 36/42] change the order of methods --- src/nplinker/genomics/bgc.py | 26 +++++++++++++------------- src/nplinker/metabolomics/spectrum.py | 26 +++++++++++++------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git 
a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 996e9826..ce34573d 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -203,6 +203,19 @@ def to_dict(self) -> dict[str, Any]: "antismash_region": self.antismash_region, } + def to_tabular(self) -> dict[str, str]: + """Convert the BGC object to a tabular format. + + Returns: + dict: A dictionary representing the BGC object in tabular format. + The keys can be treated as headers and values are strings in which tabs are removed. + This dict can be exported as a TSV file. + """ + return { + key: self._to_string(value).replace("\t", " ") + for key, value in self.to_dict().items() + } + @staticmethod def _to_string(value: Any) -> str: """Convert various types of values to a string. @@ -228,19 +241,6 @@ def _to_string(value: Any) -> str: value = str(value) return value - def to_tabular(self) -> dict[str, str]: - """Convert the BGC object to a tabular format. - - Returns: - dict: A dictionary representing the BGC object in tabular format. - The keys can be treated as headers and values are strings in which tabs are removed. - This dict can be exported as a TSV file. - """ - return { - key: self._to_string(value).replace("\t", " ") - for key, value in self.to_dict().items() - } - # CG: why not providing whole product but only amino acid as product monomer? # this property is not used in NPLinker core business. @property diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index d82ae4c2..2674f461 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -123,6 +123,19 @@ def to_dict(self) -> dict[str, Any]: "gnps_annotations": self.gnps_annotations, } + def to_tabular(self) -> dict[str, str]: + """Convert the Spectrum object to a tabular format. + + Returns: + dict: A dictionary representing the BGC object in tabular format. 
+ The keys can be treated as headers and values are strings in which tabs are removed. + This dict can be exported as a TSV file. + """ + return { + key: self._to_string(value).replace("\t", " ") + for key, value in self.to_dict().items() + } + @staticmethod def _to_string(value: Any) -> str: """Convert various types of values to a string. @@ -147,16 +160,3 @@ def _to_string(value: Any) -> str: else: value = str(value) return value - - def to_tabular(self) -> dict[str, str]: - """Convert the Spectrum object to a tabular format. - - Returns: - dict: A dictionary representing the BGC object in tabular format. - The keys can be treated as headers and values are strings in which tabs are removed. - This dict can be exported as a TSV file. - """ - return { - key: self._to_string(value).replace("\t", " ") - for key, value in self.to_dict().items() - } From 183bd5f66c75a76dd5322da6e0be8f24896ab54b Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 17:34:52 +0100 Subject: [PATCH 37/42] remove whitespace in blank lines --- src/nplinker/genomics/bgc.py | 2 +- src/nplinker/scoring/link_graph.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index ce34573d..77f16543 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -215,7 +215,7 @@ def to_tabular(self) -> dict[str, str]: key: self._to_string(value).replace("\t", " ") for key, value in self.to_dict().items() } - + @staticmethod def _to_string(value: Any) -> str: """Convert various types of values to a string. 
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 7dddf5fd..e3653398 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -291,7 +291,7 @@ def link_to_dict(link: LINK) -> dict[str, Any]: Returns: A dictionary containing the link information with the following keys: - + - genomic_object_id (str): The ID of the genomic object. - genomic_object_type (str): The type of the genomic object. - metabolomic_object_id (str): The ID of the metabolomic object. From e2227dffb18ddab82f11ef89ace861e04ff890f5 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 18:07:21 +0100 Subject: [PATCH 38/42] update and add tests --- tests/integration/test_nplinker_local.py | 40 ++++++++++++++++++++ tests/unit/genomics/test_bgc.py | 41 ++++++++++++++++++-- tests/unit/metabolomics/test_spectrum.py | 48 +++++++++++++++++++++--- tests/unit/scoring/test_link_graph.py | 30 +++++++++++++++ 4 files changed, 150 insertions(+), 9 deletions(-) diff --git a/tests/integration/test_nplinker_local.py b/tests/integration/test_nplinker_local.py index 54144dd1..2c27a4ab 100644 --- a/tests/integration/test_nplinker_local.py +++ b/tests/integration/test_nplinker_local.py @@ -1,5 +1,6 @@ import os import pickle +from pathlib import Path import pytest from nplinker.genomics import GCF from nplinker.metabolomics import MolecularFamily @@ -106,3 +107,42 @@ def test_save_data(npl): assert obj1 in mfs else: assert False + + +def test_objects_to_tsv(npl, tmp_path): + tsv_file = tmp_path / "test.tsv" + + # Test objects_to_tsv for BGCs + npl.objects_to_tsv(npl.bgcs, tsv_file) + with open(tsv_file, "r") as f: + lines = f.readlines() + assert len(lines) == len(npl.bgcs) + 1 # +1 for header + + # Test objects_to_tsv for Spectra + npl.objects_to_tsv(npl.spectra, tsv_file) + with open(tsv_file, "r") as f: + lines = f.readlines() + assert len(lines) == len(npl.spectra) + 1 # +1 for header + + +def test_to_tsv(npl): + lg = 
npl.get_links(npl.spectra[:1], "metcalf") + npl.to_tsv(lg) + + # Check the genomics_data.tsv file + genomics_tsv_file = Path(npl.output_dir) / "genomics_data.tsv" + with open(genomics_tsv_file, "r") as f: + lines = f.readlines() + assert len(lines) == len(npl.bgcs) + 1 # +1 for header + + # Check metabolomics_data.tsv file + metabolomics_tsv_file = Path(npl.output_dir) / "metabolomics_data.tsv" + with open(metabolomics_tsv_file, "r") as f: + lines = f.readlines() + assert len(lines) == len(npl.spectra) + 1 # +1 for header + + # Check the links.tsv file + links_tsv_file = Path(npl.output_dir) / "links.tsv" + with open(links_tsv_file, "r") as f: + lines = f.readlines() + assert len(lines) == len(lg.links) + 1 # +1 for header diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py index 1ce40fd0..fd21dd36 100644 --- a/tests/unit/genomics/test_bgc.py +++ b/tests/unit/genomics/test_bgc.py @@ -60,8 +60,43 @@ def test_to_dict(): assert dict_repr["antismash_region"] == 1 -def test_to_string(): - assert BGC._to_string([1, "a"]) == "1, a" +def test__to_string(): + assert BGC._to_string([]) == "" + assert BGC._to_string([1, 2.0, "a"]) == "1, 2.0, a" + assert BGC._to_string(dict()) == "" assert BGC._to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" - assert BGC._to_string(100.2) == "100.2" assert BGC._to_string(None) == "" + assert BGC._to_string(0) == "0" + assert BGC._to_string(0.0) == "0.0" + assert BGC._to_string(100.2) == "100.2" + assert BGC._to_string(False) == "False" + + +def test_to_tabular(): + bgc = BGC("BGC0000001", "Polyketide", "NRP") + bgc.strain = Strain("sample_strain") + bgc.description = "Sample description" + + tabular_repr = bgc.to_tabular() + assert tabular_repr["GCF_id"] == "" + assert tabular_repr["GCF_bigscape_class"] == "" + assert tabular_repr["BGC_name"] == "BGC0000001" + assert tabular_repr["product_prediction"] == "Polyketide, NRP" + assert tabular_repr["mibig_bgc_class"] == "" + assert 
tabular_repr["description"] == "Sample description" + assert tabular_repr["strain_id"] == "sample_strain" + assert tabular_repr["antismash_id"] == "" + assert tabular_repr["antismash_region"] == "" + + bgc.add_parent(GCF("1")) + bgc.mibig_bgc_class = [ + "NRP", + ] + bgc.antismash_id = "ABC_0001" + bgc.antismash_region = 1 + tabular_repr = bgc.to_tabular() + assert tabular_repr["GCF_id"] == "1" + assert tabular_repr["GCF_bigscape_class"] == "" + assert tabular_repr["mibig_bgc_class"] == "NRP" + assert tabular_repr["antismash_id"] == "ABC_0001" + assert tabular_repr["antismash_region"] == "1" diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index 3ad10ece..68c77d12 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -102,9 +102,45 @@ def __init__(self, id): assert dict_repr["gnps_annotations"] == {"annotation1": "value1"} -def test_to_string(): - spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150) - assert spec._to_string([1, "a"]) == "1, a" - assert spec._to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" - assert spec._to_string(100.2) == "100.2" - assert spec._to_string(None) == "" +def test__to_string(): + assert Spectrum._to_string([]) == "" + assert Spectrum._to_string([1, 2.0, "a"]) == "1, 2.0, a" + assert Spectrum._to_string(dict()) == "" + assert Spectrum._to_string({"key1": 1, "key2": "value2"}) == "key1:1, key2:value2" + assert Spectrum._to_string(None) == "" + assert Spectrum._to_string(0) == "0" + assert Spectrum._to_string(0.0) == "0.0" + assert Spectrum._to_string(100.2) == "100.2" + assert Spectrum._to_string(False) == "False" + + +def test_to_tabular(): + """Test the to_tabular method.""" + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"}) + spec.strains.add(Strain("strain1")) + spec.strains.add(Strain("strain2")) + + tabular_repr = spec.to_tabular() + assert tabular_repr["spectrum_id"] == "spec1" + assert 
tabular_repr["num_strains_with_spectrum"] == "2" + assert tabular_repr["precursor_mz"] == "150" + assert tabular_repr["rt"] == "0" + assert tabular_repr["molecular_family"] == "" + assert tabular_repr["gnps_id"] == "" + assert tabular_repr["gnps_annotations"] == "" + + # Test with molecular family + class MockMolecularFamily: + def __init__(self, id): + self.id = id + + spec.family = MockMolecularFamily("family1") + + # Test with gnps information + spec.gnps_id = "GNPS0001" + spec.gnps_annotations = {"key1": "value1", "key2": "value2"} + + tabular_repr = spec.to_tabular() + assert tabular_repr["molecular_family"] == "family1" + assert tabular_repr["gnps_id"] == "GNPS0001" + assert tabular_repr["gnps_annotations"] == "key1:value1, key2:value2" diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index 32e73f7f..85ea247c 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -140,3 +140,33 @@ def test__links_to_dicts(lg, gcfs, spectra, score): display_limit = 1 table_data = lg._links_to_dicts(display_limit) assert len(table_data) == 1 + + +def test_to_tsv(lg, gcfs, mfs, score, tmp_path): + lg.add_link(gcfs[1], mfs[0], metcalf=score) + + tsv_file = tmp_path / "links.tsv" + lg.to_tsv(tsv_file) + + with open(tsv_file, "r") as f: + lines = f.readlines() + + # Check the header + expected_header_names = [ + "index", + "genomic_object_id", + "genomic_object_type", + "metabolomic_object_id", + "metabolomic_object_type", + "metcalf_score", + "rosetta_score", + ] + assert lines[0].rstrip("\n").split("\t") == expected_header_names + + # Check first link data + expected_line = ["1", "gcf1", "GCF", "spectrum1", "Spectrum", "1.0", ""] + assert lines[1].rstrip("\n").split("\t") == expected_line + + # Check second link data + expected_line = ["2", "gcf2", "GCF", "mf1", "MolecularFamily", "1.0", ""] + assert lines[2].rstrip("\n").split("\t") == expected_line From 642c67c279a7ff4d782726fdf3b0402ba88c3945 
Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 18:12:29 +0100 Subject: [PATCH 39/42] change variable name to fix mypy error --- src/nplinker/genomics/bgc.py | 10 +++++----- src/nplinker/metabolomics/spectrum.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 77f16543..8decbb81 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -229,17 +229,17 @@ def _to_string(value: Any) -> str: """ # Convert list to comma-separated string if isinstance(value, list): - value = ", ".join(map(str, value)) + formatted_value = ", ".join(map(str, value)) # Convert dict to comma-separated string elif isinstance(value, dict): - value = ", ".join([f"{k}:{v}" for k, v in value.items()]) + formatted_value = ", ".join([f"{k}:{v}" for k, v in value.items()]) # Convert None to empty string elif value is None: - value = "" + formatted_value = "" # Convert anything else to string else: - value = str(value) - return value + formatted_value = str(value) + return formatted_value # CG: why not providing whole product but only amino acid as product monomer? # this property is not used in NPLinker core business. 
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 2674f461..6928a4f1 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -149,14 +149,14 @@ def _to_string(value: Any) -> str: """ # Convert list to comma-separated string if isinstance(value, list): - value = ", ".join(map(str, value)) + formatted_value = ", ".join(map(str, value)) # Convert dict to comma-separated string elif isinstance(value, dict): - value = ", ".join([f"{k}:{v}" for k, v in value.items()]) + formatted_value = ", ".join([f"{k}:{v}" for k, v in value.items()]) # Convert None to empty string elif value is None: - value = "" + formatted_value = "" # Convert anything else to string else: - value = str(value) - return value + formatted_value = str(value) + return formatted_value From 7cd675f9318eb44ff337acc277250983f035d33d Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 18:32:30 +0100 Subject: [PATCH 40/42] test: trying to fix unit test issue where the spectrum rt is a dict instead of numerical --- tests/unit/metabolomics/test_spectrum.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index 68c77d12..75f74990 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -72,7 +72,14 @@ def test_has_strain(): def test_to_dict(): """Test the to_dict method.""" - spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"}) + spec = Spectrum( + id="spec1", + mz=[100.0, 200.0], + intensity=[0.1, 0.2], + precursor_mz=150.0, + rt=0.0, + metadata={"info": "test"}, + ) spec.strains.add(Strain("strain1")) spec.strains.add(Strain("strain2")) From 19b6f1e4bb726b8fc9a20915c0ac216e599b7a55 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 2 Dec 2024 18:57:45 +0100 Subject: [PATCH 41/42] tests: add precursor charge to the test spectra --- 
tests/unit/metabolomics/test_spectrum.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index e643e1b7..e81bec30 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -73,14 +73,7 @@ def test_has_strain(): def test_to_dict(): """Test the to_dict method.""" - spec = Spectrum( - id="spec1", - mz=[100.0, 200.0], - intensity=[0.1, 0.2], - precursor_mz=150.0, - rt=0.0, - metadata={"info": "test"}, - ) + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1, 0, {"info": "test"}) spec.strains.add(Strain("strain1")) spec.strains.add(Strain("strain2")) @@ -88,7 +81,7 @@ def test_to_dict(): assert dict_repr["spectrum_id"] == "spec1" assert dict_repr["num_strains_with_spectrum"] == 2 assert dict_repr["precursor_mz"] == 150.0 - assert dict_repr["rt"] == 0.0 + assert dict_repr["rt"] == 0 assert dict_repr["molecular_family"] is None assert dict_repr["gnps_id"] is None assert dict_repr["gnps_annotations"] == dict() @@ -124,7 +117,7 @@ def test__to_string(): def test_to_tabular(): """Test the to_tabular method.""" - spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"}) + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1, 0, {"info": "test"}) spec.strains.add(Strain("strain1")) spec.strains.add(Strain("strain2")) From 40391fe7a3a71390cf71165ce26c2351056c4359 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 4 Dec 2024 09:52:00 +0100 Subject: [PATCH 42/42] Update src/nplinker/metabolomics/spectrum.py --- src/nplinker/metabolomics/spectrum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index af4a401c..db9b4c3f 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -138,7 +138,7 @@ def to_tabular(self) -> dict[str, str]: """Convert the Spectrum 
object to a tabular format. Returns: - dict: A dictionary representing the BGC object in tabular format. + dict: A dictionary representing the Spectrum object in tabular format. The keys can be treated as headers and values are strings in which tabs are removed. This dict can be exported as a TSV file. """