Ensembl · MatBarba · Mar 22, 2024 · Mar 19, 2024 · Mar 19, 2024 · Mar 19, 2024
diff --git a/pipelines/nextflow/modules/genome_metadata/accession_metadata.nf b/pipelines/nextflow/modules/genome_metadata/accession_metadata.nf
@@ -0,0 +1,30 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+process ACCESSION_METADATA {
+    tag "$input_json"
+    label 'local'
+
+    input:
+        path(input_json)
+
+    output:
+        tuple env(accession), path(input_json)
+
+    shell:
+        '''
+        accession=$(jq -r '.["assembly"]["accession"]' !{input_json})
+        '''
+}
diff --git a/pipelines/nextflow/modules/genome_metadata/datasets_metadata.nf b/pipelines/nextflow/modules/genome_metadata/datasets_metadata.nf
@@ -0,0 +1,40 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+process DATASETS_METADATA {
+    tag "$accession"
+    label 'local'
+    label 'cached'
+
+    input:
+        val(accession)
+
+    output:
+        tuple val(accession), path("ncbi_meta.json")
+
+    shell:
+        '''
+        datasets summary genome accession !{accession} | jq '.' > ncbi_meta.json
+        if [ ! -s "ncbi_meta.json" ]; then
+            echo "No Metadata from datasets for !{accession}"
+            exit 1
+        fi
+        '''
+
+    stub:
+        """
+        echo '{"reports":[{"organism":{"tax_id":1000}}]}' > ncbi_meta.json
+        """
+}
diff --git a/pipelines/nextflow/modules/genome_metadata/prepare_genome_metadata.nf b/pipelines/nextflow/modules/genome_metadata/prepare_genome_metadata.nf
@@ -14,23 +14,18 @@
 // limitations under the License.
 
 process PREPARE_GENOME_METADATA {
+    tag "$accession"
     label 'local'
 
     input:
-        path(input_json, stageAs: "input_genome.json")
+        tuple val(accession), path(input_json), path(ncbi_json)
 
     output:
-        path ("genome.json"), emit: genomic_dataset
+        tuple val(accession), path("genome.json"), emit: genomic_dataset
 
     shell:
     output_json = "genome.json"
     '''
-    genome_metadata_prepare --input_file !{input_json} --output_file !{output_json}
+    genome_metadata_prepare --input_file !{input_json} --output_file !{output_json} --ncbi_meta !{ncbi_json}
     '''
-
-    stub:
-        output_json = "genome.json"
-        """
-        genome_metadata_prepare --input_file $input_json --output_file $output_json --mock_run
-        """
 }
diff --git a/pipelines/nextflow/subworkflows/genome_prepare/main.nf b/pipelines/nextflow/subworkflows/genome_prepare/main.nf
@@ -77,13 +77,21 @@ workflow GENOME_PREPARE {
         amended_genome = AMEND_GENOME_DATA(genome_data_files).amended_json
 
         // Group files
-        prepared_files = amended_genome.concat(
+        prepared_files = amended_genome.mix(
+                seq_region,
+                fasta_dna,
                 gene_models,
                 functional_annotation,
                 fasta_pep,
-                seq_region,
-                fasta_dna
             )
+            .map{ meta, file ->
+                key = meta
+                if (meta["has_annotation"]) {
+                    key = groupKey(meta, 6)
+                } else {
+                    key = groupKey(meta, 3)
+                }
+                [key, file] }
             .groupTuple()
 
         // Checks and generate sequence stats for manifest

diff --git a/pipelines/nextflow/workflows/genome_prepare/main.nf b/pipelines/nextflow/workflows/genome_prepare/main.nf
@@ -32,6 +32,8 @@ if (params.brc_mode) {
 include { GENOME_PREPARE } from '../../subworkflows/genome_prepare/main.nf'
 // Import module
 include { PREPARE_GENOME_METADATA } from '../../modules/genome_metadata/prepare_genome_metadata.nf'
+include { DATASETS_METADATA } from '../../modules/genome_metadata/datasets_metadata.nf'
+include { ACCESSION_METADATA } from '../../modules/genome_metadata/accession_metadata.nf'
 // Utilities
 include { read_json } from '../../modules/utils/utils.nf'
 
@@ -49,22 +51,31 @@ def meta_from_genome_json(json_path) {
         prod_name = data.species.production_name
         publish_dir = prod_name
     }
+    has_annotation = false
+    if (data.annotation) {
+        has_annotation = true
+    }
 
     return [
+        // id: prod_name,
         accession: data.assembly.accession,
         production_name: prod_name,
         publish_dir: publish_dir,
         prefix: "",
+        has_annotation: has_annotation,
     ]
 }
 
 // Run main workflow
 workflow {
     ch_genome_json = Channel.fromPath("${params.input_dir}/*.json", checkIfExists: true)
-    PREPARE_GENOME_METADATA(ch_genome_json)
+    accession_meta = ACCESSION_METADATA(ch_genome_json)
+    accession_val = accession_meta.map{ accession, meta_file -> accession }
+    dataset_report = DATASETS_METADATA(accession_val)
+    PREPARE_GENOME_METADATA(accession_meta.join(dataset_report))
 
     PREPARE_GENOME_METADATA.out.genomic_dataset
-        .map{ json_file -> tuple(meta_from_genome_json(json_file), json_file) }
+        .map{ accession, json_file -> tuple(meta_from_genome_json(json_file), json_file) }
         .set { genome_metadata }
 
     GENOME_PREPARE(genome_metadata)

diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py
@@ -21,21 +21,15 @@
     "add_assembly_version",
     "add_genebuild_metadata",
     "add_species_metadata",
-    "get_taxonomy_from_accession",
     "prepare_genome_metadata",
     "PROVIDER_DATA",
-    "DEFAULT_API_URL",
     "MissingNodeError",
     "MetadataError",
 ]
 
 import datetime
 from os import PathLike
-from typing import Any, Dict, Optional
-from xml.etree import ElementTree
-from xml.etree.ElementTree import Element
-
-import requests
+from typing import Dict
 
 from ensembl.io.genomio.utils import get_json, print_json
 from ensembl.utils.argparse import ArgumentParser
@@ -64,7 +58,6 @@
         },
     },
 }
-DEFAULT_API_URL = "https://www.ebi.ac.uk/ena/browser/api/xml"
 
 
 class MissingNodeError(Exception):
@@ -75,15 +68,15 @@ class MetadataError(Exception):
     """When a metadata value is not expected."""
 
 
-def add_provider(genome_metadata: Dict, gff3_file: Optional[PathLike] = None) -> None:
+def add_provider(genome_metadata: Dict, ncbi_data: Dict) -> None:
     """Updates the genome metadata adding provider information for assembly and gene models.
 
     Assembly provider metadata will only be added if it is missing, i.e. neither `"provider_name"` or
     `"provider_url"` are present. The gene model metadata will only be added if `gff3_file` is provided.
 
     Args:
         genome_data: Genome information of assembly, accession and annotation.
-        gff3_file: Path to GFF3 file to use as annotation source for this genome.
+        ncbi_data: Report data from datasets for this accession.
 
     Raises:
         MetadataError: If accession's format in genome metadata does not match with a known provider.
@@ -104,7 +97,7 @@ def add_provider(genome_metadata: Dict, gff3_file: Optional[PathLike] = None) ->
         assembly["provider_url"] = f'{provider["assembly"]["provider_url"]}/{accession}'
 
     # Add annotation provider if there are gene models
-    if gff3_file:
+    if "annotation_info" in ncbi_data:
         annotation = genome_metadata.setdefault("annotation", {})
         if ("provider_name" not in annotation) and ("provider_url" not in annotation):
             annotation["provider_name"] = provider["annotation"]["provider_name"]
@@ -141,95 +134,35 @@ def add_genebuild_metadata(genome_data: Dict) -> None:
         genebuild["start_date"] = current_date
 
 
-def add_species_metadata(genome_metadata: Dict, base_api_url: str = DEFAULT_API_URL) -> None:
-    """Adds missing species metadata from its taxonomy based on the genome's accession.
-
-    If `"taxonomy_id"` is already present in the species metadata, nothing is added. The `"taxonomy_id"`,
-    `"scientific_name"` and `"strain"` will be fetched from the taxonomy information linked to the given
-    accession.
+def add_species_metadata(genome_metadata: Dict, ncbi_data: Dict) -> None:
+    """Adds tax_id and name from the NCBI dataset report.
 
     Args:
         genome_metadata: Genome information of assembly, accession and annotation.
-        base_api_url: Base API URL to fetch the taxonomy data from.
+        ncbi_data: Report data from datasets for this accession.
 
     """
     species = genome_metadata.setdefault("species", {})
-    if not "taxonomy_id" in species:
-        accession = genome_metadata["assembly"]["accession"]
-        taxonomy = get_taxonomy_from_accession(accession, base_api_url)
-        species["taxonomy_id"] = taxonomy["taxon_id"]
-        if (not "strain" in species) and ("strain" in taxonomy):
-            species["strain"] = taxonomy["strain"]
-        if not "scientific_name" in species:
-            species["scientific_name"] = taxonomy["scientific_name"]
-
-
-def get_taxonomy_from_accession(accession: str, base_api_url: str = DEFAULT_API_URL) -> Dict[str, Any]:
-    """Returns the taxonomy metadata associated to the given accession.
-
-    Args:
-        accession: INSDC accession ID.
-        base_api_url: Base API URL to fetch the taxonomy data from.
-
-    Returns:
-        Dictionary with key-value pairs for `"taxon_id"` and `"scientific_name"`. `"strain"` will also be
-        included if it is present in the fetched taxonomy data.
-
-    Raises:
-        MissingNodeError: If `"TAXON"` node is missing in the taxonomy data fetched.
-
-    """
-    # Use the GenBank accession without version
-    gb_accession = accession.replace("GCF", "GCA").split(".")[0]
-    if not base_api_url.endswith("/"):
-        base_api_url += "/"
-    response = requests.get(f"{base_api_url}{gb_accession}", timeout=60)
-    response.raise_for_status()
-    entry = ElementTree.fromstring(response.text)
-    taxon_node = entry.find(".//TAXON")
-    if taxon_node is None:
-        raise MissingNodeError("Cannot find the TAXON node")
-    # Fetch taxon ID, scientific_name and strain
-    taxon_id = _get_node_text(taxon_node, "TAXON_ID")
-    scientific_name = _get_node_text(taxon_node, "SCIENTIFIC_NAME")
-    strain = _get_node_text(taxon_node, "STRAIN", optional=True)
-    taxonomy = {
-        # Ignore arg-type check in the following line since taxon_id cannot be None
-        "taxon_id": int(taxon_id),  # type: ignore[arg-type]
-        "scientific_name": scientific_name,
-    }
-    if strain:
-        taxonomy["strain"] = strain
-    return taxonomy
-
-
-def _get_node_text(node: Optional[Element], tag: str, optional: bool = False) -> Optional[str]:
-    """Returns the value of the field matching the provided tag inside `node`.
-
-    If the tag is not present and `optional` is True, returns `None` instead.
+    try:
+        organism = ncbi_data["organism"]
+    except KeyError:
+        return
 
-    Args:
-        node: Node of an XML tree.
-        tag: Tag to fetch within the node.
-        optional: Do not raise an exception if the tag does not exist.
+    if "tax_id" in organism:
+        species.setdefault("taxonomy_id", organism["tax_id"])
+    if "organism_name" in organism:
+        species.setdefault("scientific_name", organism["organism_name"])
 
-    Raises:
-        MissingNodeError: If no node is provided or if the tag is missing (if `optional == False`).
-    """
-    if node is None:
-        raise MissingNodeError(f"No node provided to look for '{tag}'")
-    tag_text = node.findtext(tag)
-    if not optional and (tag_text is None):
-        raise MissingNodeError(f"No node found for tag '{tag}'")
-    return tag_text
+    try:
+        species.setdefault("strain", organism["infraspecific_names"]["strain"])
+    except KeyError:
+        pass
 
 
 def prepare_genome_metadata(
     input_file: PathLike,
     output_file: PathLike,
-    gff3_file: Optional[PathLike] = None,
-    base_api_url: str = DEFAULT_API_URL,
-    mock_run: bool = False,
+    ncbi_meta: PathLike,
 ) -> None:
     """Updates the genome metadata JSON file with additional information.
 
@@ -239,20 +172,19 @@ def prepare_genome_metadata(
     Args:
         input_file: Path to JSON file with genome metadata.
         output_file: Output directory where to generate the final `genome.json` file.
-        gff3_file: Path to GFF3 file to use as annotation source for this genome.
-        base_api_url: Base API URL to fetch the taxonomy data from.
-        mock_run: Do not call external taxonomy service.
+        ncbi_meta: JSON file from NCBI datasets for the accession.
 
     """
     genome_data = get_json(input_file)
+    ncbi_data = {}
+    if ncbi_meta:
+        ncbi_data = get_json(ncbi_meta)["reports"][0]
+
     # Amend any missing metadata
-    add_provider(genome_data, gff3_file)
+    add_provider(genome_data, ncbi_data)
     add_assembly_version(genome_data)
     add_genebuild_metadata(genome_data)
-    if mock_run:
-        genome_data["species"].setdefault("taxonomy_id", 9999999)
-    else:
-        add_species_metadata(genome_data, base_api_url)
+    add_species_metadata(genome_data, ncbi_data)
     # Dump updated genome metadata
     print_json(output_file, genome_data)
 
@@ -264,19 +196,13 @@ def main() -> None:
     parser.add_argument_dst_path(
         "--output_file", required=True, help="Output path for the new genome metadata file"
     )
-    parser.add_argument_src_path("--gff3_file", help="GFF3 file to use as annotation source")
-    parser.add_argument(
-        "--base_api_url", default=DEFAULT_API_URL, help="API URL to fetch the taxonomy data from"
+    parser.add_argument_src_path(
+        "--ncbi_meta", required=True, help="JSON file from NCBI datasets for this accession."
     )
-    parser.add_argument("--mock_run", action="store_true", help="Do not call external APIs")
     parser.add_log_arguments()
     args = parser.parse_args()
     init_logging_with_args(args)
 
     prepare_genome_metadata(
-        input_file=args.input_file,
-        output_file=args.output_file,
-        gff3_file=args.gff3_file,
-        base_api_url=args.base_api_url,
-        mock_run=args.mock_run,
+        input_file=args.input_file, output_file=args.output_file, ncbi_meta=args.ncbi_meta
     )