Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare genome metadata with datasets #326

Merged
merged 19 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions pipelines/nextflow/modules/genome_metadata/accession_metadata.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

process ACCESSION_METADATA {
tag "$input_json"
label 'local'

input:
path(input_json)

output:
tuple env(accession), path(input_json)

shell:
'''
accession=$(jq -r '.["assembly"]["accession"]' !{input_json})
'''
}
40 changes: 40 additions & 0 deletions pipelines/nextflow/modules/genome_metadata/datasets_metadata.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

process DATASETS_METADATA {
tag "$accession"
label 'local'
label 'cached'

input:
val(accession)

output:
tuple val(accession), path("ncbi_meta.json")

shell:
'''
datasets summary genome accession !{accession} | jq '.' > ncbi_meta.json
if [ ! -s "ncbi_meta.json" ]; then
echo "No Metadata from datasets for !{accession}"
exit 1
fi
'''

stub:
"""
echo '{"reports":[{"organism":{"tax_id":1000}}]}' > ncbi_meta.json
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,18 @@
// limitations under the License.

process PREPARE_GENOME_METADATA {
tag "$accession"
label 'local'

input:
path(input_json, stageAs: "input_genome.json")
tuple val(accession), path(input_json), path(ncbi_json)

output:
path ("genome.json"), emit: genomic_dataset
tuple val(accession), path("genome.json"), emit: genomic_dataset

shell:
output_json = "genome.json"
'''
genome_metadata_prepare --input_file !{input_json} --output_file !{output_json}
genome_metadata_prepare --input_file !{input_json} --output_file !{output_json} --ncbi_meta !{ncbi_json}
'''

stub:
output_json = "genome.json"
"""
genome_metadata_prepare --input_file $input_json --output_file $output_json --mock_run
"""
}
14 changes: 11 additions & 3 deletions pipelines/nextflow/subworkflows/genome_prepare/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,21 @@ workflow GENOME_PREPARE {
amended_genome = AMEND_GENOME_DATA(genome_data_files).amended_json

// Group files
prepared_files = amended_genome.concat(
prepared_files = amended_genome.mix(
seq_region,
fasta_dna,
gene_models,
functional_annotation,
fasta_pep,
seq_region,
fasta_dna
)
.map{ meta, file ->
key = meta
if (meta["has_annotation"]) {
key = groupKey(meta, 6)
} else {
key = groupKey(meta, 3)
}
[key, file] }
.groupTuple()

// Checks and generate sequence stats for manifest
Expand Down
15 changes: 13 additions & 2 deletions pipelines/nextflow/workflows/genome_prepare/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ if (params.brc_mode) {
include { GENOME_PREPARE } from '../../subworkflows/genome_prepare/main.nf'
// Import module
include { PREPARE_GENOME_METADATA } from '../../modules/genome_metadata/prepare_genome_metadata.nf'
include { DATASETS_METADATA } from '../../modules/genome_metadata/datasets_metadata.nf'
include { ACCESSION_METADATA } from '../../modules/genome_metadata/accession_metadata.nf'
// Utilities
include { read_json } from '../../modules/utils/utils.nf'

Expand All @@ -49,22 +51,31 @@ def meta_from_genome_json(json_path) {
prod_name = data.species.production_name
publish_dir = prod_name
}
has_annotation = false
if (data.annotation) {
has_annotation = true
}

return [
// id: prod_name,
accession: data.assembly.accession,
production_name: prod_name,
publish_dir: publish_dir,
prefix: "",
has_annotation: has_annotation,
]
}

// Run main workflow
workflow {
ch_genome_json = Channel.fromPath("${params.input_dir}/*.json", checkIfExists: true)
PREPARE_GENOME_METADATA(ch_genome_json)
accession_meta = ACCESSION_METADATA(ch_genome_json)
accession_val = accession_meta.map{ accession, meta_file -> accession }
dataset_report = DATASETS_METADATA(accession_val)
PREPARE_GENOME_METADATA(accession_meta.join(dataset_report))

PREPARE_GENOME_METADATA.out.genomic_dataset
.map{ json_file -> tuple(meta_from_genome_json(json_file), json_file) }
.map{ accession, json_file -> tuple(meta_from_genome_json(json_file), json_file) }
.set { genome_metadata }

GENOME_PREPARE(genome_metadata)
Expand Down
134 changes: 30 additions & 104 deletions src/python/ensembl/io/genomio/genome_metadata/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,15 @@
"add_assembly_version",
"add_genebuild_metadata",
"add_species_metadata",
"get_taxonomy_from_accession",
"prepare_genome_metadata",
"PROVIDER_DATA",
"DEFAULT_API_URL",
"MissingNodeError",
"MetadataError",
]

import datetime
from os import PathLike
from typing import Any, Dict, Optional
from xml.etree import ElementTree
from xml.etree.ElementTree import Element

import requests
from typing import Dict

from ensembl.io.genomio.utils import get_json, print_json
from ensembl.utils.argparse import ArgumentParser
Expand Down Expand Up @@ -64,7 +58,6 @@
},
},
}
DEFAULT_API_URL = "https://www.ebi.ac.uk/ena/browser/api/xml"


class MissingNodeError(Exception):
Expand All @@ -75,15 +68,15 @@ class MetadataError(Exception):
"""When a metadata value is not expected."""


def add_provider(genome_metadata: Dict, gff3_file: Optional[PathLike] = None) -> None:
def add_provider(genome_metadata: Dict, ncbi_data: Dict) -> None:
"""Updates the genome metadata adding provider information for assembly and gene models.

Assembly provider metadata will only be added if it is missing, i.e. neither `"provider_name"` or
`"provider_url"` are present. The gene model metadata will only be added if `gff3_file` is provided.

Args:
genome_data: Genome information of assembly, accession and annotation.
gff3_file: Path to GFF3 file to use as annotation source for this genome.
ncbi_data: Report data from datasets for this accession.

Raises:
MetadataError: If accession's format in genome metadata does not match with a known provider.
Expand All @@ -104,7 +97,7 @@ def add_provider(genome_metadata: Dict, gff3_file: Optional[PathLike] = None) ->
assembly["provider_url"] = f'{provider["assembly"]["provider_url"]}/{accession}'

# Add annotation provider if there are gene models
if gff3_file:
if "annotation_info" in ncbi_data:
annotation = genome_metadata.setdefault("annotation", {})
if ("provider_name" not in annotation) and ("provider_url" not in annotation):
annotation["provider_name"] = provider["annotation"]["provider_name"]
Expand Down Expand Up @@ -141,95 +134,35 @@ def add_genebuild_metadata(genome_data: Dict) -> None:
genebuild["start_date"] = current_date


def add_species_metadata(genome_metadata: Dict, base_api_url: str = DEFAULT_API_URL) -> None:
"""Adds missing species metadata from its taxonomy based on the genome's accession.

If `"taxonomy_id"` is already present in the species metadata, nothing is added. The `"taxonomy_id"`,
`"scientific_name"` and `"strain"` will be fetched from the taxonomy information linked to the given
accession.
def add_species_metadata(genome_metadata: Dict, ncbi_data: Dict) -> None:
"""Adds tax_id and name from the NCBI dataset report.

Args:
genome_metadata: Genome information of assembly, accession and annotation.
base_api_url: Base API URL to fetch the taxonomy data from.
ncbi_data: Report data from datasets for this accession.

"""
species = genome_metadata.setdefault("species", {})
if not "taxonomy_id" in species:
accession = genome_metadata["assembly"]["accession"]
taxonomy = get_taxonomy_from_accession(accession, base_api_url)
species["taxonomy_id"] = taxonomy["taxon_id"]
if (not "strain" in species) and ("strain" in taxonomy):
species["strain"] = taxonomy["strain"]
if not "scientific_name" in species:
species["scientific_name"] = taxonomy["scientific_name"]


def get_taxonomy_from_accession(accession: str, base_api_url: str = DEFAULT_API_URL) -> Dict[str, Any]:
"""Returns the taxonomy metadata associated to the given accession.

Args:
accession: INSDC accession ID.
base_api_url: Base API URL to fetch the taxonomy data from.

Returns:
Dictionary with key-value pairs for `"taxon_id"` and `"scientific_name"`. `"strain"` will also be
included if it is present in the fetched taxonomy data.

Raises:
MissingNodeError: If `"TAXON"` node is missing in the taxonomy data fetched.

"""
# Use the GenBank accession without version
gb_accession = accession.replace("GCF", "GCA").split(".")[0]
if not base_api_url.endswith("/"):
base_api_url += "/"
response = requests.get(f"{base_api_url}{gb_accession}", timeout=60)
response.raise_for_status()
entry = ElementTree.fromstring(response.text)
taxon_node = entry.find(".//TAXON")
if taxon_node is None:
raise MissingNodeError("Cannot find the TAXON node")
# Fetch taxon ID, scientific_name and strain
taxon_id = _get_node_text(taxon_node, "TAXON_ID")
scientific_name = _get_node_text(taxon_node, "SCIENTIFIC_NAME")
strain = _get_node_text(taxon_node, "STRAIN", optional=True)
taxonomy = {
# Ignore arg-type check in the following line since taxon_id cannot be None
"taxon_id": int(taxon_id), # type: ignore[arg-type]
"scientific_name": scientific_name,
}
if strain:
taxonomy["strain"] = strain
return taxonomy


def _get_node_text(node: Optional[Element], tag: str, optional: bool = False) -> Optional[str]:
"""Returns the value of the field matching the provided tag inside `node`.

If the tag is not present and `optional` is True, returns `None` instead.
try:
organism = ncbi_data["organism"]
except KeyError:
return

Args:
node: Node of an XML tree.
tag: Tag to fetch within the node.
optional: Do not raise an exception if the tag does not exist.
if "tax_id" in organism:
species.setdefault("taxonomy_id", organism["tax_id"])
if "organism_name" in organism:
species.setdefault("scientific_name", organism["organism_name"])

Raises:
MissingNodeError: If no node is provided or if the tag is missing (if `optional == False`).
"""
if node is None:
raise MissingNodeError(f"No node provided to look for '{tag}'")
tag_text = node.findtext(tag)
if not optional and (tag_text is None):
raise MissingNodeError(f"No node found for tag '{tag}'")
return tag_text
try:
species.setdefault("strain", organism["infraspecific_names"]["strain"])
except KeyError:
pass


def prepare_genome_metadata(
input_file: PathLike,
output_file: PathLike,
gff3_file: Optional[PathLike] = None,
base_api_url: str = DEFAULT_API_URL,
mock_run: bool = False,
ncbi_meta: PathLike,
) -> None:
"""Updates the genome metadata JSON file with additional information.

Expand All @@ -239,20 +172,19 @@ def prepare_genome_metadata(
Args:
input_file: Path to JSON file with genome metadata.
output_file: Output directory where to generate the final `genome.json` file.
gff3_file: Path to GFF3 file to use as annotation source for this genome.
base_api_url: Base API URL to fetch the taxonomy data from.
mock_run: Do not call external taxonomy service.
ncbi_meta: JSON file from NCBI datasets for the accession.

"""
genome_data = get_json(input_file)
ncbi_data = {}
if ncbi_meta:
ncbi_data = get_json(ncbi_meta)["reports"][0]

# Amend any missing metadata
add_provider(genome_data, gff3_file)
add_provider(genome_data, ncbi_data)
add_assembly_version(genome_data)
add_genebuild_metadata(genome_data)
if mock_run:
genome_data["species"].setdefault("taxonomy_id", 9999999)
else:
add_species_metadata(genome_data, base_api_url)
add_species_metadata(genome_data, ncbi_data)
# Dump updated genome metadata
print_json(output_file, genome_data)

Expand All @@ -264,19 +196,13 @@ def main() -> None:
parser.add_argument_dst_path(
"--output_file", required=True, help="Output path for the new genome metadata file"
)
parser.add_argument_src_path("--gff3_file", help="GFF3 file to use as annotation source")
parser.add_argument(
"--base_api_url", default=DEFAULT_API_URL, help="API URL to fetch the taxonomy data from"
parser.add_argument_src_path(
"--ncbi_meta", required=True, help="JSON file from NCBI datasets for this accession."
)
parser.add_argument("--mock_run", action="store_true", help="Do not call external APIs")
parser.add_log_arguments()
args = parser.parse_args()
init_logging_with_args(args)

prepare_genome_metadata(
input_file=args.input_file,
output_file=args.output_file,
gff3_file=args.gff3_file,
base_api_url=args.base_api_url,
mock_run=args.mock_run,
input_file=args.input_file, output_file=args.output_file, ncbi_meta=args.ncbi_meta
)
Loading