diff --git a/pipelines/nextflow/modules/genome_metadata/amend_genome_data.nf b/pipelines/nextflow/modules/genome_metadata/amend_genome_data.nf index 15c416f42..4e0df9cc9 100644 --- a/pipelines/nextflow/modules/genome_metadata/amend_genome_data.nf +++ b/pipelines/nextflow/modules/genome_metadata/amend_genome_data.nf @@ -29,7 +29,7 @@ process AMEND_GENOME_DATA { ''' genome_metadata_extend --genome_infile !{genome_json} \ --report_file !{asm_report} \ - --genbank_infile !{genbank_gbff} \ + --genbank_file !{genbank_gbff} \ --genome_outfile !{output} schemas_json_validate --json_file !{output} --json_schema genome diff --git a/src/python/ensembl/io/genomio/genome_metadata/extend.py b/src/python/ensembl/io/genomio/genome_metadata/extend.py index 22c102508..7be556952 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/extend.py +++ b/src/python/ensembl/io/genomio/genome_metadata/extend.py @@ -12,21 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Add more metadata to the genome metadata file, including added seq_regions (e.g. MT chromosome).""" +"""Update a genome metadata file to include additional sequence regions (e.g. 
MT chromosome).""" __all__ = [ - "MissingDataError", "get_additions", "get_gbff_regions", "get_report_regions_names", - "amend_genomic_metadata", + "amend_genome_metadata", ] import csv from os import PathLike from pathlib import Path import re -from typing import List, Tuple, Optional +from typing import Dict, List, Tuple, Optional from Bio import SeqIO @@ -38,17 +37,7 @@ _VERSION_END = re.compile(r"\.\d+$") -class MissingDataError(Exception): - """Used if some data is missing from the report file.""" - - def __init__(self, report_path: PathLike, accession: str, msg: str): - report_msg = f"Can't get data for {accession} in report {report_path}" - if msg: - report_msg = f"{report_msg}: {msg}" - self.msg = report_msg - - -def get_additions(report_path: Path, gbff_path: Optional[Path]) -> List[str]: +def get_additions(report_path: PathLike, gbff_path: Optional[PathLike]) -> List[str]: """Returns all `seq_regions` that are mentioned in the report but that are not in the data. Args: @@ -57,81 +46,74 @@ def get_additions(report_path: Path, gbff_path: Optional[Path]) -> List[str]: """ gbff_regions = set(get_gbff_regions(gbff_path)) report_regions = get_report_regions_names(report_path) - additions = [] - for rep_seq in report_regions: - (rs_seq, gb_seq) = rep_seq - if rs_seq not in gbff_regions and gb_seq not in gbff_regions: - if rs_seq: - additions.append(rs_seq) + for seq_region_name in report_regions: + (genbank_seq_name, refseq_seq_name) = seq_region_name + if genbank_seq_name not in gbff_regions and refseq_seq_name not in gbff_regions: + if refseq_seq_name: + additions.append(refseq_seq_name) else: - additions.append(gb_seq) + additions.append(genbank_seq_name) additions = sorted(additions) return additions -def get_gbff_regions(gbff_path: Optional[Path]) -> List[str]: - """Returns the `seq_region` data from the GBFF file. +def get_gbff_regions(gbff_path: Optional[PathLike]) -> List[str]: + """Returns the `seq_region` data from a GBFF file. 
Args: - gbff_path: Gbff file path to use. + gbff_path: GBFF file path to use. """ - if not gbff_path: - return [] - seq_regions = [] - with open_gz_file(gbff_path) as gbff_file: - for record in SeqIO.parse(gbff_file, "genbank"): - record_id = re.sub(_VERSION_END, "", record.id) - seq_regions.append(record_id) + if gbff_path: + with open_gz_file(gbff_path) as gbff_file: + for record in SeqIO.parse(gbff_file, "genbank"): + record_id = re.sub(_VERSION_END, "", record.id) + seq_regions.append(record_id) return seq_regions -def _report_to_csv(report_path: Path) -> Tuple[str, dict]: - """Returns an assembly report as a CSV string, and the head metadata as a dict. +def _report_to_csv(report_path: PathLike) -> Tuple[str, Dict]: + """Returns the assembly report as a CSV string, and its metadata as a dictionary. Args: - report_path: Path to a `seq_region` file from INSDC/RefSeq. - + report_path: Path to the assembly report file from INSDC/RefSeq. """ data = "" metadata = {} - with report_path.open("r") as report: - last_head = "" + with Path(report_path).open("r") as report: + prev_line = "" for line in report: - # Ignore header if line.startswith("#"): # Get metadata values if possible - match = re.search("# (.+?): (.+?)$", line) + match = re.search(r"^#\s*([^:]+?):\s+(.+?)\s*$", line) if match: metadata[match.group(1)] = match.group(2) - last_head = line - continue - if last_head: - data += last_head[2:].strip() + "\n" - last_head = "" - data += line + prev_line = line + else: + if prev_line: + # Add previous line as header of CSV string, removing the initial "# " + data += prev_line[2:].strip() + "\n" + prev_line = "" + data += line return data, metadata -def get_report_regions_names(report_path: Path) -> List[Tuple[str, str]]: - """Returns a list of `seq_region` names from the report file. +def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]: + """Returns a list of GenBank-RefSeq `seq_region` names from the assembly report file. 
Args: - report_path: Path to the seq_regions report from INSDC/RefSeq. + report_path: Path to the assembly report file from INSDC/RefSeq. """ # Get the report in a CSV format, easier to manipulate report_csv, _ = _report_to_csv(report_path) - - # Feed the csv string to the CSV reader + # Feed the CSV string to the CSV reader reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE) - # Create the seq_regions seq_regions = [] for row in reader: refseq_name = row["RefSeq-Accn"] genbank_name = row["GenBank-Accn"] - if refseq_name == "na": refseq_name = "" if genbank_name == "na": @@ -142,28 +124,26 @@ def get_report_regions_names(report_path: Path) -> List[Tuple[str, str]]: return seq_regions -def amend_genomic_metadata( +def amend_genome_metadata( genome_infile: PathLike, genome_outfile: PathLike, report_file: Optional[PathLike] = None, - genbank_infile: Optional[PathLike] = None, + genbank_file: Optional[PathLike] = None, ) -> None: """ Args: - genome_infile: Genome data following the src/python/ensembl/io/genomio/data/schemas/genome.json. - genome_outfile: Amended genome data file. + genome_infile: Genome metadata following the `src/python/ensembl/io/genomio/data/schemas/genome.json`. + genome_outfile: Amended genome metadata file. report_file: INSDC/RefSeq sequences report file. - genbank_infile: INSDC/RefSeq GBFF file. + genbank_file: INSDC/RefSeq GBFF file. 
""" genome_metadata = get_json(genome_infile) - # Get additional sequences in the assembly but not in the data if report_file: - gbff_path = Path(genbank_infile) if genbank_infile else None - additions = get_additions(Path(report_file), gbff_path) + genbank_path = Path(genbank_file) if genbank_file else None + additions = get_additions(report_file, genbank_path) if additions: genome_metadata["added_seq"] = {"region_name": additions} - # Print out the file genome_outfile = Path(genome_outfile) print_json(genome_outfile, genome_metadata) @@ -171,26 +151,24 @@ def amend_genomic_metadata( def main() -> None: """Module's entry-point.""" - parser = ArgumentParser( - description="Update genome metadata file to include additional sequence regions (e.g. MT chromosome)." - ) + parser = ArgumentParser(description=__doc__) parser.add_argument_src_path( "--genome_infile", required=True, - help="Input genome file (following the src/python/ensembl/io/genomio/data/schemas/genome.json)", + help="Input genome metadata file (following src/python/ensembl/io/genomio/data/schemas/genome.json)", ) parser.add_argument_dst_path( "--genome_outfile", required=True, help="Path to the new amended genome metadata file" ) parser.add_argument_src_path("--report_file", help="INSDC/RefSeq sequences report file") - parser.add_argument_src_path("--genbank_infile", help="INSDC/RefSeq GBFF file") + parser.add_argument_src_path("--genbank_file", help="INSDC/RefSeq GBFF file") parser.add_log_arguments() args = parser.parse_args() init_logging_with_args(args) - amend_genomic_metadata( + amend_genome_metadata( genome_infile=args.genome_infile, genome_outfile=args.genome_outfile, report_file=args.report_file, - genbank_infile=args.genbank_infile, + genbank_file=args.genbank_file, ) diff --git a/src/python/tests/genome_metadata/test_extend.py b/src/python/tests/genome_metadata/test_extend.py new file mode 100644 index 000000000..7ba9c615d --- /dev/null +++ b/src/python/tests/genome_metadata/test_extend.py @@ 
-0,0 +1,181 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.genome_metadata.extend` module. + +Typical usage example:: + $ pytest test_extend.py + +""" + +from pathlib import Path +from typing import Callable, Dict, List, Tuple + +from deepdiff import DeepDiff +import pytest + +from ensembl.io.genomio.genome_metadata import extend + + +@pytest.mark.dependency(name="test_get_gbff_regions") +@pytest.mark.parametrize( + "gbff_file, output", + [ + pytest.param("", [], id="No GBFF file"), + pytest.param("sequences.gbff", ["CP089274", "CP089275", "RefChr0002"], id="sequences.gbff"), + ], +) +def test_get_gbff_regions(data_dir: Path, gbff_file: str, output: List[str]) -> None: + """Tests the `extend.get_gbff_regions()` method. + + Args: + data_dir: Module's test data directory fixture. + gbff_file: GBFF file name. + output: Expected list of sequence region IDs. 
+ """ + if gbff_file: + gbff_path = data_dir / gbff_file + else: + gbff_path = None + result = extend.get_gbff_regions(gbff_path) + assert result == output + + +@pytest.mark.dependency(name="test_report_to_csv") +@pytest.mark.parametrize( + "report_file, output", + [ + pytest.param( + "no_metadata_report.txt", + ("1\t1\tChromosome\tCP089274.1\tRefChr0001.1\t5935961", {}), + id="no_metadata_report.txt", + ), + pytest.param( + "assembly_report.txt", + ( + ( + "Name\tMolecule\tLocation\tGenBank-Accn\tRefSeq-Accn\tLength\n" + "1\t1\tChromosome\tCP089274.1\tRefChr0001.1\t5935961\n" + "2\t2\tChromosome\tCP089275.1\tna\t5880203\n3\t3\tChromosome\tna\tRefChr0002.1\t5901247" + ), + {"Assembly name": "GCA000002765", "Organism name": "Plasmodium falciparum", "Taxid": "36329"}, + ), + id="assembly_report.txt", + ), + ], +) +def test_report_to_csv(data_dir: Path, report_file: str, output: Tuple[str, Dict]) -> None: + """Tests the `extend._report_to_csv()` method. + + Args: + data_dir: Module's test data directory fixture. + report_file: Assembly report file name. + output: Expected returned value for the given assembly report file. + """ + report_path = data_dir / report_file + result = extend._report_to_csv(report_path) + assert result[0] == output[0] + assert not DeepDiff(result[1], output[1]) + + +@pytest.mark.dependency(name="test_get_report_regions_names", depends=["test_report_to_csv"]) +@pytest.mark.parametrize( + "report_file, output", + [ + pytest.param( + "assembly_report.txt", + [("CP089274", "RefChr0001"), ("CP089275", ""), ("", "RefChr0002")], + id="assembly_report.txt", + ), + ], +) +def test_get_report_regions_names(data_dir: Path, report_file: str, output: List[Tuple[str, str]]) -> None: + """Tests the `extend.get_report_regions_names()` method. + + Args: + data_dir: Module's test data directory fixture. + report_file: Assembly report file name. + output: Expected returned value for the given assembly report file. 
+ """ + report_path = data_dir / report_file + result = extend.get_report_regions_names(report_path) + assert result == output + + +@pytest.mark.dependency( + name="test_get_additions", depends=["test_get_gbff_regions", "test_get_report_regions_names"] +) +@pytest.mark.parametrize( + "report_file, gbff_file, output", + [ + pytest.param( + "assembly_report.txt", "", ["CP089275", "RefChr0001", "RefChr0002"], id="Additional regions found" + ), + pytest.param("assembly_report.txt", "sequences.gbff", [], id="No additional regions"), + ], +) +def test_get_additions(data_dir: Path, report_file: str, gbff_file: str, output: List[str]) -> None: + """Tests the `extend.get_additions()` method. + + Args: + data_dir: Module's test data directory fixture. + report_file: Assembly report file name. + gbff_file: GBFF file name. + output: Expected sequence region names that need to be added. + """ + report_path = data_dir / report_file + gbff_path = data_dir / gbff_file if gbff_file else None + result = extend.get_additions(report_path, gbff_path) + assert result == output + + +@pytest.mark.dependency(depends=["test_get_additions"]) +@pytest.mark.parametrize( + "genome_infile, report_file, genbank_file, output_file", + [ + pytest.param("genome.json", "", "", "genome.json", id="No report file"), + pytest.param( + "genome.json", "assembly_report.txt", "", "updated_genome.json", id="Additional seq regions" + ), + pytest.param( + "genome.json", "assembly_report.txt", "sequences.gbff", "genome.json", id="No additional regions" + ), + ], +) +def test_amend_genome_metadata( + tmp_path: Path, + data_dir: Path, + assert_files: Callable[[Path, Path], None], + genome_infile: str, + report_file: str, + genbank_file: str, + output_file: str, +) -> None: + """Tests the `extend.amend_genome_metadata()` method. + + Args: + tmp_path: Test's unique temporary directory fixture. + data_dir: Module's test data directory fixture. + assert_files: File diff assertion fixture.
+ genome_infile: Input genome metadata file. + report_file: INSDC/RefSeq sequences report file. + genbank_file: INSDC/RefSeq GBFF file. + output_file: Expected amended genome metadata file. + """ + genome_inpath = data_dir / genome_infile + report_path = data_dir / report_file if report_file else None + genbank_path = data_dir / genbank_file if genbank_file else None + genome_outpath = tmp_path / "genome.out" + extend.amend_genome_metadata(genome_inpath, genome_outpath, report_path, genbank_path) + assert_files(genome_outpath, data_dir / output_file) diff --git a/src/python/tests/genome_metadata/test_extend/assembly_report.txt b/src/python/tests/genome_metadata/test_extend/assembly_report.txt new file mode 100644 index 000000000..ed0234104 --- /dev/null +++ b/src/python/tests/genome_metadata/test_extend/assembly_report.txt @@ -0,0 +1,8 @@ +# Assembly name: GCA000002765 +# Organism name: Plasmodium falciparum +# Taxid: 36329 +# +# Name Molecule Location GenBank-Accn RefSeq-Accn Length +1 1 Chromosome CP089274.1 RefChr0001.1 5935961 +2 2 Chromosome CP089275.1 na 5880203 +3 3 Chromosome na RefChr0002.1 5901247 \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_extend/genome.json b/src/python/tests/genome_metadata/test_extend/genome.json new file mode 100644 index 000000000..77d2f895d --- /dev/null +++ b/src/python/tests/genome_metadata/test_extend/genome.json @@ -0,0 +1,20 @@ +{ + "BRC4": { + "component": "PlasmoDB", + "organism_abbrev": "pfal3D7" + }, + "assembly": { + "accession": "GCA_000002765.1", + "provider_name": "RefSeq", + "provider_url": "https://www.ncbi.nlm.nih.gov/refseq", + "version": 1 + }, + "genebuild": { + "start_date": "2023-10-17", + "version": "2023-10-17" + }, + "species": { + "scientific_name": "Plasmodium falciparum", + "taxonomy_id": 36329 + } +} \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_extend/no_metadata_report.txt 
b/src/python/tests/genome_metadata/test_extend/no_metadata_report.txt new file mode 100644 index 000000000..9ad47af49 --- /dev/null +++ b/src/python/tests/genome_metadata/test_extend/no_metadata_report.txt @@ -0,0 +1 @@ +1 1 Chromosome CP089274.1 RefChr0001.1 5935961 \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_extend/sequences.gbff b/src/python/tests/genome_metadata/test_extend/sequences.gbff new file mode 100644 index 000000000..411fb75d4 --- /dev/null +++ b/src/python/tests/genome_metadata/test_extend/sequences.gbff @@ -0,0 +1,116 @@ +LOCUS CP089274 1047 bp DNA linear INV 14-OCT-2019 +DEFINITION Plasmodium falciparum 3D7 genome assembly, chromosome: MIT. +ACCESSION CP089274 +VERSION CP089274.1 +DBLINK BioProject: PRJNA13173 + BioSample: SAMN00102897 +KEYWORDS . +SOURCE Plasmodium falciparum 3D7 + ORGANISM Plasmodium falciparum 3D7 + Eukaryota; Sar; Alveolata; Apicomplexa; Aconoidasida; Haemosporida; + Plasmodiidae; Plasmodium; Plasmodium (Laverania). 
+REFERENCE 1 + CONSRTM Pathogen Informatics + TITLE Direct Submission + JOURNAL Submitted (28-JUN-2019) WTSI, Pathogen Informatics, Wellcome Trust + Sanger Institute, CB10 1SA, United Kingdom +FEATURES Location/Qualifiers + source 1..1047 + /organism="Plasmodium falciparum 3D7" + /mol_type="genomic DNA" + /isolate="3D7" + /db_xref="taxon:36329" + /chromosome="MIT" +ORIGIN + 1 aagcttttgg tatctcgtaa tgtagaacaa tattgagttg accgtcaaat ccttttcatt + 61 aaaagagtgg attaaatgcc cagccaacac catccaattt gattgggaat tatctgtgtt + 121 acaaattttt gatcccaggc tggtaaaaaa tgtaaacttt tagcccataa gaatagaaac + 181 agatgccagg ccaataactc aaacagagct atgacgctat caatttttag caagacggat + 241 aaatttttca tagaacttaa cgtatcatca tccatgcaaa gataaaacgg tagataggga + 301 acaaactgcc tcaagacgtt cttaacccag ctcacgcatc gcttctaacg gtgaactctc + 361 attccaatgg aaccttgttc aagttcaaat agattggtaa ggtatagtgt ttactatcaa + 421 atgaaacaat gtgttccacc gctagtgttt gcttctaaca ttccacttgc ttataactgt + 481 atggacgtaa cctccaggca aagaaaatga ccggtcaaaa cggaatcaat taactatgga + 541 tagctgatac tatcaattta tcattactca agtcagcata gtatatatga aggtttctat + 601 ggaaacacac ttcccttctc gccatttgat agcggttaac ctttcctttt ccttacgtac + 661 tctagctatg aacacaattg tctattcgta caattattca tatatatatt tgaaacagga + 721 catacatgtt catttattct gaatagaata agaactctat aaataaccag actatttcaa + 781 caaaatgcca atataaaatt gtaatttgat cagtgtgagg tataacaata tatgatatac + 841 cgaaagaatt tataaaccat tcggtagaag tatcatatat ttctattatt cttataaagt + 901 atattattaa taataataaa cctattacta catgagaaaa atgtaatcct gtaacacaat + 961 aaaataatgt agtatataca gtatcattta tatgatatga taaatgtaaa tactctgtag + 1021 tttgtagaga tgcaaaacat tctccta +// +LOCUS CP089275 1070 bp DNA linear INV 02-DEC-2019 +DEFINITION Plasmodium falciparum 3D7 genome assembly, chromosome: API. +ACCESSION CP089275 +VERSION CP089275.1 +DBLINK BioProject: PRJNA13173 + BioSample: SAMN00102897 +KEYWORDS . 
+SOURCE Plasmodium falciparum 3D7 + ORGANISM Plasmodium falciparum 3D7 + Eukaryota; Sar; Alveolata; Apicomplexa; Aconoidasida; Haemosporida; + Plasmodiidae; Plasmodium; Plasmodium (Laverania). +REFERENCE 1 + CONSRTM Pathogen Informatics + TITLE Direct Submission + JOURNAL Submitted (28-JUN-2019) WTSI, Pathogen Informatics, Wellcome Trust + Sanger Institute, CB10 1SA, United Kingdom +FEATURES Location/Qualifiers + source 1..1070 + /organism="Plasmodium falciparum 3D7" + /mol_type="genomic DNA" + /isolate="3D7" + /db_xref="taxon:36329" + /chromosome="API" +ORIGIN + 1 atgataaaat ttttaaaacc taaaataaaa atattaaaaa aattaaatat acctttttta + 61 ttatatttat ctagtaaata taattataaa tatttaaatt ataaaatttc atataaatct + 121 tattttgatt taaaattaaa atttattaga tatatatgtt ataattattg tataacatat + 181 aaaaaatatt tatattattt gaataaaata gataataaaa atataaatat tttatatttt + 241 aaattattaa aaatattaga attaagattg gatatatttt tagttaatat aggttttttt + 301 aaaactatat tgcaatcaag gtattatatt aaatataaaa atatttatat taataatatt + 361 ataaataaat attataatat taatttaaaa aataatgata ttttattttt taataataaa + 421 ataaaatata taatattaaa aaatttaatt tataaatata atatttatat ttacatatct + 481 aatttatata aatataattt tattaaaata tatagttata ataaatattt tataatatgt + 541 atttataatt ttaaaattaa aatattaaat ataaataata tattaaataa tatattatat + 601 atttataatg atatatatta tatataatta atagttttta tgataaatat aatctaatgg + 661 ttaagatgaa gaattgtggt ttcttttata tgagttcaaa tctctttatt tatctgttaa + 721 atataaaaat ttaatgatat aacttaattg ataaagtaaa taattgcaaa ttattatatt + 781 tcagtttgaa tctgaatatc atttaaagag agatatggtg aaatttggta tacacaatgg + 841 acttaaaata atttgagtta attattatta atattaaatt tttaagaaaa tatataatat + 901 atttttttaa attctgtaat atattttaaa atttatatat tcaaaagact ttatttataa + 961 aaagtctaaa tttattaaga aaatccatta acattattgt tgtaagggtt caaatccctt + 1021 tatctctaac tataacattt atagctaagt ggtcgaaagc aatggactca +// +LOCUS RefChr0002 192 bp DNA linear INV 11-NOV-2019 +DEFINITION Plasmodium falciparum 3D7 genome assembly, chromosome: API. 
+ACCESSION RefChr0002 +VERSION RefChr0002.1 +DBLINK BioProject: PRJNA13173 + BioSample: SAMN00102897 +KEYWORDS . +SOURCE Plasmodium falciparum 3D7 + ORGANISM Plasmodium falciparum 3D7 + Eukaryota; Sar; Alveolata; Apicomplexa; Aconoidasida; Haemosporida; + Plasmodiidae; Plasmodium; Plasmodium (Laverania). +REFERENCE 1 + CONSRTM Pathogen Informatics + TITLE Direct Submission + JOURNAL Submitted (28-JUN-2019) WTSI, Pathogen Informatics, Wellcome Trust + Sanger Institute, CB10 1SA, United Kingdom +COMMENT On Jan 10, 2020 this sequence version replaced LN999985.1. +FEATURES Location/Qualifiers + source 1..192 + /organism="Plasmodium falciparum 3D7" + /mol_type="genomic DNA" + /isolate="3D7" + /db_xref="taxon:36329" + /chromosome="NUC" +ORIGIN + 1 tcagtttgaa tctgaatatc atttaaagag agatatggtg aaatttggta tacacaatgg + 61 atttataatg atatatatta tatataatta atagttttta tgataaatat aatctaatgg + 121 tattttgatt taaaattaaa atttattaga tatatatgtt ataattattg tataacatat + 181 aaaaaatatt ta +// \ No newline at end of file diff --git a/src/python/tests/genome_metadata/test_extend/updated_genome.json b/src/python/tests/genome_metadata/test_extend/updated_genome.json new file mode 100644 index 000000000..abd08f0a4 --- /dev/null +++ b/src/python/tests/genome_metadata/test_extend/updated_genome.json @@ -0,0 +1,27 @@ +{ + "BRC4": { + "component": "PlasmoDB", + "organism_abbrev": "pfal3D7" + }, + "added_seq": { + "region_name": [ + "CP089275", + "RefChr0001", + "RefChr0002" + ] + }, + "assembly": { + "accession": "GCA_000002765.1", + "provider_name": "RefSeq", + "provider_url": "https://www.ncbi.nlm.nih.gov/refseq", + "version": 1 + }, + "genebuild": { + "start_date": "2023-10-17", + "version": "2023-10-17" + }, + "species": { + "scientific_name": "Plasmodium falciparum", + "taxonomy_id": 36329 + } +} \ No newline at end of file