Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unit test for ensembl.io.genomio.genome_metadata.extend module #308

Merged
merged 15 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ process AMEND_GENOME_DATA {
'''
genome_metadata_extend --genome_infile !{genome_json} \
--report_file !{asm_report} \
--genbank_infile !{genbank_gbff} \
--genbank_file !{genbank_gbff} \
--genome_outfile !{output}

schemas_json_validate --json_file !{output} --json_schema genome
Expand Down
116 changes: 47 additions & 69 deletions src/python/ensembl/io/genomio/genome_metadata/extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Add more metadata to the genome metadata file, including added seq_regions (e.g. MT chromosome)."""
"""Update a genome metadata file to include additional sequence regions (e.g. MT chromosome)."""

__all__ = [
"MissingDataError",
"get_additions",
"get_gbff_regions",
"get_report_regions_names",
"amend_genomic_metadata",
"amend_genome_metadata",
]

import csv
from os import PathLike
from pathlib import Path
import re
from typing import List, Tuple, Optional
from typing import Dict, List, Tuple, Optional

from Bio import SeqIO

Expand All @@ -38,17 +37,7 @@
_VERSION_END = re.compile(r"\.\d+$")


class MissingDataError(Exception):
"""Used if some data is missing from the report file."""

def __init__(self, report_path: PathLike, accession: str, msg: str):
report_msg = f"Can't get data for {accession} in report {report_path}"
if msg:
report_msg = f"{report_msg}: {msg}"
self.msg = report_msg


def get_additions(report_path: Path, gbff_path: Optional[Path]) -> List[str]:
def get_additions(report_path: PathLike, gbff_path: Optional[PathLike]) -> List[str]:
"""Returns all `seq_regions` that are mentioned in the report but that are not in the data.

Args:
Expand All @@ -57,81 +46,74 @@ def get_additions(report_path: Path, gbff_path: Optional[Path]) -> List[str]:
"""
gbff_regions = set(get_gbff_regions(gbff_path))
report_regions = get_report_regions_names(report_path)

additions = []
for rep_seq in report_regions:
(rs_seq, gb_seq) = rep_seq
if rs_seq not in gbff_regions and gb_seq not in gbff_regions:
if rs_seq:
additions.append(rs_seq)
for seq_region_name in report_regions:
(genbank_seq_name, refseq_seq_name) = seq_region_name
if genbank_seq_name not in gbff_regions and refseq_seq_name not in gbff_regions:
if refseq_seq_name:
additions.append(refseq_seq_name)
else:
additions.append(gb_seq)
additions.append(genbank_seq_name)
additions = sorted(additions)
return additions


def get_gbff_regions(gbff_path: Optional[Path]) -> List[str]:
"""Returns the `seq_region` data from the GBFF file.
def get_gbff_regions(gbff_path: Optional[PathLike]) -> List[str]:
"""Returns the `seq_region` data from a GBFF file.

Args:
gbff_path: Gbff file path to use.
gbff_path: GBFF file path to use.
"""
if not gbff_path:
return []

seq_regions = []
with open_gz_file(gbff_path) as gbff_file:
for record in SeqIO.parse(gbff_file, "genbank"):
record_id = re.sub(_VERSION_END, "", record.id)
seq_regions.append(record_id)
if gbff_path:
with open_gz_file(gbff_path) as gbff_file:
for record in SeqIO.parse(gbff_file, "genbank"):
record_id = re.sub(_VERSION_END, "", record.id)
seq_regions.append(record_id)
return seq_regions


def _report_to_csv(report_path: Path) -> Tuple[str, dict]:
"""Returns an assembly report as a CSV string, and the head metadata as a dict.
def _report_to_csv(report_path: PathLike) -> Tuple[str, Dict]:
"""Returns the assembly report as a CSV string, and its metadata as a dictionary.

Args:
report_path: Path to a `seq_region` file from INSDC/RefSeq.

report_path: Path to the assembly report file from INSDC/RefSeq.
"""
data = ""
metadata = {}
with report_path.open("r") as report:
last_head = ""
with Path(report_path).open("r") as report:
prev_line = ""
for line in report:
# Ignore header
if line.startswith("#"):
# Get metadata values if possible
match = re.search("# (.+?): (.+?)$", line)
match = re.search(r"^#\s*([^:]+?):\s+(.+?)\s*$", line)
if match:
metadata[match.group(1)] = match.group(2)
last_head = line
continue
if last_head:
data += last_head[2:].strip() + "\n"
last_head = ""
data += line
prev_line = line
else:
if prev_line:
# Add previous line as header of CSV string, removing the initial "# "
data += prev_line[2:].strip() + "\n"
prev_line = ""
data += line
return data, metadata


def get_report_regions_names(report_path: Path) -> List[Tuple[str, str]]:
"""Returns a list of `seq_region` names from the report file.
def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]:
"""Returns a list of GenBank-RefSeq `seq_region` names from the assembly report file.

Args:
report_path: Path to the seq_regions report from INSDC/RefSeq.
report_path: Path to the assembly report file from INSDC/RefSeq.
"""
# Get the report in a CSV format, easier to manipulate
report_csv, _ = _report_to_csv(report_path)

# Feed the csv string to the CSV reader
# Feed the CSV string to the CSV reader
reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE)

# Create the seq_regions
seq_regions = []
for row in reader:
refseq_name = row["RefSeq-Accn"]
genbank_name = row["GenBank-Accn"]

if refseq_name == "na":
refseq_name = ""
if genbank_name == "na":
Expand All @@ -142,55 +124,51 @@ def get_report_regions_names(report_path: Path) -> List[Tuple[str, str]]:
return seq_regions


def amend_genomic_metadata(
def amend_genome_metadata(
genome_infile: PathLike,
genome_outfile: PathLike,
report_file: Optional[PathLike] = None,
genbank_infile: Optional[PathLike] = None,
genbank_file: Optional[PathLike] = None,
) -> None:
"""
Args:
genome_infile: Genome data following the src/python/ensembl/io/genomio/data/schemas/genome.json.
genome_outfile: Amended genome data file.
genome_infile: Genome metadata following the `src/python/ensembl/io/genomio/data/schemas/genome.json`.
genome_outfile: Amended genome metadata file.
report_file: INSDC/RefSeq sequences report file.
genbank_infile: INSDC/RefSeq GBFF file.
genbank_file: INSDC/RefSeq GBFF file.
"""
genome_metadata = get_json(genome_infile)

# Get additional sequences in the assembly but not in the data
if report_file:
gbff_path = Path(genbank_infile) if genbank_infile else None
additions = get_additions(Path(report_file), gbff_path)
genbank_path = Path(genbank_file) if genbank_file else None
additions = get_additions(report_file, genbank_path)
if additions:
genome_metadata["added_seq"] = {"region_name": additions}

# Print out the file
genome_outfile = Path(genome_outfile)
print_json(genome_outfile, genome_metadata)


def main() -> None:
"""Module's entry-point."""
parser = ArgumentParser(
description="Update genome metadata file to include additional sequence regions (e.g. MT chromosome)."
)
parser = ArgumentParser(description=__doc__)
parser.add_argument_src_path(
"--genome_infile",
required=True,
help="Input genome file (following the src/python/ensembl/io/genomio/data/schemas/genome.json)",
help="Input genome metadata file (following src/python/ensembl/io/genomio/data/schemas/genome.json)",
)
parser.add_argument_dst_path(
"--genome_outfile", required=True, help="Path to the new amended genome metadata file"
)
parser.add_argument_src_path("--report_file", help="INSDC/RefSeq sequences report file")
parser.add_argument_src_path("--genbank_infile", help="INSDC/RefSeq GBFF file")
parser.add_argument_src_path("--genbank_file", help="INSDC/RefSeq GBFF file")
parser.add_log_arguments()
args = parser.parse_args()
init_logging_with_args(args)

amend_genomic_metadata(
amend_genome_metadata(
genome_infile=args.genome_infile,
genome_outfile=args.genome_outfile,
report_file=args.report_file,
genbank_infile=args.genbank_infile,
genbank_file=args.genbank_file,
)
Loading