diff --git a/VERSION b/VERSION index ccbccc3d..c043eea7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.0 +2.2.1 diff --git a/docs/user/RGP/rgpOutputs.md b/docs/user/RGP/rgpOutputs.md index 317230c1..e9a86b33 100644 --- a/docs/user/RGP/rgpOutputs.md +++ b/docs/user/RGP/rgpOutputs.md @@ -25,6 +25,7 @@ The file has the following format : | stop | The stop position of the RGP in the contig. | | length | The length of the RGP in nucleotide | | coordinates | The coordinates of the region. If the region overlap the contig edges will be right with join coordinates syntax (*i.e* 1523..1758,1..57) | +| score | Score of the RGP. | | contigBorder | This is a boolean column. If the RGP is on a contig border it will be True, otherwise, it will be False. This often can indicate that, if an RGP is on a contig border it is probably not complete. | | wholeContig | This is a boolean column. If the RGP is an entire contig, it will be True, and False otherwise. If a RGP is an entire contig it can possibly be a plasmid, a region flanked with repeat sequences or a contaminant. | diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index e7d8918d..33373341 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -4,7 +4,7 @@ import logging import argparse from pathlib import Path -from typing import Set, Iterable, List, Tuple +from typing import Set, Iterable # installed libraries from tqdm import tqdm diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index f99c1f46..c4359e34 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -22,7 +22,7 @@ # local libraries from ppanggolin.annotate.synta import ( annotate_organism, - read_fasta, + get_contigs_from_fasta_file, get_dna_sequence, init_contig_counter, contig_counter, @@ -1034,17 +1034,22 @@ def check_chevrons_in_start_and_stop( dbxref_metadata ) - if fields_gff[gff_seqname] in circular_contigs or ( + if ( "IS_CIRCULAR" in attributes and attributes["IS_CIRCULAR"] == "true" ): - # WARNING: In case we have prodigal gff with is_circular attributes. - # This would fail as contig is not defined. However, is_circular should not be found in prodigal gff - logging.getLogger("PPanGGOLiN").debug( - f"Contig {contig.name} is circular." - ) - contig.is_circular = True - assert contig.name == fields_gff[gff_seqname] + contig_name = fields_gff[gff_seqname] + + if contig is not None: + logging.getLogger("PPanGGOLiN").debug( + f"Contig {contig.name} is circular." + ) + contig.is_circular = True + assert contig.name == contig_name + else: + # contig object has not been initialized yet. + # let's keep the circularity info in the circular_contigs list + circular_contigs.append(contig_name) elif fields_gff[gff_type] == "CDS" or "RNA" in fields_gff[gff_type]: @@ -1120,9 +1125,9 @@ def check_chevrons_in_start_and_stop( gene_frame = int(fields_gff[frame]) if ( - gene_id in id_attr_to_gene_id + id_attribute in id_attr_to_gene_id ): # the ID has already been seen at least once in this genome - existing_gene = id_attr_to_gene_id[gene_id] + existing_gene = id_attr_to_gene_id[id_attribute] new_gene_info = { "strand": fields_gff[gff_strand], "type": fields_gff[gff_type], @@ -1141,7 +1146,7 @@ def check_chevrons_in_start_and_stop( gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) - id_attr_to_gene_id[gene_id] = gene + id_attr_to_gene_id[id_attribute] = gene # here contig is filled in order, so position is the number of genes already stored in the contig. gene.fill_annotations( @@ -1182,9 +1187,6 @@ def check_chevrons_in_start_and_stop( rna_counter += 1 contig.add_rna(rna) - # Correct coordinates of genes that overlap the edge of circulars contig - correct_putative_overlaps(org.contigs) - # Fix partial genes coordinates for contig in org.contigs: for gene in contig.genes: @@ -1201,14 +1203,11 @@ def check_chevrons_in_start_and_stop( has_fasta = False if has_fasta: - contig_sequences = read_fasta( - org, fasta_string.split("\n") - ) # _ is total contig length + contig_sequences = get_contigs_from_fasta_file(org, fasta_string.split("\n")) + + correct_putative_overlaps(org.contigs) + for contig in org.contigs: - if contig.length != len(contig_sequences[contig.name]): - raise ValueError( - "The contig length defined is different than the sequence length" - ) for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) @@ -1220,7 +1219,7 @@ def check_chevrons_in_start_and_stop( add_metadata_from_gff_file(contig_name_to_region_info, org, gff_file_path) if used_transl_table_arg: - logging.getLogger("PPanGGOLiN").info( + logging.getLogger("PPanGGOLiN").debug( f"transl_table tag was not found for {used_transl_table_arg} CDS " f"in {gff_file_path}. Provided translation_table argument value was used instead: {translation_table}." ) @@ -1616,7 +1615,15 @@ def get_gene_sequences_from_fastas( f"your fasta file are different." ) with read_compressed_or_not(Path(elements[1])) as currFastaFile: - fasta_dict[org] = read_fasta(org, currFastaFile) + fasta_dict[org] = get_contigs_from_fasta_file(org, currFastaFile) + + # When dealing with GFF files, some genes may have coordinates extending beyond the actual + # length of contigs, especially when they overlap the edges. This usually needs to be split + # into two parts to handle the circular genome wrapping. + # If the GFF file lacks associated FASTA sequences and it was not possible to determine the + # contig length from the GFF file, we must apply this correction while parsing the external FASTA file. + + correct_putative_overlaps(org.contigs) if set(pangenome.organisms) > set(fasta_dict.keys()): missing = pangenome.number_of_organisms - len( diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 2b65ea40..f3c74a92 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -9,7 +9,7 @@ from subprocess import Popen, PIPE import ast from collections import defaultdict -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Generator, Tuple from pathlib import Path # install libraries @@ -245,49 +245,103 @@ def launch_infernal( return gene_objs -def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]: - """Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value. +def check_sequence_tuple(name: str, sequence: str): + """ + Checks and validates a sequence name and its corresponding sequence. + + :param name: The name (header) of the sequence, typically extracted from the FASTA file header. + :param sequence: The sequence string corresponding to the name, containing the nucleotide or protein sequence. - :param org: Organism corresponding to fasta file - :param fna_file: Input fasta file with sequences or list of each line as sequence + :return: A tuple containing the validated name and sequence. - :return: Dictionary with contig_name as keys and contig sequence in values + :raises ValueError: + - If the sequence is empty, a ValueError is raised with a message containing the header name. + - If the name is empty, a ValueError is raised with a message containing a preview of the sequence. """ - global contig_counter - try: - contigs = {} - contig_seq = "" - contig = None - for line in fna_file: - if line.startswith(">"): - if len(contig_seq) >= 1: # contig filter = 1 - contigs[contig.name] = contig_seq.upper() - contig.length = len(contig_seq) - contig_seq = "" - try: - contig = org.get(line.split()[0][1:]) - except KeyError: - with contig_counter.get_lock(): - contig = Contig(contig_counter.value, line.split()[0][1:]) - contig_counter.value += 1 - org.add(contig) - else: - contig_seq += line.strip() - if len(contig_seq) >= 1: # processing the last contig - contigs[contig.name] = contig_seq.upper() - contig.length = len(contig_seq) - - except AttributeError as e: - raise AttributeError( - f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " - f"One possibility for this error is that the file did not start with a '>' " - f"as it would be expected from a fna file." - ) - except Exception as err: # To manage other exception which can occur - raise Exception( - f"{err}: Please check your input file and if everything looks fine, " - "please post an issue on our github" + if not sequence: + raise ValueError(f"Found an empty sequence with header '{name}'") + + if not name: + raise ValueError( + f"Found a sequence with empty name (sequence starts as '{sequence[:60]}')" ) + + return name, sequence + + +def parse_fasta( + fna_file: Union[TextIOWrapper, list] +) -> Generator[Tuple[str, str], None, None]: + """Yields each sequence name and sequence from a FASTA file or stream as a tuple. + + :param fna_file: Input FASTA file or list of lines as sequences. + :yield: Tuple with contig header (without '>') and sequence. + :raises ValueError: If the file does not contain valid FASTA format. + """ + name = None + sequence = "" + + for line in fna_file: + line = line.strip() + + if line.startswith(">"): # New header + if name: # Yield previous header and sequence if available + yield check_sequence_tuple(name, sequence) + + name = line[1:].split()[ + 0 + ] # Strip '>' and extract the first word as the name + sequence = "" + + elif line: # Only append non-empty lines + sequence += line + + else: + # You can skip or handle empty lines here if required + pass + + # Yield the final contig if exists + if name: + yield check_sequence_tuple(name, sequence) + + # Check if there was any valid data (at least one header and sequence) + if not name: + raise ValueError("The file does not contain any valid FASTA content.") + + +def get_contigs_from_fasta_file( + org: Organism, fna_file: Union[TextIOWrapper, list] +) -> Dict[str, str]: + """Processes contigs from a parsed FASTA generator and stores in a dictionary. + + :param org: Organism instance to update with contig info. + :param fna_file: Input FASTA file or list of lines as sequences. + :return: Dictionary with contig names as keys and sequences as values. + """ + + global contig_counter + contigs = {} + + for contig_name, sequence in parse_fasta(fna_file): + + # Retrieve or create the contig + try: + contig = org.get(contig_name) + except KeyError: + with contig_counter.get_lock(): + contig = Contig(contig_counter.value, contig_name) + contig_counter.value += 1 + org.add(contig) + + # Update contig information + if contig.length is not None and contig.length != len(sequence): + raise ValueError( + f"Length mismatch for contig {contig_name}: expected {contig.length}, found {len(sequence)} from the fasta sequence." + ) + + contig.length = len(sequence) + contigs[contig_name] = sequence.upper() + return contigs @@ -464,7 +518,7 @@ def annotate_organism( fasta_file = read_compressed_or_not(file_name) - contig_sequences = read_fasta(org, fasta_file) + contig_sequences = get_contigs_from_fasta_file(org, fasta_file) if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index bf25c51d..c2607aa1 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -971,7 +971,13 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): region = pangenome.get_region(row["RGP"].decode()) except KeyError: region = Region(row["RGP"].decode()) + + # starting from v2.2.1 score is part of RGP table in h5. + if "score" in row.dtype.names: + region.score = row["score"] + pangenome.add_region(region) + gene = pangenome.get_gene(row["gene"].decode()) region.add(gene) pangenome.status["predictedRGP"] = "Loaded" @@ -979,7 +985,7 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ - Read hotspot in pangenome hdf5 file to add in pangenome object + Read hotspots in the pangenome HDF5 file and add them to the pangenome object. :param pangenome: Pangenome object without spot :param h5f: Pangenome HDF5 file with spot computed @@ -987,20 +993,23 @@ def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False """ table = h5f.root.spots spots = {} + curr_spot_id = None for row in tqdm( read_chunks(table, chunk=20000), total=table.nrows, unit="spot", disable=disable_bar, ): - curr_spot = spots.get(int(row["spot"])) - if curr_spot is None: - curr_spot = Spot(int(row["spot"])) - spots[row["spot"]] = curr_spot + if curr_spot_id != int(row["spot"]): + curr_spot_id = int(row["spot"]) + curr_spot = spots.get(curr_spot_id) + if curr_spot is None: + curr_spot = Spot(int(row["spot"])) + spots[row["spot"]] = curr_spot region = pangenome.get_region(row["RGP"].decode()) curr_spot.add(region) - curr_spot.spot_2_families() for spot in spots.values(): + spot.spot_2_families() pangenome.add_spot(spot) pangenome.status["spots"] = "Loaded" @@ -1548,7 +1557,6 @@ def read_pangenome( f"The pangenome in file '{filename}' does not have spots information, " f"or has been improperly filled" ) - if modules: if h5f.root.status._v_attrs.modules: logging.getLogger("PPanGGOLiN").info("Reading the modules...") diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 17ba76ab..67dc0463 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -299,6 +299,7 @@ def rgp_desc(max_rgp_len, max_gene_len): return { "RGP": tables.StringCol(itemsize=max_rgp_len), "gene": tables.StringCol(itemsize=max_gene_len), + "score": tables.UInt32Col(), } @@ -355,6 +356,7 @@ def write_rgp( for gene in region.genes: rgp_row["RGP"] = region.name rgp_row["gene"] = gene.ID + rgp_row["score"] = region.score rgp_row.append() rgp_table.flush() diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index 5ef84f4b..5de33763 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -1096,6 +1096,7 @@ def write_rgp_table(regions: Set[Region], output: Path, compress: bool = False): "stop", "length", "coordinates", + "score", "contigBorder", "wholeContig", ] @@ -1117,6 +1118,7 @@ def write_rgp_table(regions: Set[Region], output: Path, compress: bool = False): "stop": region.stop, "length": region.length, "coordinates": region.string_coordinates(), + "score": region.score, "contigBorder": region.is_contig_border, "wholeContig": region.is_whole_contig, } diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 46cbdcdd..331db45c 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -553,8 +553,6 @@ def __setitem__(self, coordinate: Tuple[int, int, str], gene: Gene): @property def length(self) -> Union[int, None]: """Get the length of the contig""" - if self._length is None: - logging.getLogger("PPanGGOLiN").warning("Contig length is unknown") return self._length @length.setter diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index d577e840..b0a36c5f 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -20,7 +20,7 @@ import yaml # # local libraries -from ppanggolin.annotate.synta import read_fasta, get_dna_sequence +from ppanggolin.annotate.synta import get_contigs_from_fasta_file, get_dna_sequence from ppanggolin.annotate.annotate import ( init_contig_counter, read_anno_file, @@ -679,7 +679,7 @@ def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path): org_fasta_file = genome_name_to_annot_path[org.name]["path"] with read_compressed_or_not(org_fasta_file) as currFastaFile: - org_contig_to_seq = read_fasta(org, currFastaFile) + org_contig_to_seq = get_contigs_from_fasta_file(org, currFastaFile) for contig in org.contigs: try: @@ -997,7 +997,7 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): """ with read_compressed_or_not(fasta_file) as currFastaFile: - contig_id2seq = read_fasta(input_organism, currFastaFile) + contig_id2seq = get_contigs_from_fasta_file(input_organism, currFastaFile) for contig in input_organism.contigs: try: diff --git a/ppanggolin/region.py b/ppanggolin/region.py index e2589e13..39f90326 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -51,7 +51,7 @@ def __init__(self, name: str): super().__init__() self._genes_getter = {} self.name = name - self.score = 0 + self.score = None self._starter = None self._stopper = None self._coordinates = None diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 3ce60b7c..9898c127 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -602,6 +602,14 @@ def add_workflow_args(parser: argparse.ArgumentParser): "it will be assigned to its own unique cluster as a singleton.", ) + optional.add_argument( + "--use_pseudo", + required=False, + action="store_true", + help="In the context of provided annotation, use this option to read pseudogenes. " + "(Default behavior is to ignore them)", + ) + optional.add_argument( "-K", "--nb_of_partitions", diff --git a/testingDataset/expected_info_files/myannopang_info.yaml b/testingDataset/expected_info_files/myannopang_info.yaml index 3b7c1b5b..13a402d7 100644 --- a/testingDataset/expected_info_files/myannopang_info.yaml +++ b/testingDataset/expected_info_files/myannopang_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: false Spots_Predicted: false Modules_Predicted: false - PPanGGOLiN_Version: 2.2.0 + PPanGGOLiN_Version: 2.2.1 Content: Genes: 47986 diff --git a/testingDataset/expected_info_files/mybasicpangenome_info.yaml b/testingDataset/expected_info_files/mybasicpangenome_info.yaml index b0bedf6f..9197a24a 100644 --- a/testingDataset/expected_info_files/mybasicpangenome_info.yaml +++ b/testingDataset/expected_info_files/mybasicpangenome_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: true Spots_Predicted: true Modules_Predicted: true - PPanGGOLiN_Version: 2.2.0 + PPanGGOLiN_Version: 2.2.1 Content: Genes: 45429 diff --git a/testingDataset/expected_info_files/stepbystep_info.yaml b/testingDataset/expected_info_files/stepbystep_info.yaml index ef4525fc..203f80fa 100644 --- a/testingDataset/expected_info_files/stepbystep_info.yaml +++ b/testingDataset/expected_info_files/stepbystep_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: true Spots_Predicted: true Modules_Predicted: true - PPanGGOLiN_Version: 2.2.0 + PPanGGOLiN_Version: 2.2.1 Content: Genes: 45429 diff --git a/tests/annotate/test_annotate.py b/tests/annotate/test_annotate.py index cd933eab..377dab8f 100644 --- a/tests/annotate/test_annotate.py +++ b/tests/annotate/test_annotate.py @@ -16,6 +16,8 @@ shift_end_coordinates, ) +from ppanggolin.annotate.synta import check_sequence_tuple, parse_fasta + @pytest.mark.parametrize( "input_string, expected_positions, expected_complement, expected_partialgene_start, expected_partialgene_end", @@ -531,3 +533,39 @@ def test_shift_start_coordinates(coordinates, shift, expected): def test_shift_end_coordinates(coordinates, shift, expected): result = shift_end_coordinates(coordinates, shift) assert result == expected + + +def test_check_sequence_tuple_valid(): + name, sequence = check_sequence_tuple("seq1", "ATGC") + assert name == "seq1" + assert sequence == "ATGC" + + +def test_check_sequence_tuple_empty_name(): + with pytest.raises(ValueError): + check_sequence_tuple("", "ATGC") + + +def test_check_sequence_tuple_empty_sequence(): + with pytest.raises(ValueError): + check_sequence_tuple("seq1", "") + + +def test_parse_fasta_valid(): + fasta_data = ">seq1\nATGC\n>seq2\nGCTA" + + result = list(parse_fasta(fasta_data.split("\n"))) + + assert result == [("seq1", "ATGC"), ("seq2", "GCTA")] + + +def test_parse_fasta_empty_sequence(): + fasta_data = ">seq1\n>seq2\nGCTA" + with pytest.raises(ValueError): + list(parse_fasta(fasta_data.split("\n"))) + + +def test_parse_fasta_no_header(): + fasta_data = "seq1\nATGC\nseq2\nGCTA".split("\n") + with pytest.raises(ValueError): + list(parse_fasta(fasta_data))