diff --git a/VERSION b/VERSION index 50ffc5aa..2165f8f9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.3 +2.0.4 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 88c65971..8182db8c 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -238,7 +238,7 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li line = lines.pop() if contig.length != len(sequence): - raise ValueError("The contig lenght defined is different than the sequence length") + raise ValueError("The contig length defined is different than the sequence length") # get each gene's sequence. for gene in contig.genes: gene.add_sequence(get_dna_sequence(sequence, gene)) @@ -253,7 +253,7 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str] :param organism: Organism name :param gff_file_path: Path corresponding to GFF file :param circular_contigs: List of circular contigs - :param pseudo: Allow to read pseudogène + :param pseudo: Allow to read pseudogene :return: Organism object and if there are sequences associated or not """ @@ -293,7 +293,7 @@ def get_id_attribute(attributes_dict: dict) -> str: element_id = attributes_dict.get("ID") if not element_id: raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. " - f"Not the case for file: {gff_file_path}") + f"Not the case for file: {gff_file_path} with ID {element_id}") return element_id contig = None # initialize contig @@ -419,7 +419,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, :param organism_name: Name of the organism :param filename: Path to the corresponding file :param circular_contigs: list of sequence in contig - :param pseudo: allow to read pseudogène + :param pseudo: allow to read pseudogene :return: Annotated organism for pangenome and true for sequence in file """ @@ -428,16 +428,22 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, if filetype == "gff": try: return read_org_gff(organism_name, filename, circular_contigs, pseudo) - except Exception: - raise Exception(f"Reading the gff3 file '{filename}' raised an error.") + except Exception as err: + raise Exception(f"Reading the gff3 file '{filename}' raised an error. {err}") elif filetype == "gbff": try: return read_org_gbff(organism_name, filename, circular_contigs, pseudo) - except Exception: - raise Exception(f"Reading the gbff file '{filename}' raised an error.") - else: # Fasta type obligatory because unknown raise an error in detect_filetype function - raise Exception("Wrong file type provided. This looks like a fasta file. " - "You may be able to use --fasta instead.") + except Exception as err: + raise Exception(f"Reading the gbff file '{filename}' raised an error. {err}") + + elif filetype == "fasta": + raise ValueError(f"Invalid file type provided for parameter '--anno'. The file '{filename}' looks like a fasta file. " + "Please use a .gff or .gbff file. You may be able to use --fasta instead of --anno.") + + else: + raise ValueError(f"Invalid file type provided for parameter '--anno'. The file '{filename}' appears to be of type '{filetype}'. " + "Please use .gff or .gbff files.") + def chose_gene_identifiers(pangenome: Pangenome) -> bool: diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 3be897fd..d1eaa428 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -169,7 +169,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, :param org: Organism corresponding to fasta file :param fna_file: Input fasta file with sequences or list of each line as sequence - :return: Dictionnary with contig_name as keys and contig sequence in values + :return: Dictionary with contig_name as keys and contig sequence in values """ global contig_counter try: @@ -199,8 +199,8 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " f"One possibility for this error is that the file did not start with a '>' " f"as it would be expected from a fna file.") - except Exception: # To manage other exception which can occur - raise Exception("Unexpected error. Please check your input file and if everything looks fine, " + except Exception as err: # To manage other exception which can occur + raise Exception(f"{err}: Please check your input file and if everything looks fine, " "please post an issue on our github") return contigs diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index a18b8f8c..593bdae4 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -526,6 +526,7 @@ def part_spec(part: str) -> list: mod_fam = [len(module) for module in pangenome.modules] + sum_mod_fam = sum(mod_fam) info_group._v_attrs.StatOfFamiliesInModules = {"min": getmin(mod_fam), "max": getmax(mod_fam), @@ -536,19 +537,19 @@ def part_spec(part: str) -> list: spec_shell = part_spec(part='shell') spec_cloud = part_spec(part='cloud') - info_group._v_attrs.PersistentSpecInModules = {"percent": round((sum(spec_pers) / sum(mod_fam)) * 100, 2), + info_group._v_attrs.PersistentSpecInModules = {"percent": round((sum(spec_pers) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0, "min": getmin(spec_pers), "max": getmax(spec_pers), "sd": getstdev(spec_pers), "mean": getmean(spec_pers)} - info_group._v_attrs.ShellSpecInModules = {"percent": round((sum(spec_shell) / sum(mod_fam)) * 100, 2), + info_group._v_attrs.ShellSpecInModules = {"percent": round((sum(spec_shell) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0, "min": getmin(spec_shell), "max": getmax(spec_shell), "sd": getstdev(spec_shell), "mean": getmean(spec_shell)} - info_group._v_attrs.CloudSpecInModules = {"percent": round((sum(spec_cloud) / sum(mod_fam)) * 100, 2), + info_group._v_attrs.CloudSpecInModules = {"percent": round((sum(spec_cloud) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0, "min": getmin(spec_cloud), "max": getmax(spec_cloud), "sd": getstdev(spec_cloud), diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 159f4996..880ab22a 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -76,8 +76,6 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: if not colname_check.match(column): raise ValueError(f"column name is not a valid identifier: {column}; " f"it does not match the pattern {colname_check.pattern}") - if column != metatype and metadata_df.dtypes[column] == object: - pd.to_numeric(metadata_df[column], downcast='integer', errors='ignore') return metadata_df diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 59c16bed..908e5112 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -14,6 +14,7 @@ import tempfile import time from itertools import zip_longest +import re import networkx as nx from importlib.metadata import distribution @@ -307,7 +308,7 @@ def mk_file_name(basename: str, output: Path, force: bool = False) -> Path: def detect_filetype(filename: Path) -> str: """ - Detects whether the current file is gff3, gbk/gbff, fasta or unknown. + Detects whether the current file is gff3, gbk/gbff, fasta, tsv or unknown. If unknown, it will raise an error :param filename: path to file @@ -318,7 +319,7 @@ def detect_filetype(filename: Path) -> str: first_line = f.readline() if first_line.startswith("LOCUS "): # then this is probably a gbff/gbk file return "gbff" - elif first_line.startswith("##gff-version 3") or first_line.startswith("##gff-version 3"): # prodigal gff header has two spaces betwene gff-version and 3... + elif re.match(r"##gff-version\s{1,3}3", first_line): # prodigal gff header has two spaces between gff-version and 3... some gff user can have a tab return 'gff' elif first_line.startswith(">"): return 'fasta' @@ -326,8 +327,8 @@ def detect_filetype(filename: Path) -> str: return "tsv" else: raise Exception(f"Filetype {filename} was not gff3 (file starts with '##gff-version 3') " - "nor gbff/gbk (file starts with 'LOCUS '). " - "Only those two file formats are supported (for now).") + "nor gbff/gbk (file starts with 'LOCUS ') " + "nor fasta (file starts with '>') nor tsv (file has '\t' in the first line). ") def restricted_float(x: Union[int, float]) -> float: