diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b6af8b4b..960fb317 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -93,12 +93,11 @@ jobs: shell: bash -l {0} run: | cd testingDataset - ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --proteins some_chlam_proteins.fasta --output test_align --draw_related --getinfo - ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --anno NC_010287.gbff --output test_align -f + ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_align --draw_related --getinfo cd - - name: testing context command shell: bash -l {0} run: | cd testingDataset - ppanggolin context --pangenome myannopang/pangenome.h5 --proteins some_chlam_proteins.fasta --output test_context + ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context ppanggolin context --pangenome readclusterpang/pangenome.h5 --family some_chlam_families.txt --output test_context -f diff --git a/VERSION b/VERSION index 1f4a7ab2..f98da2cd 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.45 +1.2.46 diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index f7600ddd..24440caa 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -9,11 +9,9 @@ from collections import defaultdict # local libraries -from ppanggolin.formats import checkPangenomeInfo, writeGeneSequencesFromAnnotations +from ppanggolin.formats import checkPangenomeInfo from ppanggolin.utils import mkOutdir, read_compressed_or_not from ppanggolin.pangenome import Pangenome -from ppanggolin.annotate import detect_filetype, read_org_gff, read_org_gbff -from ppanggolin.RGP.genomicIsland import compute_org_rgp from ppanggolin.figures.draw_spot import drawSelectedSpots, subgraph @@ -30,13 +28,12 @@ def createdb(fileObj, tmpdir): :rtype: _io.TextIOWrapper """ seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name) - cmd = ["mmseqs", "createdb", fileObj.name, seqdb.name, '--dbtype','0'] + cmd = ["mmseqs", "createdb", fileObj.name, seqdb.name, '--dbtype', '0'] subprocess.run(cmd, stdout=subprocess.DEVNULL) return seqdb -def alignSeqToPang(pangFile, seqFile, output, tmpdir, cpu=1, no_defrag=False, identity=0.8, coverage=0.8, - code=11): +def alignSeqToPang(pangFile, seqFile, output, tmpdir, cpu=1, no_defrag=False, identity=0.8, coverage=0.8): pang_db = createdb(pangFile, tmpdir) seq_db = createdb(seqFile, tmpdir) cov_mode = "0" # coverage of query and target @@ -140,7 +137,7 @@ def draw_spot_gexf(spots, output, multigenics, fam2mod, set_size=3): subgraph(spot, fname, set_size=set_size, multigenics=multigenics, fam2mod=fam2mod) -def getSeqInfo(seq2pang, pangenome, output, cpu, draw_related, disable_bar=False): +def getSeqInfo(seq2pang, pangenome, output, draw_related, disable_bar=False): logging.getLogger().info("Writing RGP and spot information related to hits in the pangenome") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) @@ -191,7 +188,7 @@ def get_seq2pang(pangenome, sequenceFile, output, tmpdir, cpu=1, no_defrag=False :param output: Output directory :type output: str :param tmpdir: Temporary directory - :type tmpdir: str + :type tmpdir: tempfile.TemporaryDirectory :param cpu: number of CPU cores to use :type cpu: int :param no_defrag: do not use the defrag workflow if true @@ -220,7 +217,7 @@ def get_seq2pang(pangenome, sequenceFile, output, tmpdir, cpu=1, no_defrag=False def align(pangenome, sequenceFile, output, tmpdir, identity=0.8, coverage=0.8, no_defrag=False, cpu=1, getinfo=False, - draw_related=False, priority='name,ID', disable_bar=False): + draw_related=False, disable_bar=False): if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: raise Exception("Cannot use this function as your pangenome does not have gene families representatives " "associated to it. For now this works only if the clustering is realised by PPanGGOLiN.") @@ -240,10 +237,10 @@ def align(pangenome, sequenceFile, output, tmpdir, identity=0.8, coverage=0.8, n new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) seqSet, alignFile, seq2pang = get_seq2pang(pangenome, sequenceFile, output, new_tmpdir, - cpu, no_defrag, identity, coverage) + cpu, no_defrag, identity, coverage) if getinfo or draw_related: - getSeqInfo(seq2pang, pangenome, output, cpu, draw_related, disable_bar=disable_bar) + getSeqInfo(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) else: partProj = projectPartition(seq2pang, seqSet, output) # write the partition assignation only logging.getLogger().info(f"sequences partition projection : '{partProj}'") @@ -264,14 +261,15 @@ def launch(args): if args.sequences is not None: align(pangenome=pangenome, sequenceFile=args.sequences, output=args.output, tmpdir=args.tmpdir, cpu=args.cpu, identity=args.identity, coverage=args.coverage, no_defrag=args.no_defrag, getinfo=args.getinfo, - draw_related=args.draw_related, priority=args.label_priority, disable_bar=args.disable_prog_bar) + draw_related=args.draw_related, disable_bar=args.disable_prog_bar) + def alignSubparser(subparser): parser = subparser.add_parser("align", formatter_class=argparse.ArgumentDefaultsHelpFormatter) required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") - required.add_argument('-S','--sequences', required=True, type=str, - help="sequences (nucleotides or amino acids) to align on the pangenome gene families") + required.add_argument('-S', '--sequences', required=True, type=str, + help="sequences (nucleotides or amino acids) to align on the pangenome gene families") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") required.add_argument('-o', '--output', required=True, type=str, diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 91b1f401..30e04075 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -34,6 +34,7 @@ class geneContext: Methods ------- """ + def __init__(self, ID, families=None): """ Initial methods @@ -61,8 +62,8 @@ def add_family(self, family): self.families.add(family) -def search_geneContext_in_pangenome(pangenome, output, tmpdir, sequences=None, families=None, transitive=4, identity=0.5, - coverage=0.8, jaccard=0.85, no_defrag=False, cpu=1, disable_bar=True): +def search_geneContext_in_pangenome(pangenome, output, tmpdir, sequences=None, families=None, transitive=4, + identity=0.5, coverage=0.8, jaccard=0.85, no_defrag=False, cpu=1, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families @@ -104,7 +105,7 @@ def search_geneContext_in_pangenome(pangenome, output, tmpdir, sequences=None, f # Alignment of sequences on pangenome families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, new_tmpdir, cpu, no_defrag, - identity, coverage) + identity, coverage) projectPartition(seq2pan, seq_set, output) new_tmpdir.cleanup() for k, v in seq2pan.items(): @@ -206,7 +207,7 @@ def extract_gene_context(gene, contig, families, t=4): :rtype: (int, Bool, int, Bool) """ pos_left, pos_right = (max(0, gene.position - t), - min(gene.position + t, len(contig)-1)) # Gene positions to compare family + min(gene.position + t, len(contig) - 1)) # Gene positions to compare family in_context_left, in_context_right = (False, False) while pos_left < gene.position and not in_context_left: if contig[pos_left].family in families.values(): @@ -247,7 +248,8 @@ def fam2seq(seq2pan): """ Create a dictionary with gene families as keys and list of sequences id as values - :param seq2pan: Dictionary storing the sequence ids as keys and the gene families to which they are assigned as values + :param seq2pan: Dictionary storing the sequence ids as keys and the gene families + to which they are assigned as values :param seq2pan: dict :return: Dictionary reversed @@ -262,24 +264,24 @@ def fam2seq(seq2pan): return fam_2_seq -def export_to_dataframe(families, geneContexts, fam_2_seq, output): +def export_to_dataframe(families, gene_contexts, fam_2_seq, output): """ Export the results into dataFrame :param families: Families related to the connected components :type families: set - :param geneContexts: connected components found in the pangenome - :type geneContexts: set + :param gene_contexts: connected components found in the pangenome + :type gene_contexts: set :param fam_2_seq: Dictionary with gene families as keys and list of sequence ids as values :type fam_2_seq: dict :param output: output path :type output: str """ - logging.getLogger().debug(f"There are {len(families)} families among {len(geneContexts)} gene contexts") + logging.getLogger().debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts") lines = [] - for geneContext in geneContexts: - for family in geneContext.families: - line = [geneContext.ID] + for gene_context in gene_contexts: + for family in gene_context.families: + line = [gene_context.ID] if fam_2_seq is None or fam_2_seq.get(family.ID) is None: line += [family.name, None, len(family.organisms)] else: @@ -298,10 +300,12 @@ def launch(args): mkOutdir(args.output, args.force) pangenome = Pangenome() pangenome.addFile(args.pangenome) - search_geneContext_in_pangenome(pangenome=pangenome, sequences=args.sequences, families=args.family, output=args.output, - identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, - transitive=args.transitive, cpu=args.cpu, tmpdir=args.tmpdir, no_defrag=args.no_defrag, - disable_bar=args.disable_prog_bar) + search_geneContext_in_pangenome(pangenome=pangenome, sequences=args.sequences, families=args.family, + output=args.output, + identity=args.identity, coverage=args.coverage, jaccard=args.jaccard, + transitive=args.transitive, cpu=args.cpu, tmpdir=args.tmpdir, + no_defrag=args.no_defrag, + disable_bar=args.disable_prog_bar) def contextSubparser(sub_parser):