Update github workflow to test new version and little reformat of

script
labgem · Nov 29, 2021 · 414fd26 · 414fd26
1 parent 6f52583
commit 414fd26
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 34 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -93,12 +93,11 @@ jobs:
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --proteins some_chlam_proteins.fasta --output test_align --draw_related --getinfo
-        ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --anno NC_010287.gbff --output test_align -f
+        ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_align --draw_related --getinfo
         cd -
     - name: testing context command
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin context --pangenome myannopang/pangenome.h5 --proteins some_chlam_proteins.fasta --output test_context
+        ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context
         ppanggolin context --pangenome readclusterpang/pangenome.h5 --family some_chlam_families.txt --output test_context -f
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.2.45
+1.2.46
diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
@@ -9,11 +9,9 @@
 from collections import defaultdict
 
 # local libraries
-from ppanggolin.formats import checkPangenomeInfo, writeGeneSequencesFromAnnotations
+from ppanggolin.formats import checkPangenomeInfo
 from ppanggolin.utils import mkOutdir, read_compressed_or_not
 from ppanggolin.pangenome import Pangenome
-from ppanggolin.annotate import detect_filetype, read_org_gff, read_org_gbff
-from ppanggolin.RGP.genomicIsland import compute_org_rgp
 from ppanggolin.figures.draw_spot import drawSelectedSpots, subgraph
 
 
@@ -30,13 +28,12 @@ def createdb(fileObj, tmpdir):
     :rtype: _io.TextIOWrapper
     """
     seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name)
-    cmd = ["mmseqs", "createdb", fileObj.name, seqdb.name, '--dbtype','0']
+    cmd = ["mmseqs", "createdb", fileObj.name, seqdb.name, '--dbtype', '0']
     subprocess.run(cmd, stdout=subprocess.DEVNULL)
     return seqdb
 
 
-def alignSeqToPang(pangFile, seqFile, output, tmpdir, cpu=1, no_defrag=False, identity=0.8, coverage=0.8,
-                   code=11):
+def alignSeqToPang(pangFile, seqFile, output, tmpdir, cpu=1, no_defrag=False, identity=0.8, coverage=0.8):
     pang_db = createdb(pangFile, tmpdir)
     seq_db = createdb(seqFile, tmpdir)
     cov_mode = "0"  # coverage of query and target
@@ -140,7 +137,7 @@ def draw_spot_gexf(spots, output, multigenics, fam2mod, set_size=3):
         subgraph(spot, fname, set_size=set_size, multigenics=multigenics, fam2mod=fam2mod)
 
 
-def getSeqInfo(seq2pang, pangenome, output, cpu, draw_related, disable_bar=False):
+def getSeqInfo(seq2pang, pangenome, output, draw_related, disable_bar=False):
     logging.getLogger().info("Writing RGP and spot information related to hits in the pangenome")
     multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"])
 
@@ -191,7 +188,7 @@ def get_seq2pang(pangenome, sequenceFile, output, tmpdir, cpu=1, no_defrag=False
     :param output: Output directory
     :type output: str
     :param tmpdir: Temporary directory
-    :type tmpdir: str
+    :type tmpdir: tempfile.TemporaryDirectory
     :param cpu: number of CPU cores to use 
     :type cpu: int
     :param no_defrag: do not use the defrag workflow if true
@@ -220,7 +217,7 @@ def get_seq2pang(pangenome, sequenceFile, output, tmpdir, cpu=1, no_defrag=False
 
 
 def align(pangenome, sequenceFile, output, tmpdir, identity=0.8, coverage=0.8, no_defrag=False, cpu=1, getinfo=False,
-          draw_related=False, priority='name,ID', disable_bar=False):
+          draw_related=False, disable_bar=False):
     if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]:
         raise Exception("Cannot use this function as your pangenome does not have gene families representatives "
                         "associated to it. For now this works only if the clustering is realised by PPanGGOLiN.")
@@ -240,10 +237,10 @@ def align(pangenome, sequenceFile, output, tmpdir, identity=0.8, coverage=0.8, n
     new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
 
     seqSet, alignFile, seq2pang = get_seq2pang(pangenome, sequenceFile, output, new_tmpdir,
-                                                  cpu, no_defrag, identity, coverage)
+                                               cpu, no_defrag, identity, coverage)
 
     if getinfo or draw_related:
-        getSeqInfo(seq2pang, pangenome, output, cpu, draw_related, disable_bar=disable_bar)
+        getSeqInfo(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar)
     else:
         partProj = projectPartition(seq2pang, seqSet, output)  # write the partition assignation only
         logging.getLogger().info(f"sequences partition projection : '{partProj}'")
@@ -264,14 +261,15 @@ def launch(args):
     if args.sequences is not None:
         align(pangenome=pangenome, sequenceFile=args.sequences, output=args.output, tmpdir=args.tmpdir, cpu=args.cpu,
               identity=args.identity, coverage=args.coverage, no_defrag=args.no_defrag, getinfo=args.getinfo,
-              draw_related=args.draw_related, priority=args.label_priority, disable_bar=args.disable_prog_bar)
+              draw_related=args.draw_related, disable_bar=args.disable_prog_bar)
+
 
 def alignSubparser(subparser):
     parser = subparser.add_parser("align", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     required = parser.add_argument_group(title="Required arguments",
                                          description="All of the following arguments are required :")
-    required.add_argument('-S','--sequences', required=True, type=str,
-                        help="sequences (nucleotides or amino acids) to align on the pangenome gene families")
+    required.add_argument('-S', '--sequences', required=True, type=str,
+                          help="sequences (nucleotides or amino acids) to align on the pangenome gene families")
 
     required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file")
     required.add_argument('-o', '--output', required=True, type=str,

diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py
@@ -34,6 +34,7 @@ class geneContext:
         Methods
         -------
         """
+
     def __init__(self, ID, families=None):
         """ Initial methods
 
@@ -61,8 +62,8 @@ def add_family(self, family):
         self.families.add(family)
 
 
-def search_geneContext_in_pangenome(pangenome, output, tmpdir, sequences=None, families=None, transitive=4, identity=0.5,
-                           coverage=0.8, jaccard=0.85, no_defrag=False, cpu=1, disable_bar=True):
+def search_geneContext_in_pangenome(pangenome, output, tmpdir, sequences=None, families=None, transitive=4,
+                                    identity=0.5, coverage=0.8, jaccard=0.85, no_defrag=False, cpu=1, disable_bar=True):
     """
     Main function to search common gene contexts between sequence set and pangenome families
 
@@ -104,7 +105,7 @@ def search_geneContext_in_pangenome(pangenome, output, tmpdir, sequences=None, f
         # Alignment of sequences on pangenome families
         new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
         seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, new_tmpdir, cpu, no_defrag,
-                                              identity, coverage)
+                                           identity, coverage)
         projectPartition(seq2pan, seq_set, output)
         new_tmpdir.cleanup()
         for k, v in seq2pan.items():
@@ -206,7 +207,7 @@ def extract_gene_context(gene, contig, families, t=4):
     :rtype: (int, Bool, int, Bool)
     """
     pos_left, pos_right = (max(0, gene.position - t),
-                           min(gene.position + t, len(contig)-1))  # Gene positions to compare family
+                           min(gene.position + t, len(contig) - 1))  # Gene positions to compare family
     in_context_left, in_context_right = (False, False)
     while pos_left < gene.position and not in_context_left:
         if contig[pos_left].family in families.values():
@@ -247,7 +248,8 @@ def fam2seq(seq2pan):
     """
     Create a dictionary with gene families as keys and list of sequences id as values
 
-    :param seq2pan: Dictionary storing the sequence ids as keys and the gene families to which they are assigned as values
+    :param seq2pan: Dictionary storing the sequence ids as keys and the gene families
+                    to which they are assigned as values
     :param seq2pan: dict
 
     :return: Dictionary reversed
@@ -262,24 +264,24 @@ def fam2seq(seq2pan):
     return fam_2_seq
 
 
-def export_to_dataframe(families, geneContexts, fam_2_seq, output):
+def export_to_dataframe(families, gene_contexts, fam_2_seq, output):
     """ Export the results into dataFrame
 
     :param families: Families related to the connected components
     :type families: set
-    :param geneContexts: connected components found in the pangenome
-    :type geneContexts: set
+    :param gene_contexts: connected components found in the pangenome
+    :type gene_contexts: set
     :param fam_2_seq: Dictionary with gene families as keys and list of sequence ids as values
     :type fam_2_seq: dict
     :param output: output path
     :type output: str
     """
-    logging.getLogger().debug(f"There are {len(families)} families among {len(geneContexts)} gene contexts")
+    logging.getLogger().debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts")
 
     lines = []
-    for geneContext in geneContexts:
-        for family in geneContext.families:
-            line = [geneContext.ID]
+    for gene_context in gene_contexts:
+        for family in gene_context.families:
+            line = [gene_context.ID]
             if fam_2_seq is None or fam_2_seq.get(family.ID) is None:
                 line += [family.name, None, len(family.organisms)]
             else:
@@ -298,10 +300,12 @@ def launch(args):
     mkOutdir(args.output, args.force)
     pangenome = Pangenome()
     pangenome.addFile(args.pangenome)
-    search_geneContext_in_pangenome(pangenome=pangenome, sequences=args.sequences, families=args.family, output=args.output,
-                           identity=args.identity, coverage=args.coverage, jaccard=args.jaccard,
-                           transitive=args.transitive, cpu=args.cpu, tmpdir=args.tmpdir, no_defrag=args.no_defrag,
-                           disable_bar=args.disable_prog_bar)
+    search_geneContext_in_pangenome(pangenome=pangenome, sequences=args.sequences, families=args.family,
+                                    output=args.output,
+                                    identity=args.identity, coverage=args.coverage, jaccard=args.jaccard,
+                                    transitive=args.transitive, cpu=args.cpu, tmpdir=args.tmpdir,
+                                    no_defrag=args.no_defrag,
+                                    disable_bar=args.disable_prog_bar)
 
 
 def contextSubparser(sub_parser):