temporary fix to remove dependency for mwTools

See #5
CRG-CNAG · Jul 30, 2019 · 731d022 · 731d022
1 parent 541bd66
commit 731d022
Show file tree

Hide file tree

Showing 2 changed files with 222 additions and 7 deletions.
diff --git a/src/index_genome_files_bowtie2.py b/src/index_genome_files_bowtie2.py
@@ -11,11 +11,204 @@
 import Bio
 from Bio import SeqIO
 
-from mwTools.general import glob_file_list
-from mwTools.bio import convert_genbank_to_annotation_df
-from mwTools.bio import convert_annotation_df_to_bed
-from mwTools.id import extract_fasta_id
-from mwTools.pandas import sort_df
+# from mwTools.general import glob_file_list
+# from mwTools.bio import convert_genbank_to_annotation_df
+# from mwTools.bio import convert_annotation_df_to_bed
+# from mwTools.id import extract_fasta_id
+# from mwTools.pandas import sort_df
+import random
+from glob import glob
+
+
+
+def glob_file_list(fileList):
+    "Apply glob on the list of file pattern, filter for files only, remove duplicated files."
+    if fileList is None or fileList == '':
+        fileList = None
+    else:
+        fileList = [fn for pattern in fileList for fn in glob(pattern) if Path(fn).is_file()]
+        # Remove duplicated files from list (can happen if multiple patterns match same files)
+        fileList = sorted(list(set(fileList)))
+    return fileList
+
+
+def convert_genbank_to_annotation_df(genomeBio, extractDNASeq=False, extractProteinSeq=False, verbose=1):
+
+    species_name = genomeBio.annotations.get('organism')
+    species_name = species_name if species_name is not None else ''
+
+    def get_proteinGI(feature):
+        proteinGI = feature.qualifiers.get('db_xref')
+        proteinGI = int(re.sub(r'GI:', '', proteinGI[0])) if proteinGI is not None else None
+        return proteinGI
+
+    def get_attribute(feature, attName):
+        if attName == 'proteinGI':
+            att = get_proteinGI(feature)
+        else:
+            att = feature.qualifiers.get(attName)
+            att = att[0] if att is not None else None
+        return att
+
+#     def get_attribute_dict(feature):
+#         # Extract the first value of each qualifier
+#         return {q:v[0] for q, v in feature.qualifiers.items()}
+
+    def get_attribute_dict(feature):
+        return {att:get_attribute(feature, att) for att in feature.qualifiers.keys()}
+
+    hasWellDefinedGenomeSeq = \
+        not (
+            (genomeBio.seq.count('N') / len(genomeBio.seq)) > 0.5 or
+            genomeBio.seq == ''
+        )
+
+    geneIdentifierPriorityList = ['locus_tag', 'gene', 'label', 'protein_id']
+
+    CDSList = []
+    if verbose >= 1: print("len(genomeBio.features):", len(genomeBio.features))
+    for feature in genomeBio.features:
+        attDict = get_attribute_dict(feature)
+        location = convert_location_bio_to_dict(feature)
+        locationBio = feature.location
+
+        # Use the first of the attributes of the priority list found in the feature as main identifier
+        featId = None
+        for identifier in geneIdentifierPriorityList:
+            if attDict.get(identifier) is not None:
+                featId = attDict[identifier]
+                break
+
+        CDSDict = {'chromosome':genomeBio.id,
+                   'id':featId,
+                   'feature':feature.type,
+                   'strand':location['strand'],
+                   'start':location['start'],
+                   'end':location['end'],
+                   'locus_tag':attDict.get('locus_tag'),
+                   'gene':attDict.get('gene'),
+                   'protein_id':attDict.get('protein_id')
+                   }
+        if extractDNASeq and hasWellDefinedGenomeSeq:
+            dnaSeqBio = SeqFeature(location=locationBio).extract(genomeBio)
+            dnaSeq = str(dnaSeqBio.seq)
+            CDSDict['DNA_seq'] = dnaSeq
+        else:
+            dnaSeqBio = None
+            dnaSeq = None
+
+        if extractProteinSeq and feature.type == 'CDS' and dnaSeqBio is not None:
+            codonTableId = attDict.get('transl_table')
+            codonStartPos = attDict.get('codon_start')    # in 1-based index
+            codonStartPos = int(codonStartPos) if codonStartPos is not None else 1
+            if codonTableId is not None:
+                try:
+                    proteinSeq = str(dnaSeqBio.seq[3*(codonStartPos - 1):].translate(table=codonTableId, cds=True))
+                except TranslationError:
+                    try:
+                        proteinSeq = str(dnaSeqBio.seq[3*(codonStartPos - 1):].translate(table=codonTableId, cds=False))
+                    except TranslationError:
+                        proteinSeq = None
+
+            CDSDict['protein_seq'] = proteinSeq
+
+        CDSList.append(CDSDict)
+        if verbose >= 2: print("\n\n")
+
+    CDSDf = pd.DataFrame(CDSList)
+    CDSDf = CDSDf.sort_values(by=['start', 'end', 'strand'])
+
+    return CDSDf
+
+
+def convert_annotation_df_to_bed(annotDf, chromosomeCol='chromosome', startCol='start', endCol='end',
+                                 strandCol='strand', idCol='id', featureCol='feature', combineFeatureAndId=False,
+                                 sort=True, sortBy=None, sortAscending=None
+                                 ):
+    """We assume that annotation features dataframe uses 0-based start-inclusive end-exclusive
+    counting with start <= end."""
+
+    df = annotDf
+    if type(df) is pd.Series:
+        df = df.to_frame().T
+    sortColList = [startCol, endCol, strandCol]
+    sortAscendingList = [True, True, True]
+    if sort:
+        if sortBy is not None:
+            if 'chromosome' in sortBy:
+                raise ValueError("chromosome is included by default and has to be sorted.")
+            sortColList = sortBy + sortColList
+            sortAscendingList = sortAscending + sortAscendingList
+        df = df.sort_values(by=sortColList, ascending=sortAscendingList)
+        df = sort_df(df, chromosomeCol, key=lambda x: (x.upper(), x[0].islower()), reverse=False)
+    print(df)
+    df.index.name = idCol
+
+    if idCol not in df.columns:
+        df = df.reset_index()
+
+    bed_string = ""
+    for index, annot in df.iterrows():
+        # BED format uses 0-based start-inclusive end-exclusive counting (as in Python)
+        # BED format start < end
+        chromosomeName = annot[chromosomeCol]
+        if combineFeatureAndId:
+            bed_id = "{};{}".format(annot[featureCol], annot[idCol])
+        else:
+            bed_id = annot[idCol]
+        bed_start = int(annot[startCol])
+        bed_end = int(annot[endCol])
+        bed_strand = annot[strandCol]
+        if bed_start > bed_end:
+            bed_start, bed_end = bed_end, bed_start
+        bed_string += "{}\t{}\t{}\t{}\t{:d}\t{}\n".format(chromosomeName, str(bed_start), str(bed_end),
+                                                          bed_id, 0, bed_strand)
+    return bed_string
+
+
+def extract_fasta_id(seqIdString):
+    """
+    Extract the id in the fasta description string, taking only the characters till the '|'.
+    """
+    seqId = None
+    regex1 = re.compile(r'^\s*?([^|]+?)(\s|\||$)')
+    seqIdSearch = re.search(regex1, seqIdString)
+    if seqIdSearch:
+        seqId = seqIdSearch.group(1)
+    return seqId
+
+
+def sort_df_by_value_list(df, col, valueList):
+    sorterIndex = dict(zip(valueList, range(len(valueList))))
+    ranInt = random.random_integers(1e6)
+    rank = df[col].map(sorterIndex)
+    rank.name = 'rank{:d}'.format(ranInt)
+    return df.join(rank).sort_values(rank.name).drop(rank.name, axis=1)
+
+
+def sort_df(df, col, key=None, reverse=False):
+    """
+    Take into account duplicated entries by using a rank column and applying the sort_df_by_value_list
+    method.
+
+    Example, sorting by alphabetical order grouping together capital and small case, with capital case first:
+    sort_df(df, 'col', key=lambda x: (x.upper(), x[0].islower()))
+    """
+
+    def sorter(key=None, reverse=False):
+        """
+        Using python built-in sorted function.
+        """
+        def sorter_(series):
+            series_list = list(series)
+            return [series_list.index(i)
+                    for i in sorted(series_list, key=key, reverse=reverse)]
+        return sorter_
+
+    df2 = df.reset_index(drop=True)
+    sortedIndex = sorter(key=key, reverse=reverse)(df2[col])
+    sortedValuesUnique = list(OrderedDict.fromkeys(np.array([df2[col].iloc[i] for i in sortedIndex])))
+    return sort_df_by_value_list(df2, col=col, valueList=sortedValuesUnique)
 
 
 

diff --git a/src/pipeline_roesti.py b/src/pipeline_roesti.py
@@ -25,8 +25,30 @@
 from send_socket_message import send_socket_message
 
 from index_genome_files_bowtie2 import index_genome_files_bowtie2
-from mwTools.general import glob_file_list
-from mwTools.general import open_by_suffix
+# from mwTools.general import glob_file_list
+# from mwTools.general import open_by_suffix
+
+
+def glob_file_list(fileList):
+    "Apply glob on the list of file pattern, filter for files only, remove duplicated files."
+    if fileList is None or fileList == '':
+        fileList = None
+    else:
+        fileList = [fn for pattern in fileList for fn in glob(pattern) if Path(fn).is_file()]
+        # Remove duplicated files from list (can happen if multiple patterns match same files)
+        fileList = sorted(list(set(fileList)))
+    return fileList
+
+
+def open_by_suffix(filename):
+    """Detect compressed file by filename suffix, and return gzip, bz2, or normal file handles."""
+    if Path(filename).suffix == '.gz':
+        return gzip.open(filename, 'rt')
+    elif Path(filename).suffix == '.bz2':
+        return bz2.BZ2file(filename, 'rt')
+    else:
+        return open(filename, 'r')
+
 
 # --------------------------------------------------