Skip to content

Commit

Permalink
temporary fix to remove dependency for mwTools
Browse files Browse the repository at this point in the history
See #5
  • Loading branch information
webermarcolivier committed Jul 30, 2019
1 parent 541bd66 commit 731d022
Show file tree
Hide file tree
Showing 2 changed files with 222 additions and 7 deletions.
203 changes: 198 additions & 5 deletions src/index_genome_files_bowtie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,204 @@
import Bio
from Bio import SeqIO

from mwTools.general import glob_file_list
from mwTools.bio import convert_genbank_to_annotation_df
from mwTools.bio import convert_annotation_df_to_bed
from mwTools.id import extract_fasta_id
from mwTools.pandas import sort_df
# from mwTools.general import glob_file_list
# from mwTools.bio import convert_genbank_to_annotation_df
# from mwTools.bio import convert_annotation_df_to_bed
# from mwTools.id import extract_fasta_id
# from mwTools.pandas import sort_df
import random
from glob import glob



def glob_file_list(fileList):
"Apply glob on the list of file pattern, filter for files only, remove duplicated files."
if fileList is None or fileList == '':
fileList = None
else:
fileList = [fn for pattern in fileList for fn in glob(pattern) if Path(fn).is_file()]
# Remove duplicated files from list (can happen if multiple patterns match same files)
fileList = sorted(list(set(fileList)))
return fileList


def convert_genbank_to_annotation_df(genomeBio, extractDNASeq=False, extractProteinSeq=False, verbose=1):

species_name = genomeBio.annotations.get('organism')
species_name = species_name if species_name is not None else ''

def get_proteinGI(feature):
proteinGI = feature.qualifiers.get('db_xref')
proteinGI = int(re.sub(r'GI:', '', proteinGI[0])) if proteinGI is not None else None
return proteinGI

def get_attribute(feature, attName):
if attName == 'proteinGI':
att = get_proteinGI(feature)
else:
att = feature.qualifiers.get(attName)
att = att[0] if att is not None else None
return att

# def get_attribute_dict(feature):
# # Extract the first value of each qualifier
# return {q:v[0] for q, v in feature.qualifiers.items()}

def get_attribute_dict(feature):
return {att:get_attribute(feature, att) for att in feature.qualifiers.keys()}

hasWellDefinedGenomeSeq = \
not (
(genomeBio.seq.count('N') / len(genomeBio.seq)) > 0.5 or
genomeBio.seq == ''
)

geneIdentifierPriorityList = ['locus_tag', 'gene', 'label', 'protein_id']

CDSList = []
if verbose >= 1: print("len(genomeBio.features):", len(genomeBio.features))
for feature in genomeBio.features:
attDict = get_attribute_dict(feature)
location = convert_location_bio_to_dict(feature)
locationBio = feature.location

# Use the first of the attributes of the priority list found in the feature as main identifier
featId = None
for identifier in geneIdentifierPriorityList:
if attDict.get(identifier) is not None:
featId = attDict[identifier]
break

CDSDict = {'chromosome':genomeBio.id,
'id':featId,
'feature':feature.type,
'strand':location['strand'],
'start':location['start'],
'end':location['end'],
'locus_tag':attDict.get('locus_tag'),
'gene':attDict.get('gene'),
'protein_id':attDict.get('protein_id')
}
if extractDNASeq and hasWellDefinedGenomeSeq:
dnaSeqBio = SeqFeature(location=locationBio).extract(genomeBio)
dnaSeq = str(dnaSeqBio.seq)
CDSDict['DNA_seq'] = dnaSeq
else:
dnaSeqBio = None
dnaSeq = None

if extractProteinSeq and feature.type == 'CDS' and dnaSeqBio is not None:
codonTableId = attDict.get('transl_table')
codonStartPos = attDict.get('codon_start') # in 1-based index
codonStartPos = int(codonStartPos) if codonStartPos is not None else 1
if codonTableId is not None:
try:
proteinSeq = str(dnaSeqBio.seq[3*(codonStartPos - 1):].translate(table=codonTableId, cds=True))
except TranslationError:
try:
proteinSeq = str(dnaSeqBio.seq[3*(codonStartPos - 1):].translate(table=codonTableId, cds=False))
except TranslationError:
proteinSeq = None

CDSDict['protein_seq'] = proteinSeq

CDSList.append(CDSDict)
if verbose >= 2: print("\n\n")

CDSDf = pd.DataFrame(CDSList)
CDSDf = CDSDf.sort_values(by=['start', 'end', 'strand'])

return CDSDf


def convert_annotation_df_to_bed(annotDf, chromosomeCol='chromosome', startCol='start', endCol='end',
strandCol='strand', idCol='id', featureCol='feature', combineFeatureAndId=False,
sort=True, sortBy=None, sortAscending=None
):
"""We assume that annotation features dataframe uses 0-based start-inclusive end-exclusive
counting with start <= end."""

df = annotDf
if type(df) is pd.Series:
df = df.to_frame().T
sortColList = [startCol, endCol, strandCol]
sortAscendingList = [True, True, True]
if sort:
if sortBy is not None:
if 'chromosome' in sortBy:
raise ValueError("chromosome is included by default and has to be sorted.")
sortColList = sortBy + sortColList
sortAscendingList = sortAscending + sortAscendingList
df = df.sort_values(by=sortColList, ascending=sortAscendingList)
df = sort_df(df, chromosomeCol, key=lambda x: (x.upper(), x[0].islower()), reverse=False)
print(df)
df.index.name = idCol

if idCol not in df.columns:
df = df.reset_index()

bed_string = ""
for index, annot in df.iterrows():
# BED format uses 0-based start-inclusive end-exclusive counting (as in Python)
# BED format start < end
chromosomeName = annot[chromosomeCol]
if combineFeatureAndId:
bed_id = "{};{}".format(annot[featureCol], annot[idCol])
else:
bed_id = annot[idCol]
bed_start = int(annot[startCol])
bed_end = int(annot[endCol])
bed_strand = annot[strandCol]
if bed_start > bed_end:
bed_start, bed_end = bed_end, bed_start
bed_string += "{}\t{}\t{}\t{}\t{:d}\t{}\n".format(chromosomeName, str(bed_start), str(bed_end),
bed_id, 0, bed_strand)
return bed_string


def extract_fasta_id(seqIdString):
"""
Extract the id in the fasta description string, taking only the characters till the '|'.
"""
seqId = None
regex1 = re.compile(r'^\s*?([^|]+?)(\s|\||$)')
seqIdSearch = re.search(regex1, seqIdString)
if seqIdSearch:
seqId = seqIdSearch.group(1)
return seqId


def sort_df_by_value_list(df, col, valueList):
sorterIndex = dict(zip(valueList, range(len(valueList))))
ranInt = random.random_integers(1e6)
rank = df[col].map(sorterIndex)
rank.name = 'rank{:d}'.format(ranInt)
return df.join(rank).sort_values(rank.name).drop(rank.name, axis=1)


def sort_df(df, col, key=None, reverse=False):
"""
Take into account duplicated entries by using a rank column and applying the sort_df_by_value_list
method.
Example, sorting by alphabetical order grouping together capital and small case, with capital case first:
sort_df(df, 'col', key=lambda x: (x.upper(), x[0].islower()))
"""

def sorter(key=None, reverse=False):
"""
Using python built-in sorted function.
"""
def sorter_(series):
series_list = list(series)
return [series_list.index(i)
for i in sorted(series_list, key=key, reverse=reverse)]
return sorter_

df2 = df.reset_index(drop=True)
sortedIndex = sorter(key=key, reverse=reverse)(df2[col])
sortedValuesUnique = list(OrderedDict.fromkeys(np.array([df2[col].iloc[i] for i in sortedIndex])))
return sort_df_by_value_list(df2, col=col, valueList=sortedValuesUnique)



Expand Down
26 changes: 24 additions & 2 deletions src/pipeline_roesti.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,30 @@
from send_socket_message import send_socket_message

from index_genome_files_bowtie2 import index_genome_files_bowtie2
from mwTools.general import glob_file_list
from mwTools.general import open_by_suffix
# from mwTools.general import glob_file_list
# from mwTools.general import open_by_suffix


def glob_file_list(fileList):
"Apply glob on the list of file pattern, filter for files only, remove duplicated files."
if fileList is None or fileList == '':
fileList = None
else:
fileList = [fn for pattern in fileList for fn in glob(pattern) if Path(fn).is_file()]
# Remove duplicated files from list (can happen if multiple patterns match same files)
fileList = sorted(list(set(fileList)))
return fileList


def open_by_suffix(filename):
"""Detect compressed file by filename suffix, and return gzip, bz2, or normal file handles."""
if Path(filename).suffix == '.gz':
return gzip.open(filename, 'rt')
elif Path(filename).suffix == '.bz2':
return bz2.BZ2file(filename, 'rt')
else:
return open(filename, 'r')


# --------------------------------------------------

Expand Down

0 comments on commit 731d022

Please sign in to comment.