From 731d022f93e1c07ae0ac7fe44749a9fb222ac676 Mon Sep 17 00:00:00 2001
From: Marc Weber
Date: Tue, 30 Jul 2019 12:05:23 +0200
Subject: [PATCH] temporary fix to remove dependency for mwTools

See #5
---
 src/index_genome_files_bowtie2.py | 203 +++++++++++++++++++++++++++++-
 src/pipeline_roesti.py            |  26 +++-
 2 files changed, 222 insertions(+), 7 deletions(-)

diff --git a/src/index_genome_files_bowtie2.py b/src/index_genome_files_bowtie2.py
index cc6f80d..0706f6f 100755
--- a/src/index_genome_files_bowtie2.py
+++ b/src/index_genome_files_bowtie2.py
@@ -11,11 +11,204 @@
 import Bio
 from Bio import SeqIO
 
-from mwTools.general import glob_file_list
-from mwTools.bio import convert_genbank_to_annotation_df
-from mwTools.bio import convert_annotation_df_to_bed
-from mwTools.id import extract_fasta_id
-from mwTools.pandas import sort_df
+# from mwTools.general import glob_file_list
+# from mwTools.bio import convert_genbank_to_annotation_df
+# from mwTools.bio import convert_annotation_df_to_bed
+# from mwTools.id import extract_fasta_id
+# from mwTools.pandas import sort_df
+import random
+from glob import glob
+
+# NOTE: the functions below are copied from mwTools as a temporary fix (see #5). They assume
+# that this module already imports re, pandas as pd, numpy as np, pathlib.Path,
+# collections.OrderedDict, Bio.SeqFeature.SeqFeature and Bio.Data.CodonTable.TranslationError,
+# and that convert_location_bio_to_dict is available; otherwise those imports and definitions
+# need to be added here as well.
+
+
+def glob_file_list(fileList):
+    """Apply glob on the list of file patterns, keep only files, and remove duplicated files."""
+    if fileList is None or fileList == '':
+        fileList = None
+    else:
+        fileList = [fn for pattern in fileList for fn in glob(pattern) if Path(fn).is_file()]
+        # Remove duplicated files from the list (can happen if multiple patterns match the same files)
+        fileList = sorted(list(set(fileList)))
+    return fileList
+
+
+def convert_genbank_to_annotation_df(genomeBio, extractDNASeq=False, extractProteinSeq=False, verbose=1):
+
+    species_name = genomeBio.annotations.get('organism')
+    species_name = species_name if species_name is not None else ''
+
+    def get_proteinGI(feature):
+        proteinGI = feature.qualifiers.get('db_xref')
+        proteinGI = int(re.sub(r'GI:', '', proteinGI[0])) if proteinGI is not None else None
+        return proteinGI
+
+    def get_attribute(feature, attName):
+        if attName == 'proteinGI':
+            att = get_proteinGI(feature)
+        else:
+            att = feature.qualifiers.get(attName)
+            att = att[0] if att is not None else None
+        return att
+
+# def get_attribute_dict(feature):
+#     # Extract the first value of each qualifier
+#     return {q: v[0] for q, v in feature.qualifiers.items()}
+
+    def get_attribute_dict(feature):
+        # Extract the first value of each qualifier
+        return {att: get_attribute(feature, att) for att in feature.qualifiers.keys()}
+
+    # Consider the genome sequence well defined if it is non-empty and not mostly undefined (N) bases
+    hasWellDefinedGenomeSeq = \
+        not (
+            len(genomeBio.seq) == 0 or
+            (genomeBio.seq.count('N') / len(genomeBio.seq)) > 0.5
+        )
+
+    geneIdentifierPriorityList = ['locus_tag', 'gene', 'label', 'protein_id']
+
+    CDSList = []
+    if verbose >= 1: print("len(genomeBio.features):", len(genomeBio.features))
+    for feature in genomeBio.features:
+        attDict = get_attribute_dict(feature)
+        location = convert_location_bio_to_dict(feature)
+        locationBio = feature.location
+
+        # Use the first attribute of the priority list found in the feature as the main identifier
+        featId = None
+        for identifier in geneIdentifierPriorityList:
+            if attDict.get(identifier) is not None:
+                featId = attDict[identifier]
+                break
+
+        CDSDict = {'chromosome': genomeBio.id,
+                   'id': featId,
+                   'feature': feature.type,
+                   'strand': location['strand'],
+                   'start': location['start'],
+                   'end': location['end'],
+                   'locus_tag': attDict.get('locus_tag'),
+                   'gene': attDict.get('gene'),
+                   'protein_id': attDict.get('protein_id')
+                   }
+        if extractDNASeq and hasWellDefinedGenomeSeq:
+            dnaSeqBio = SeqFeature(location=locationBio).extract(genomeBio)
+            dnaSeq = str(dnaSeqBio.seq)
+            CDSDict['DNA_seq'] = dnaSeq
+        else:
+            dnaSeqBio = None
+            dnaSeq = None
+
+        if extractProteinSeq and feature.type == 'CDS' and dnaSeqBio is not None:
+            codonTableId = attDict.get('transl_table')
+            codonStartPos = attDict.get('codon_start')  # in 1-based index
+            codonStartPos = int(codonStartPos) if codonStartPos is not None else 1
+            proteinSeq = None  # default when the feature has no translation table annotation
+            if codonTableId is not None:
+                try:
+                    proteinSeq = str(dnaSeqBio.seq[3*(codonStartPos - 1):].translate(table=codonTableId, cds=True))
+                except TranslationError:
+                    try:
+                        proteinSeq = str(dnaSeqBio.seq[3*(codonStartPos - 1):].translate(table=codonTableId, cds=False))
+                    except TranslationError:
+                        proteinSeq = None
+
+            CDSDict['protein_seq'] = proteinSeq
+
+        CDSList.append(CDSDict)
+        if verbose >= 2: print("\n\n")
+
+    CDSDf = pd.DataFrame(CDSList)
+    CDSDf = CDSDf.sort_values(by=['start', 'end', 'strand'])
+
+    return CDSDf
+
+
+def convert_annotation_df_to_bed(annotDf, chromosomeCol='chromosome', startCol='start', endCol='end',
+                                 strandCol='strand', idCol='id', featureCol='feature', combineFeatureAndId=False,
+                                 sort=True, sortBy=None, sortAscending=None
+                                 ):
+    """We assume that the annotation features dataframe uses 0-based, start-inclusive,
+    end-exclusive counting with start <= end."""
+
+    df = annotDf
+    if type(df) is pd.Series:
+        df = df.to_frame().T
+    sortColList = [startCol, endCol, strandCol]
+    sortAscendingList = [True, True, True]
+    if sort:
+        if sortBy is not None:
+            if 'chromosome' in sortBy:
+                raise ValueError("the chromosome column is sorted by default and must not be included in sortBy.")
+            sortColList = sortBy + sortColList
+            sortAscendingList = sortAscending + sortAscendingList
+        df = df.sort_values(by=sortColList, ascending=sortAscendingList)
+        # Sort chromosomes alphabetically, grouping capital and lower case together, capital case first
+        df = sort_df(df, chromosomeCol, key=lambda x: (x.upper(), x[0].islower()), reverse=False)
+        print(df)
+    df.index.name = idCol
+
+    if idCol not in df.columns:
+        df = df.reset_index()
+
+    bed_string = ""
+    for index, annot in df.iterrows():
+        # BED format uses 0-based start-inclusive end-exclusive counting (as in Python)
+        # and requires start < end
+        chromosomeName = annot[chromosomeCol]
+        if combineFeatureAndId:
+            bed_id = "{};{}".format(annot[featureCol], annot[idCol])
+        else:
+            bed_id = annot[idCol]
+        bed_start = int(annot[startCol])
+        bed_end = int(annot[endCol])
+        bed_strand = annot[strandCol]
+        if bed_start > bed_end:
+            bed_start, bed_end = bed_end, bed_start
+        bed_string += "{}\t{}\t{}\t{}\t{:d}\t{}\n".format(chromosomeName, str(bed_start), str(bed_end),
+                                                          bed_id, 0, bed_strand)
+    return bed_string
+
+
+def extract_fasta_id(seqIdString):
+    """
+    Extract the id from a fasta description string, taking only the characters up to the first '|'
+    or whitespace.
+    """
+    seqId = None
+    regex1 = re.compile(r'^\s*?([^|]+?)(\s|\||$)')
+    seqIdSearch = re.search(regex1, seqIdString)
+    if seqIdSearch:
+        seqId = seqIdSearch.group(1)
+    return seqId
+
+
+def sort_df_by_value_list(df, col, valueList):
+    """Sort the dataframe so that the values of `col` follow the order given in `valueList`."""
+    sorterIndex = dict(zip(valueList, range(len(valueList))))
+    # Random suffix for the temporary rank column name, to avoid clashing with existing columns
+    ranInt = random.randint(1, 10**6)
+    rank = df[col].map(sorterIndex)
+    rank.name = 'rank{:d}'.format(ranInt)
+    return df.join(rank).sort_values(rank.name).drop(rank.name, axis=1)
+
+
+def sort_df(df, col, key=None, reverse=False):
+    """
+    Sort the dataframe by the values of column `col`, taking duplicated entries into account by
+    ranking the unique values and applying the sort_df_by_value_list method.
+
+    Example, sorting in alphabetical order, grouping capital and lower case together with capital
+    case first:
+        sort_df(df, 'col', key=lambda x: (x.upper(), x[0].islower()))
+    """
+
+    def sorter(key=None, reverse=False):
+        """
+        Using the python built-in sorted function.
+        """
+        def sorter_(series):
+            series_list = list(series)
+            return [series_list.index(i)
+                    for i in sorted(series_list, key=key, reverse=reverse)]
+        return sorter_
+
+    df2 = df.reset_index(drop=True)
+    sortedIndex = sorter(key=key, reverse=reverse)(df2[col])
+    sortedValuesUnique = list(OrderedDict.fromkeys(np.array([df2[col].iloc[i] for i in sortedIndex])))
+    return sort_df_by_value_list(df2, col=col, valueList=sortedValuesUnique)
diff --git a/src/pipeline_roesti.py b/src/pipeline_roesti.py
index 4aa0cf7..72abe6c 100755
--- a/src/pipeline_roesti.py
+++ b/src/pipeline_roesti.py
@@ -25,8 +25,30 @@
 from send_socket_message import send_socket_message
 from index_genome_files_bowtie2 import index_genome_files_bowtie2
 
-from mwTools.general import glob_file_list
-from mwTools.general import open_by_suffix
+# from mwTools.general import glob_file_list
+# from mwTools.general import open_by_suffix
+
+
+# NOTE: copied from mwTools as a temporary fix (see #5); assumes this module already imports
+# glob.glob, pathlib.Path, gzip and bz2.
+def glob_file_list(fileList):
+    """Apply glob on the list of file patterns, keep only files, and remove duplicated files."""
+    if fileList is None or fileList == '':
+        fileList = None
+    else:
+        fileList = [fn for pattern in fileList for fn in glob(pattern) if Path(fn).is_file()]
+        # Remove duplicated files from the list (can happen if multiple patterns match the same files)
+        fileList = sorted(list(set(fileList)))
+    return fileList
+
+
+def open_by_suffix(filename):
+    """Detect compressed files by filename suffix and return a gzip, bz2, or normal text file handle."""
+    if Path(filename).suffix == '.gz':
+        return gzip.open(filename, 'rt')
+    elif Path(filename).suffix == '.bz2':
+        return bz2.open(filename, 'rt')
+    else:
+        return open(filename, 'r')
+
 
 # --------------------------------------------------
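
Note (illustration only, not part of the patch): a minimal sketch of how the annotation helpers
inlined into index_genome_files_bowtie2.py chain together. The GenBank path "genome.gbk", the
output name "genome_CDS.bed", and the assumption that the module can be imported without side
effects and that convert_location_bio_to_dict is available in it, are all hypothetical.

# Minimal usage sketch; file names are illustrative, not part of the pipeline.
from Bio import SeqIO
from index_genome_files_bowtie2 import (convert_genbank_to_annotation_df,
                                        convert_annotation_df_to_bed)

# Parse the first record of a GenBank file and build the annotation dataframe.
genomeBio = next(SeqIO.parse('genome.gbk', 'genbank'))
annotDf = convert_genbank_to_annotation_df(genomeBio, extractDNASeq=False, verbose=0)

# Keep only CDS features and write them out as BED, with "feature;id" names.
bedString = convert_annotation_df_to_bed(annotDf[annotDf['feature'] == 'CDS'],
                                         combineFeatureAndId=True)
with open('genome_CDS.bed', 'w') as bedFile:
    bedFile.write(bedString)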
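
Likewise, a minimal sketch of the two helpers inlined into pipeline_roesti.py; the file patterns
are made up, and pipeline_roesti is assumed to be importable without triggering the pipeline.

# Minimal usage sketch; patterns and paths are illustrative only.
from pipeline_roesti import glob_file_list, open_by_suffix

# Expand shell-style patterns into a sorted, de-duplicated list of existing files.
fastqFiles = glob_file_list(['reads/*.fastq', 'reads/*.fastq.gz', 'reads/*.fastq.bz2'])

# Open each file transparently, whether plain, gzip- or bz2-compressed.
for fn in fastqFiles or []:
    with open_by_suffix(fn) as fh:
        print(fn, fh.readline().rstrip())

Dispatching on the filename suffix keeps callers agnostic of the compression format, at the cost
of trusting the file extension.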