diff --git a/blast/blasttab_dice_filter.py b/blast/blasttab_dice_filter.py index 58debfe..86bf215 100755 --- a/blast/blasttab_dice_filter.py +++ b/blast/blasttab_dice_filter.py @@ -42,7 +42,7 @@ def blasttsv2gff3(blasttsv, min_dice=50): line = line.strip("\n") data = line.split("\t") dice = 2 * float(data[14]) / (float(data[22]) + float(data[23])) - + if dice >= min_dice: yield line diff --git a/blast/blasttab_to_gapped_gff3.py b/blast/blasttab_to_gapped_gff3.py index 8328f48..9bc3afe 100755 --- a/blast/blasttab_to_gapped_gff3.py +++ b/blast/blasttab_to_gapped_gff3.py @@ -61,13 +61,13 @@ def blasttsv2gff3( continue dc = {k: v for (k, v) in zip(columns, (x.strip() for x in record.split("\t")))} - #print(columns) - #print(dc) - #exit() + # print(columns) + # print(dc) + # exit() rec = SeqRecord(Seq("ACTG"), id=dc["qseqid"]) - #print(record_idx) - #print(type(record_idx)) + # print(record_idx) + # print(type(record_idx)) feature_id = "blast.%s.%s.%s" % (record_idx, dc["qseqid"], dc["sseqid"]) feature_id = re.sub("\|", "_", feature_id) # Replace any \ or | with _ feature_id = re.sub( diff --git a/blast/relatedness.py b/blast/relatedness.py index 282b55c..e3f7ce4 100755 --- a/blast/relatedness.py +++ b/blast/relatedness.py @@ -274,7 +274,7 @@ def scoreMap(blast): # data = expand_taxIDs(data) # data = deform_scores(data) if not args.noFilter: - data = filter_phage(data, phageTaxLookup) + data = filter_phage(data, phageTaxLookup) # data = expand_titles(data) if args.protein or args.canonical: diff --git a/comparative/mist3.py b/comparative/mist3.py index 5e4a09a..58368a4 100755 --- a/comparative/mist3.py +++ b/comparative/mist3.py @@ -41,7 +41,9 @@ class FancyRecord(object): def __init__(self, record, tmpdir): - self.temp = tempfile.NamedTemporaryFile(mode='w', dir=tmpdir, delete=False, suffix=".fa") + self.temp = tempfile.NamedTemporaryFile( + mode="w", dir=tmpdir, delete=False, suffix=".fa" + ) self.temp_path = self.temp.name self.id = self.temp_path.rsplit("/")[-1] self.record = record @@ -110,8 +112,7 @@ def __repr__(self): return "Subplot [%s]" % self.get_description() def run_gepard(self, matrix, window, global_rescale="35%"): - """Run gepard on two sequences, with a specified output file - """ + """Run gepard on two sequences, with a specified output file""" log.info("Running Gepard on %s", self.get_description()) destination_fn = ( @@ -138,8 +139,8 @@ def run_gepard(self, matrix, window, global_rescale="35%"): "--silent", ] log.debug(subprocess.list2cmdline(cmd)) - #log.info(subprocess.check_output("convert -list type")) - #exit(2) + # log.info(subprocess.check_output("convert -list type")) + # exit(2) failure_count = 0 while True: try: @@ -209,8 +210,7 @@ def char_height(font_size): return int(float(font_size) * 30 / 40) def est_pixels(string, font_size): - """guess pixel width of a string at a given font size - """ + """guess pixel width of a string at a given font size""" return char_width(font_size) * len(string) j_ticks = int(Misty.BestTick(self.j.length, 5)) @@ -246,7 +246,10 @@ def est_pixels(string, font_size): primary_header = self.i.header secondary_head = self.i.description cmd += ( - ["-rotate", "-90",] + [ + "-rotate", + "-90", + ] + FONT_30pt + [ # Side label (i/row) @@ -266,7 +269,10 @@ def est_pixels(string, font_size): ] ) - if est_pixels(self.i.description, 10) < original_dims[1] and secondary_head != "": + if ( + est_pixels(self.i.description, 10) < original_dims[1] + and secondary_head != "" + ): cmd += FONT_10pt + [ # Side label (i/row) "-annotate", @@ -340,7 +346,10 
@@ def est_pixels(string, font_size): ] ) - if est_pixels(self.j.description, 10) < original_dims[0] and secondary_head != "": + if ( + est_pixels(self.j.description, 10) < original_dims[0] + and secondary_head != "" + ): cmd += FONT_10pt + [ "-annotate", "+%s+%s" @@ -379,25 +388,23 @@ def est_pixels(string, font_size): cmd += ["-annotate", "+%s+%s" % (x + 5, y), self.label_formatter(z)] cmd.append(outfile) - #tmpFile = open(outfile, "w") - #tmpFile.close() - log.info(subprocess.check_output( ["cp", infile, outfile] )) + # tmpFile = open(outfile, "w") + # tmpFile.close() + log.info(subprocess.check_output(["cp", infile, outfile])) log.info(subprocess.list2cmdline(cmd)) - log.info(subprocess.check_output( "ls" )) + log.info(subprocess.check_output("ls")) log.info(self.tmpdir) - log.info(subprocess.check_output( ["ls", self.tmpdir])) + log.info(subprocess.check_output(["ls", self.tmpdir])) log.info(outfile[2:]) log.info("Above was ls\n") try: - subprocess.check_output(cmd)# + [" 2>&1"]) + subprocess.check_output(cmd) # + [" 2>&1"]) except: log.info("Excepted") - class Misty(object): - """MIST Class for building MIST Plots - """ + """MIST Class for building MIST Plots""" def __init__(self, window=10, zoom=50, matrix="edna", files_path="mist_images"): self.tmpdir = tempfile.mkdtemp(prefix="cpt.mist3.", dir=".") @@ -563,7 +570,7 @@ def _generate_montage(self): MONTAGE_BORDER_COORD = "%sx%s" % (MONTAGE_BORDER, MONTAGE_BORDER) m0 = os.path.join(self.tmpdir, "m0.png") -# log.info(subprocess.check_output( ["cp", image_list[0], m0] )) + # log.info(subprocess.check_output( ["cp", image_list[0], m0] )) cmd = ["montage"] + image_list cmd += [ "-tile", @@ -581,9 +588,9 @@ def _generate_montage(self): log.debug(" ".join(cmd)) try: - subprocess.check_call(cmd) + subprocess.check_call(cmd) except: - log.debug("Excepted, 2") + log.debug("Excepted, 2") # Add grey borders montage_path = os.path.join(self.tmpdir, "montage.png") cmd = [ @@ -602,9 +609,9 @@ def _generate_montage(self): log.debug(" ".join(cmd)) try: - subprocess.check_call(cmd) + subprocess.check_call(cmd) except: - log.debug("Excepted, 2") + log.debug("Excepted, 2") os.unlink(m0) return montage_path @@ -629,10 +636,7 @@ def _annotate_montage(self, base_path): current_sum_width = MONTAGE_BORDER current_sum_height = MONTAGE_BORDER - convert_arguments_top+= [ - "-rotate", - "-90" - ] + convert_arguments_top += ["-rotate", "-90"] # Top side for j in range(len(self.matrix_data[0])): subplot = self.matrix_data[0][j]["subplot"] @@ -640,27 +644,39 @@ def _annotate_montage(self, base_path): "-fill", LABEL_COLOUR, "-annotate", - "-%s+%s" % (0, str(cumulative_width - current_sum_width -(subplot.get_thumb_dims()[0]/2) + (2 * MONTAGE_BORDER) + IMAGE_BORDER)), + "-%s+%s" + % ( + 0, + str( + cumulative_width + - current_sum_width + - (subplot.get_thumb_dims()[0] / 2) + + (2 * MONTAGE_BORDER) + + IMAGE_BORDER + ), + ), subplot.j.header, ] current_sum_width += subplot.get_thumb_dims()[0] + (2 * IMAGE_BORDER) log.debug("CSW %s", current_sum_width) - convert_arguments_top+= [ - "-rotate", - "90" - ] + convert_arguments_top += ["-rotate", "90"] # Left side - #convert_arguments_left += [ + # convert_arguments_left += [ # "-rotate", # "90" - #] + # ] for i in range(len(self.matrix_data)): subplot = self.matrix_data[i][0]["subplot"] convert_arguments_left += [ "-fill", LABEL_COLOUR, "-annotate", - "+2+%s" % str(current_sum_height + (subplot.get_thumb_dims()[1]/2.0) + IMAGE_BORDER), + "+2+%s" + % str( + current_sum_height + + (subplot.get_thumb_dims()[1] / 2.0) + + 
IMAGE_BORDER + ), "\n" + subplot.i.header, ] current_sum_height += subplot.get_thumb_dims()[1] + (2 * IMAGE_BORDER) @@ -669,15 +685,15 @@ def _annotate_montage(self, base_path): cmd = [ "convert", base_path, - # "-rotate", - # "-90", + # "-rotate", + # "-90", "-pointsize", "20", "-font", TYPEFONT, ] cmd += convert_arguments_left - # cmd += ["-rotate", "90"] + # cmd += ["-rotate", "90"] cmd += convert_arguments_top output_path = os.path.join(self.tmpdir, "large.png") @@ -694,7 +710,7 @@ def _annotate_montage(self, base_path): subprocess.check_call(cmd) except: log.debug("Excepted, 3") - #subprocess.check_output(cmd) + # subprocess.check_output(cmd) return output_path def run(self): diff --git a/comparative/xmfa.py b/comparative/xmfa.py index 0760809..f8cc9f5 100755 --- a/comparative/xmfa.py +++ b/comparative/xmfa.py @@ -30,8 +30,7 @@ def parse_xmfa(xmfa): - """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 - """ + """Simple XMFA parser until https://github.com/biopython/biopython/pull/544""" current_lcb = [] current_seq = {} for line in xmfa.readlines(): @@ -103,8 +102,7 @@ def to_xmfa(lcbs, handle=sys.stdout): def percent_identity(a, b): - """Calculate % identity, ignoring gaps in the host sequence - """ + """Calculate % identity, ignoring gaps in the host sequence""" match = 0 mismatch = 0 for char_a, char_b in zip(list(a), list(b)): @@ -121,8 +119,7 @@ def percent_identity(a, b): def id_tn_dict(sequences, tmpfile=False): - """Figure out sequence IDs - """ + """Figure out sequence IDs""" label_convert = {} correct_chrom = None if not isinstance(sequences, list): diff --git a/comparative/xmfa2tbl.py b/comparative/xmfa2tbl.py index c579b0b..3f77b10 100755 --- a/comparative/xmfa2tbl.py +++ b/comparative/xmfa2tbl.py @@ -12,8 +12,7 @@ def _id_tn_dict(sequences): - """Figure out sequence IDs AND sequence lengths from fasta file - """ + """Figure out sequence IDs AND sequence lengths from fasta file""" label_convert = {} if sequences is not None: if len(sequences) == 1: diff --git a/cpt_annotation_table/gff3.py b/cpt_annotation_table/gff3.py index d4795d4..48496c3 100755 --- a/cpt_annotation_table/gff3.py +++ b/cpt_annotation_table/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_annotation_table/phage_annotation_table.py b/cpt_annotation_table/phage_annotation_table.py index e60623c..ef3c29b 100755 --- a/cpt_annotation_table/phage_annotation_table.py +++ b/cpt_annotation_table/phage_annotation_table.py @@ -2,7 +2,14 @@ # vim: set fileencoding=utf-8 import os import argparse -from gff3 import genes, get_gff3_id, get_rbs_from, feature_test_true, feature_lambda, feature_test_type +from gff3 import ( + genes, + get_gff3_id, + get_rbs_from, + feature_test_true, + feature_lambda, + feature_test_type, +) from CPT_GFFParser import gffParse, gffWrite from Bio import SeqIO from jinja2 import Environment, FileSystemLoader @@ -16,6 +23,7 @@ SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__)) # Path to the HTML template for the report + def genes_all(feature_list, feature_type=["gene"], sort=False): """ Simple filter to extract gene features from the 
feature set. @@ -32,27 +40,29 @@ def genes_all(feature_list, feature_type=["gene"], sort=False): for x in data: yield x + def checkSubs(feature, qualName): subFeats = [] res = "" subFeats = feature.sub_features - while (len(subFeats) > 0): - for feat in subFeats: - for i in feat.qualifiers.keys(): - for j in qualName: - if i == j: - if res == "": - res = feat.qualifiers[i][0] - else: - res += "; " + feat.qualifiers[i][0] - if res != "": - return res - tempFeats = [] - for feat in subFeats: # Should be breadth-first results - for x in feat.sub_features: - tempFeats.append(x) - subFeats = tempFeats - return res + while len(subFeats) > 0: + for feat in subFeats: + for i in feat.qualifiers.keys(): + for j in qualName: + if i == j: + if res == "": + res = feat.qualifiers[i][0] + else: + res += "; " + feat.qualifiers[i][0] + if res != "": + return res + tempFeats = [] + for feat in subFeats: # Should be breadth-first results + for x in feat.sub_features: + tempFeats.append(x) + subFeats = tempFeats + return res + def annotation_table_report(record, types, wanted_cols, gaf_data, searchSubs): getTypes = [] @@ -65,73 +75,66 @@ def annotation_table_report(record, types, wanted_cols, gaf_data, searchSubs): useSubs = searchSubs def rid(record, feature): - """Organism ID - """ + """Organism ID""" return record.id def id(record, feature): - """ID - """ + """ID""" return feature.id def featureType(record, feature): - """Type - """ + """Type""" return feature.type def name(record, feature): - """Name - """ + """Name""" for x in ["Name", "name"]: - for y in feature.qualifiers.keys(): - if x == y: - return feature.qualifiers[x][0] + for y in feature.qualifiers.keys(): + if x == y: + return feature.qualifiers[x][0] if useSubs: - res = checkSubs(feature, ["Name", "name"]) - if res != "": - return res + res = checkSubs(feature, ["Name", "name"]) + if res != "": + return res return "None" + def start(record, feature): - """Boundary - """ + """Boundary""" return str(feature.location.start + 1) def end(record, feature): - """Boundary - """ + """Boundary""" return str(feature.location.end) def location(record, feature): - """Location - """ + """Location""" return str(feature.location.start + 1) + "..{0.end}".format(feature.location) def length(record, feature): - """CDS Length (AA) - """ - + """CDS Length (AA)""" + if feature.type == "CDS": - cdss = [feature] + cdss = [feature] else: - cdss = list(genes(feature.sub_features, feature_type="CDS", sort=True)) - + cdss = list(genes(feature.sub_features, feature_type="CDS", sort=True)) + if cdss == []: - return "None" + return "None" res = (sum([len(cds) for cds in cdss]) / 3) - 1 if floor(res) == res: - res = int(res) + res = int(res) return str(res) def notes(record, feature): """User entered Notes""" for x in ["Note", "note", "Notes", "notes"]: - for y in feature.qualifiers.keys(): - if x == y: - return feature.qualifiers[x][0] + for y in feature.qualifiers.keys(): + if x == y: + return feature.qualifiers[x][0] if useSubs: - res = checkSubs(feature, ["Note", "note", "Notes", "notes"]) - if res != "": - return res + res = checkSubs(feature, ["Note", "note", "Notes", "notes"]) + if res != "": + return res return "None" def date_created(record, feature): @@ -142,22 +145,22 @@ def date_last_modified(record, feature): """Last Modified""" res = feature.qualifiers.get("date_last_modified", ["None"])[0] if res != "None": - return res - if useSubs: - res = checkSubs(feature, ["date_last_modified"]) - if res != "": return res + if useSubs: + res = checkSubs(feature, 
["date_last_modified"]) + if res != "": + return res return "None" def description(record, feature): """Description""" res = feature.qualifiers.get("description", ["None"])[0] if res != "None": - return res - if useSubs: - res = checkSubs(feature, ["description"]) - if res != "": return res + if useSubs: + res = checkSubs(feature, ["description"]) + if res != "": + return res return "None" def owner(record, feature): @@ -166,13 +169,13 @@ def owner(record, feature): User who created the feature. In a 464 scenario this may be one of the TAs.""" for x in ["Owner", "owner"]: - for y in feature.qualifiers.keys(): - if x == y: - return feature.qualifiers[x][0] + for y in feature.qualifiers.keys(): + if x == y: + return feature.qualifiers[x][0] if useSubs: - res = checkSubs(feature, ["Owner", "owner"]) - if res != "": - return res + res = checkSubs(feature, ["Owner", "owner"]) + if res != "": + return res return "None" def product(record, feature): @@ -182,13 +185,13 @@ def product(record, feature): entries)""" """User entered Notes""" for x in ["product", "Product"]: - for y in feature.qualifiers.keys(): - if x == y: - return feature.qualifiers[x][0] + for y in feature.qualifiers.keys(): + if x == y: + return feature.qualifiers[x][0] if useSubs: - res = checkSubs(feature, ["product", "Product"]) - if res != "": - return res + res = checkSubs(feature, ["product", "Product"]) + if res != "": + return res return "None" def note(record, feature): @@ -198,13 +201,11 @@ def note(record, feature): return feature.qualifiers.get("Note", []) def strand(record, feature): - """Strand - """ + """Strand""" return "+" if feature.location.strand > 0 else "-" def sd_spacing(record, feature): - """Shine-Dalgarno spacing - """ + """Shine-Dalgarno spacing""" rbss = get_rbs_from(gene) if len(rbss) == 0: return "None" @@ -213,7 +214,7 @@ def sd_spacing(record, feature): for rbs in rbss: cdss = list(genes(feature.sub_features, feature_type="CDS", sort=True)) if len(cdss) == 0: - return "No CDS" + return "No CDS" if rbs.location.strand > 0: distance = min( cdss, key=lambda x: x.location.start - rbs.location.end @@ -232,8 +233,7 @@ def sd_spacing(record, feature): return resp def sd_seq(record, feature): - """Shine-Dalgarno sequence - """ + """Shine-Dalgarno sequence""" rbss = get_rbs_from(gene) if len(rbss) == 0: return "None" @@ -247,13 +247,12 @@ def sd_seq(record, feature): return resp def start_codon(record, feature): - """Start Codon - """ + """Start Codon""" if feature.type == "CDS": - cdss = [feature] + cdss = [feature] else: - cdss = list(genes(feature.sub_features, feature_type="CDS", sort=True)) - + cdss = list(genes(feature.sub_features, feature_type="CDS", sort=True)) + data = [x for x in cdss] if len(data) == 1: return str(data[0].extract(record).seq[0:3]) @@ -266,58 +265,68 @@ def start_codon(record, feature): ] def stop_codon(record, feature): - """Stop Codon - """ + """Stop Codon""" return str(feature.extract(record).seq[-3:]) def dbxrefs(record, feature): - """DBxrefs - """ + """DBxrefs""" """User entered Notes""" for x in ["Dbxref", "db_xref", "DB_xref", "DBxref", "DB_Xref", "DBXref"]: - for y in feature.qualifiers.keys(): - if x == y: - return feature.qualifiers[x][0] + for y in feature.qualifiers.keys(): + if x == y: + return feature.qualifiers[x][0] return "None" def upstream_feature(record, feature): """Next gene upstream""" if feature.strand > 0: upstream_features = [ - x for x in sorted_features if (x.location.start < feature.location.start and x.type == "gene" and x.strand == feature.strand) + 
x + for x in sorted_features + if ( + x.location.start < feature.location.start + and x.type == "gene" + and x.strand == feature.strand + ) ] if len(upstream_features) > 0: foundSelf = False featCheck = upstream_features[-1].sub_features for x in featCheck: - if x == feature: - foundSelf = True - break - featCheck = featCheck + x.sub_features + if x == feature: + foundSelf = True + break + featCheck = featCheck + x.sub_features if foundSelf: - if len(upstream_features) > 1: - return upstream_features[-2] - return None + if len(upstream_features) > 1: + return upstream_features[-2] + return None return upstream_features[-1] else: return None else: upstream_features = [ - x for x in sorted_features if (x.location.end > feature.location.end and x.type == "gene" and x.strand == feature.strand) + x + for x in sorted_features + if ( + x.location.end > feature.location.end + and x.type == "gene" + and x.strand == feature.strand + ) ] if len(upstream_features) > 0: foundSelf = False featCheck = upstream_features[0].sub_features for x in featCheck: - if x == feature: - foundSelf = True - break - featCheck = featCheck + x.sub_features + if x == feature: + foundSelf = True + break + featCheck = featCheck + x.sub_features if foundSelf: - if len(upstream_features) > 1: - return upstream_features[1] - return None + if len(upstream_features) > 1: + return upstream_features[1] + return None return upstream_features[0] else: return None @@ -369,58 +378,47 @@ def gaf_aspect(record, feature, gaf_data): return _main_gaf_func(record, feature, gaf_data, "aspect") def gaf_assigned_by(record, feature, gaf_data): - """GAF Creating Organisation - """ + """GAF Creating Organisation""" return _main_gaf_func(record, feature, gaf_data, "assigned_by") def gaf_date(record, feature, gaf_data): - """GAF Creation Date - """ + """GAF Creation Date""" return _main_gaf_func(record, feature, gaf_data, "date") def gaf_db(record, feature, gaf_data): - """GAF DB - """ + """GAF DB""" return _main_gaf_func(record, feature, gaf_data, "db") def gaf_db_reference(record, feature, gaf_data): - """GAF DB Reference - """ + """GAF DB Reference""" return _main_gaf_func(record, feature, gaf_data, "db_reference") def gaf_evidence_code(record, feature, gaf_data): - """GAF Evidence Code - """ + """GAF Evidence Code""" return _main_gaf_func(record, feature, gaf_data, "evidence_code") def gaf_go_id(record, feature, gaf_data): - """GAF GO ID - """ + """GAF GO ID""" return _main_gaf_func(record, feature, gaf_data, "go_id") def gaf_go_term(record, feature, gaf_data): - """GAF GO Term - """ + """GAF GO Term""" return _main_gaf_func(record, feature, gaf_data, "go_term") def gaf_id(record, feature, gaf_data): - """GAF ID - """ + """GAF ID""" return _main_gaf_func(record, feature, gaf_data, "id") def gaf_notes(record, feature, gaf_data): - """GAF Notes - """ + """GAF Notes""" return _main_gaf_func(record, feature, gaf_data, "notes") def gaf_owner(record, feature, gaf_data): - """GAF Creator - """ + """GAF Creator""" return _main_gaf_func(record, feature, gaf_data, "owner") def gaf_with_or_from(record, feature, gaf_data): - """GAF With/From - """ + """GAF With/From""" return _main_gaf_func(record, feature, gaf_data, "with_or_from") cols = [] @@ -431,7 +429,7 @@ def gaf_with_or_from(record, feature, gaf_data): if not x: continue if x == "type": - x = "featureType" + x = "featureType" if x in lcl: funcs.append(lcl[x]) # Keep track of docs @@ -473,9 +471,9 @@ def gaf_with_or_from(record, feature, gaf_data): if isinstance(value, list): collapsed_value = ", 
".join(value) - value = [str(collapsed_value)]#.encode("unicode_escape")] + value = [str(collapsed_value)] # .encode("unicode_escape")] else: - value = str(value)#.encode("unicode_escape") + value = str(value) # .encode("unicode_escape") row.append(value) # print row @@ -512,7 +510,7 @@ def parseGafData(file): line = row.strip().split("\t") tmp = dict(zip(cols, line)) if "gene" not in tmp.keys(): - continue + continue if tmp["gene"] not in data: data[tmp["gene"]] = [] @@ -527,7 +525,7 @@ def evaluate_and_report( reportTemplateName="phage_annotation_validator.html", annotationTableCols="", gafData=None, - searchSubs = False, + searchSubs=False, ): """ Generate our HTML evaluation of the genome @@ -600,7 +598,9 @@ def join(listy): "--gafData", help="CPT GAF-like table", type=argparse.FileType("r") ) parser.add_argument( - "--searchSubs", help="Attempt to populate fields from sub-features if qualifier is empty", action="store_true" + "--searchSubs", + help="Attempt to populate fields from sub-features if qualifier is empty", + action="store_true", ) args = parser.parse_args() diff --git a/cpt_blast_to_xmfa/gff3.py b/cpt_blast_to_xmfa/gff3.py index d4795d4..48496c3 100755 --- a/cpt_blast_to_xmfa/gff3.py +++ b/cpt_blast_to_xmfa/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_blastn_to_gff/blast_to_gff3.py b/cpt_blastn_to_gff/blast_to_gff3.py index e537fcf..b96f32f 100755 --- a/cpt_blastn_to_gff/blast_to_gff3.py +++ b/cpt_blastn_to_gff/blast_to_gff3.py @@ -166,7 +166,11 @@ def blastxml2gff3(blastxml, include_seq=False): ) # Build the top level seq feature for the hit - hit_qualifiers["description"] = "Residue %s..%s hit to %s" % (parent_match_start, parent_match_end, desc,) + hit_qualifiers["description"] = "Residue %s..%s hit to %s" % ( + parent_match_start, + parent_match_end, + desc, + ) top_feature = gffSeqFeature( FeatureLocation(parent_match_start - 1, parent_match_end), type=match_type, @@ -217,7 +221,10 @@ def combine_records(records): cleaned_records[combo_id].features[0].subfeatures = copy.deepcopy( sub_features ) - cleaned_records[combo_id].features[0].qualifiers["score"] = min(cleaned_records[combo_id].features[0].qualifiers["score"], rec.features[0].qualifiers["score"]) + cleaned_records[combo_id].features[0].qualifiers["score"] = min( + cleaned_records[combo_id].features[0].qualifiers["score"], + rec.features[0].qualifiers["score"], + ) # now we need to update the IDs for the features when combined # sort them into the proper order, then apply new ids # and also ensure the parent record boundaries fit the whole span of subfeatures @@ -234,8 +241,11 @@ def combine_records(records): new_parent_end, feat.location.start + 1, feat.location.end, - ) - cleaned_records[combo_id].features[0].qualifiers["score"] = min(cleaned_records[combo_id].features[0].qualifiers["score"], feat.qualifiers["blast_score"]) + ) + cleaned_records[combo_id].features[0].qualifiers["score"] = min( + cleaned_records[combo_id].features[0].qualifiers["score"], + feat.qualifiers["blast_score"], + ) # if feat.location.start < new_parent_start: # new_parent_start = feat.location.start - 1 # 
if feat.location.end > new_parent_end: @@ -245,11 +255,11 @@ def combine_records(records): ) cleaned_records[combo_id].features[0].qualifiers[ "description" - ] = "Residue %s..%s hit to %s" % ( - new_parent_start, - new_parent_end, - cleaned_records[combo_id].features[0].qualifiers["Name"], - ) + ] = "Residue %s..%s hit to %s" % ( + new_parent_start, + new_parent_end, + cleaned_records[combo_id].features[0].qualifiers["Name"], + ) # save the renamed and ordered feature list to record cleaned_records[combo_id].features[0].sub_features = copy.deepcopy( sub_features @@ -309,7 +319,7 @@ def blasttsv2gff3(blasttsv, include_seq=False): "ID": feature_id, "Name": (dc["salltitles"].split("<>")[0]), "description": "Residue {sstart}..{send} hit to {x}".format( - x=dc["salltitles"].split("<>")[0], **dc + x=dc["salltitles"].split("<>")[0], **dc ), "source": "blast", "score": dc["evalue"], @@ -321,7 +331,14 @@ def blasttsv2gff3(blasttsv, include_seq=False): hsp_qualifiers = {"source": "blast"} for key in dc.keys(): # Add the remaining BLAST info to the GFF qualifiers - if key in ("salltitles", "sallseqid", "sseqid", "qseqid", "qseq", "sseq",): + if key in ( + "salltitles", + "sallseqid", + "sseqid", + "qseqid", + "qseq", + "sseq", + ): continue hsp_qualifiers["blast_%s" % key] = clean_string(dc[key]) diff --git a/cpt_blastp_to_gff/blast_to_gff3.py b/cpt_blastp_to_gff/blast_to_gff3.py index e537fcf..b96f32f 100755 --- a/cpt_blastp_to_gff/blast_to_gff3.py +++ b/cpt_blastp_to_gff/blast_to_gff3.py @@ -166,7 +166,11 @@ def blastxml2gff3(blastxml, include_seq=False): ) # Build the top level seq feature for the hit - hit_qualifiers["description"] = "Residue %s..%s hit to %s" % (parent_match_start, parent_match_end, desc,) + hit_qualifiers["description"] = "Residue %s..%s hit to %s" % ( + parent_match_start, + parent_match_end, + desc, + ) top_feature = gffSeqFeature( FeatureLocation(parent_match_start - 1, parent_match_end), type=match_type, @@ -217,7 +221,10 @@ def combine_records(records): cleaned_records[combo_id].features[0].subfeatures = copy.deepcopy( sub_features ) - cleaned_records[combo_id].features[0].qualifiers["score"] = min(cleaned_records[combo_id].features[0].qualifiers["score"], rec.features[0].qualifiers["score"]) + cleaned_records[combo_id].features[0].qualifiers["score"] = min( + cleaned_records[combo_id].features[0].qualifiers["score"], + rec.features[0].qualifiers["score"], + ) # now we need to update the IDs for the features when combined # sort them into the proper order, then apply new ids # and also ensure the parent record boundaries fit the whole span of subfeatures @@ -234,8 +241,11 @@ def combine_records(records): new_parent_end, feat.location.start + 1, feat.location.end, - ) - cleaned_records[combo_id].features[0].qualifiers["score"] = min(cleaned_records[combo_id].features[0].qualifiers["score"], feat.qualifiers["blast_score"]) + ) + cleaned_records[combo_id].features[0].qualifiers["score"] = min( + cleaned_records[combo_id].features[0].qualifiers["score"], + feat.qualifiers["blast_score"], + ) # if feat.location.start < new_parent_start: # new_parent_start = feat.location.start - 1 # if feat.location.end > new_parent_end: @@ -245,11 +255,11 @@ def combine_records(records): ) cleaned_records[combo_id].features[0].qualifiers[ "description" - ] = "Residue %s..%s hit to %s" % ( - new_parent_start, - new_parent_end, - cleaned_records[combo_id].features[0].qualifiers["Name"], - ) + ] = "Residue %s..%s hit to %s" % ( + new_parent_start, + new_parent_end, + 
cleaned_records[combo_id].features[0].qualifiers["Name"], + ) # save the renamed and ordered feature list to record cleaned_records[combo_id].features[0].sub_features = copy.deepcopy( sub_features @@ -309,7 +319,7 @@ def blasttsv2gff3(blasttsv, include_seq=False): "ID": feature_id, "Name": (dc["salltitles"].split("<>")[0]), "description": "Residue {sstart}..{send} hit to {x}".format( - x=dc["salltitles"].split("<>")[0], **dc + x=dc["salltitles"].split("<>")[0], **dc ), "source": "blast", "score": dc["evalue"], @@ -321,7 +331,14 @@ def blasttsv2gff3(blasttsv, include_seq=False): hsp_qualifiers = {"source": "blast"} for key in dc.keys(): # Add the remaining BLAST info to the GFF qualifiers - if key in ("salltitles", "sallseqid", "sseqid", "qseqid", "qseq", "sseq",): + if key in ( + "salltitles", + "sallseqid", + "sseqid", + "qseqid", + "qseq", + "sseq", + ): continue hsp_qualifiers["blast_%s" % key] = clean_string(dc[key]) diff --git a/cpt_blasttab_dice_filter/blasttab_dice_filter.py b/cpt_blasttab_dice_filter/blasttab_dice_filter.py index 58debfe..86bf215 100755 --- a/cpt_blasttab_dice_filter/blasttab_dice_filter.py +++ b/cpt_blasttab_dice_filter/blasttab_dice_filter.py @@ -42,7 +42,7 @@ def blasttsv2gff3(blasttsv, min_dice=50): line = line.strip("\n") data = line.split("\t") dice = 2 * float(data[14]) / (float(data[22]) + float(data[23])) - + if dice >= min_dice: yield line diff --git a/cpt_bprom_converter/bprom_gff3_converter.py b/cpt_bprom_converter/bprom_gff3_converter.py index 988eef7..37ac3fc 100644 --- a/cpt_bprom_converter/bprom_gff3_converter.py +++ b/cpt_bprom_converter/bprom_gff3_converter.py @@ -4,9 +4,11 @@ import re from typing import List, Match, Dict, TextIO, Union from datetime import date + # In this file, a "feature" refers to the collection of data between the > keys of the bprom output. # That collection of data refers to one section of the DNA upstream of a gene + def read_bprom_file(bprom_file) -> List[str]: """Reads in file, creating a list of strings with each list element containing a line from the file""" contents = [] @@ -20,22 +22,22 @@ def read_bprom_file(bprom_file) -> List[str]: def concatenate_then_split(contents) -> List[str]: """Concatenates the file into one large string. 
- Then splits it on '>' so that each feature's data is together in one element""" + Then splits it on '>' so that each feature's data is together in one element""" # Concatenates the entire file into one large string - concat_contents = ''.join(contents) + concat_contents = "".join(contents) # Removing the empty string '' at element 0 used to make the join concat_contents = concat_contents[1:] # Splits the file into a list of strings on ">" - features = concat_contents.split('>') + features = concat_contents.split(">") return features def remove_promoterless_features(features) -> List[str]: """For each concatenated feature string passed, removes the element - if the # of predicted promoters is 0.""" + if the # of predicted promoters is 0.""" cleaned_features = features indices_to_delete = [] for i, feature in enumerate(cleaned_features): @@ -51,8 +53,8 @@ def remove_promoterless_features(features) -> List[str]: def extract_accession(feature) -> str: """Extract accession""" - accession = re.search('[\w](.*)(?=_)', feature) - accession = accession.group().replace('_', '').strip() + accession = re.search("[\w](.*)(?=_)", feature) + accession = accession.group().replace("_", "").strip() return accession @@ -61,8 +63,8 @@ def extract_test_seq_position(feature) -> List[str]: """Extract position in genome. Gets any number of values '(.*)' between the brackets using 'lookbehind/lookright' (?<=PATTERN) and 'lookahead/lookleft' regex assertions to extract (?<=Location=\\[)(.*)(?=]\\()""" - location = re.search('(?<=Location=\\[)(.*)(?=]\\()', feature) - location = location.group().split(':') + location = re.search("(?<=Location=\\[)(.*)(?=]\\()", feature) + location = location.group().split(":") return location @@ -70,7 +72,7 @@ def extract_test_seq_position(feature) -> List[str]: def extract_strand_direction(feature) -> str: """Extract strand direction for a feature, - or +""" # Matches for '(.)' - direction = re.search('(?<=\\().(?=\\))', feature) + direction = re.search("(?<=\\().(?=\\))", feature) direction = direction.group() return direction @@ -81,54 +83,56 @@ def extract_promoter_data(feature) -> Dict[str, str]: Use for one element in the output of concatenate_then_split()""" # Extract promoter -10 and -35 sequences and scores # Gets everything between "-xx box at pos." 
and " Score" - minus10 = re.search('(?<=-10 box at pos.)(.*)(?= Score)(.*)', feature) - minus35 = re.search('(?<=-35 box at pos.)(.*)(?= Score)(.*)', feature) + minus10 = re.search("(?<=-10 box at pos.)(.*)(?= Score)(.*)", feature) + minus35 = re.search("(?<=-35 box at pos.)(.*)(?= Score)(.*)", feature) # Extracts the match and removes leading and trailing whitespace (which can be variable) # (the bprom output does not maintain the same # of whitespace characters # if there are less digits, at least for the scoring) - minus10 = minus10.group().lstrip().split(' ') + minus10 = minus10.group().lstrip().split(" ") minus10_pos = int(minus10[0]) minus10_seq = minus10[1] minus10_score = minus10[-1] - minus35 = minus35.group().lstrip().split(' ') + minus35 = minus35.group().lstrip().split(" ") minus35_pos = int(minus35[0]) minus35_seq = minus35[1] minus35_score = minus35[-1] # Can change these keys to change the column 9 promoter_data = { - 'minus10_pos': minus10_pos, - 'minus10_seq': minus10_seq, - 'minus10_score': minus10_score, - 'minus35_pos': minus35_pos, - 'minus35_seq': minus35_seq, - 'minus35_score': minus35_score + "minus10_pos": minus10_pos, + "minus10_seq": minus10_seq, + "minus10_score": minus10_score, + "minus35_pos": minus35_pos, + "minus35_seq": minus35_seq, + "minus35_score": minus35_score, } return promoter_data def convert_extracted_promoter_data_to_ID_column_format( - promoter_data, - calculated_promoter_positions) -> str: + promoter_data, calculated_promoter_positions +) -> str: """Converts input data to the GFF3 ID column (column 9) format, a semicolon separated - list of values providing additional information about each feature""" + list of values providing additional information about each feature""" # Replaces the BPROM output positions with the calculated ones minus_10_calculated = calculated_promoter_positions[2] minus_35_calculated = calculated_promoter_positions[3] - promoter_data['minus10_pos'] = minus_10_calculated - promoter_data['minus35_pos'] = minus_35_calculated + promoter_data["minus10_pos"] = minus_10_calculated + promoter_data["minus35_pos"] = minus_35_calculated # Creates the column 9 string (attributes) - promoter_data = ['{}={}'.format(key, value) for key, value in promoter_data.items()] - promoter_data = 'Description=Predicted promoter data;' + 'Note=' + ','.join(promoter_data) + ';' + promoter_data = ["{}={}".format(key, value) for key, value in promoter_data.items()] + promoter_data = ( + "Description=Predicted promoter data;" + "Note=" + ",".join(promoter_data) + ";" + ) return promoter_data def extract_LDF_score(feature) -> str: """Extract LDF score""" - LDF = re.search('(?<=LDF-)(.*)', feature) + LDF = re.search("(?<=LDF-)(.*)", feature) LDF = LDF.group().strip() return LDF @@ -137,7 +141,7 @@ def extract_LDF_score(feature) -> str: def calculate_promoter_position(feature): """Calculate promoter positions (in the context of the genome) based on BPROM predictions.""" # Get 'Promoter Pos: X' data. This refers to the predicted transcriptional start site! - promoter_pos = re.search('(?<=Promoter Pos:)(.*)(?=LDF)', feature) + promoter_pos = re.search("(?<=Promoter Pos:)(.*)(?=LDF)", feature) promoter_pos = int(promoter_pos.group().strip()) # Get start and end positions from 'Location=[XXX:YYYY]' @@ -147,38 +151,42 @@ def calculate_promoter_position(feature): promoter_data = extract_promoter_data(feature) - ''' IMPORTANT!! Whether or not you add or subtract to calculate the promoter start + """ IMPORTANT!! 
Whether or not you add or subtract to calculate the promoter start # position depends on whether we're on the + or - strand! # The workflow Jolene uses is smart enough to correctly pull upstream # for both + and - strands (i.e., pulls left for +, pulls right for -) # THEREFORE, for a gene with a start at 930 on the + strand, it pulls 830:930 - # And for a gene with a start at 930 on the - strand, it pulls 930:1030 ''' + # And for a gene with a start at 930 on the - strand, it pulls 930:1030 """ direction = extract_strand_direction(feature) - if direction == '+': + if direction == "+": # BPROM starts counting from the LEFT boundary for + strand test sequences (as expected) # Get -10 promoter position - minus10_pos = promoter_data['minus10_pos'] + minus10_pos = promoter_data["minus10_pos"] minus10_pos_in_context_of_genome = test_cds_location_start_pos + minus10_pos # Get -35 promoter position - minus35_pos = promoter_data['minus35_pos'] + minus35_pos = promoter_data["minus35_pos"] minus35_pos_in_context_of_genome = test_cds_location_start_pos + minus35_pos start = test_cds_location_start_pos + minus35_pos end = test_cds_location_start_pos + promoter_pos calculated_promoter_positions = [ - start, end, minus10_pos_in_context_of_genome, minus35_pos_in_context_of_genome] + start, + end, + minus10_pos_in_context_of_genome, + minus35_pos_in_context_of_genome, + ] return calculated_promoter_positions - elif direction == '-': + elif direction == "-": # BPROM starts counting from the RIGHT boundary for - strand test sequences # Get -10 promoter position - minus10_pos = promoter_data['minus10_pos'] + minus10_pos = promoter_data["minus10_pos"] minus10_pos_in_context_of_genome = test_cds_location_end_pos - minus10_pos # Get -35 promoter position - minus35_pos = promoter_data['minus35_pos'] + minus35_pos = promoter_data["minus35_pos"] minus35_pos_in_context_of_genome = test_cds_location_end_pos - minus35_pos # The start and end are reversed @@ -186,11 +194,15 @@ def calculate_promoter_position(feature): start = test_cds_location_end_pos - promoter_pos calculated_promoter_positions = [ - start, end, minus10_pos_in_context_of_genome, minus35_pos_in_context_of_genome] + start, + end, + minus10_pos_in_context_of_genome, + minus35_pos_in_context_of_genome, + ] return calculated_promoter_positions else: - assert "Error: Strand data neither \'+\' nor \'-\'" + raise ValueError("Error: Strand data neither '+' nor '-'") def extract_tf_binding_elements(): @@ -200,7 +212,7 @@ def extract_tf_binding_elements(): def extract_data_for_all_features(features) -> List[List[Union[str, int]]]: """Loops through cleaned bprom output extracting all data of interest and builds the - structure for loading into a dataframe""" + structure for loading into a dataframe""" extracted_data = [] for feature in features: # loop through features, a List[str] containing each feature [str] in the @@ -208,19 +220,22 @@ def extract_data_for_all_features(features) -> List[List[Union[str, int]]]: calculated_promoter_positions = calculate_promoter_position(feature) promoter_data = extract_promoter_data(feature) promoter_data_converted = convert_extracted_promoter_data_to_ID_column_format( - promoter_data, calculated_promoter_positions) + promoter_data, calculated_promoter_positions + ) extracted_data.append( - [extract_accession(feature), # Seqid, col 1 - 'bprom', # Source, col 2 - 'promoter', # Type, col 3 - calculated_promoter_positions[0], # Start, col 4 - calculated_promoter_positions[1], # End, col 5 - extract_LDF_score(feature), # Score, col 6 -
extract_strand_direction(feature), # Strand direction, col 7 - '.', # Phase, col 8 - promoter_data_converted, # Attributes, col 9 - ]) + [ + extract_accession(feature), # Seqid, col 1 + "bprom", # Source, col 2 + "promoter", # Type, col 3 + calculated_promoter_positions[0], # Start, col 4 + calculated_promoter_positions[1], # End, col 5 + extract_LDF_score(feature), # Score, col 6 + extract_strand_direction(feature), # Strand direction, col 7 + ".", # Phase, col 8 + promoter_data_converted, # Attributes, col 9 + ] + ) return extracted_data @@ -228,30 +243,41 @@ def extract_data_for_all_features(features) -> List[List[Union[str, int]]]: def convert_to_dataframe(extracted_data) -> pd.DataFrame: """Convert extracted and processed data to Pandas dataframe with gff3 column names""" - df = pd.DataFrame(extracted_data, - columns=['seqid', 'source', 'type', 'start', 'end', - 'score', 'strand', 'phase', 'attributes'] - ) + df = pd.DataFrame( + extracted_data, + columns=[ + "seqid", + "source", + "type", + "start", + "end", + "score", + "strand", + "phase", + "attributes", + ], + ) return df def write_to_gff3(dataframe) -> None: """Create a gff3 text file from the DataFrame by converting to a tab separated values (tsv) file""" - tsv = dataframe.to_csv(sep='\t', index=False, header=None) + tsv = dataframe.to_csv(sep="\t", index=False, header=None) # Gets the first element of the first column to use for accession = dataframe.iloc[0][0] year, month, day = date.today().year, date.today().month, date.today().day - #with open(f'{year}_{month}_{day}_bprom_as_gff3_{accession}.txt', 'w') as wf: - # Header so Galaxy can recognize as GFF3 - print('##gff-version 3\n') - #for line in tsv: + # with open(f'{year}_{month}_{day}_bprom_as_gff3_{accession}.txt', 'w') as wf: + # Header so Galaxy can recognize as GFF3 + print("##gff-version 3\n") + # for line in tsv: print(tsv) return + def convert_bprom_output_to_gff3(bprom_file) -> None: """Master function. 
Given a BPROM .txt file as output, extracts data and writes as a GFF3 file""" bprom_file = read_bprom_file(bprom_file) @@ -265,7 +291,7 @@ def convert_bprom_output_to_gff3(bprom_file) -> None: return -if __name__ == '__main__': +if __name__ == "__main__": ## Shows the DataFrame output in the terminal for testing/debugging # bprom_file = read_bprom_file('BPROM_output.txt') # concatenated_bprom_file: List[str] = concatenate_then_split(bprom_file) @@ -273,9 +299,10 @@ def convert_bprom_output_to_gff3(bprom_file) -> None: # print(convert_to_dataframe(extract_data_for_all_features(working_file)).to_string()) parser = argparse.ArgumentParser( - description='converts BPROM output to the gff3 file format') + description="converts BPROM output to the gff3 file format" + ) - parser.add_argument('-f', help='bprom file as .txt') + parser.add_argument("-f", help="bprom file as .txt") args = parser.parse_args() # Actual function for converting the BPROM output to gff3 convert_bprom_output_to_gff3(args.f) diff --git a/cpt_cluster_lcbs/cluster_lcbs.py b/cpt_cluster_lcbs/cluster_lcbs.py index d766210..88cb492 100644 --- a/cpt_cluster_lcbs/cluster_lcbs.py +++ b/cpt_cluster_lcbs/cluster_lcbs.py @@ -6,8 +6,7 @@ def parse_xmfa(xmfa): - """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 - """ + """Simple XMFA parser until https://github.com/biopython/biopython/pull/544""" current_lcb = [] current_seq = {} for line in xmfa.readlines(): @@ -79,8 +78,7 @@ def to_xmfa(lcbs, handle=sys.stdout): def percent_identity(a, b): - """Calculate % identity, ignoring gaps in the host sequence - """ + """Calculate % identity, ignoring gaps in the host sequence""" match = 0 mismatch = 0 for char_a, char_b in zip(list(a), list(b)): @@ -97,8 +95,7 @@ def percent_identity(a, b): def id_tn_dict(sequences, tmpfile=False): - """Figure out sequence IDs - """ + """Figure out sequence IDs""" label_convert = {} correct_chrom = None if not isinstance(sequences, list): @@ -121,7 +118,7 @@ def id_tn_dict(sequences, tmpfile=False): def filter_lcbs_for_seq(xmfa): - """ clusters lcbs based on which sequences they involve """ + """clusters lcbs based on which sequences they involve""" strand_info = {"1": "+", "-1": "-"} clusters = {} @@ -181,7 +178,7 @@ def new(clusters, lcb): def cluster_lcbs(lcbs, threshold): - """ clusters lcbs based on how far apart they are""" + """clusters lcbs based on how far apart they are""" clusters = [] for o, i in enumerate(lcbs): diff --git a/cpt_convert_glimmer/cpt_convert_glimmer_to_gff3.py b/cpt_convert_glimmer/cpt_convert_glimmer_to_gff3.py index 6cd0e72..025d2fb 100755 --- a/cpt_convert_glimmer/cpt_convert_glimmer_to_gff3.py +++ b/cpt_convert_glimmer/cpt_convert_glimmer_to_gff3.py @@ -45,13 +45,13 @@ def glimmer3_to_gff3(glimmer, genome): start -= 1 if start > end: - #gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome - #-------------start<=======|sequence end|========>end------ + # gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome + # -------------start<=======|sequence end|========>end------ if strand > 0: end = len(current_record) else: start = 0 - gene_id+="_truncated" + gene_id += "_truncated" cds_feat = gffSeqFeature( FeatureLocation(start, end), @@ -62,7 +62,7 @@ def glimmer3_to_gff3(glimmer, genome): "source": "Glimmer3", "ID": "%s.cds_%s" % (current_record.id, gene_id), }, - source="Glimmer3" + source="Glimmer3", ) gene = gffSeqFeature( @@ -74,7 +74,7 @@ def glimmer3_to_gff3(glimmer, genome): "source": "Glimmer3", "ID": 
"%s.%s" % (current_record.id, gene_id), }, - source="Glimmer3" + source="Glimmer3", ) gene.sub_features = [cds_feat] current_record.features.append(gene) diff --git a/cpt_convert_mga/cpt_convert_mga_to_gff3.py b/cpt_convert_mga/cpt_convert_mga_to_gff3.py index e2d51d6..fddd73e 100755 --- a/cpt_convert_mga/cpt_convert_mga_to_gff3.py +++ b/cpt_convert_mga/cpt_convert_mga_to_gff3.py @@ -67,7 +67,7 @@ def mga_to_gff3(mga_output, genome): "Source": "MGA", }, phase=phase, - source="MGA" + source="MGA", ) cds_feat = gffSeqFeature( @@ -77,9 +77,9 @@ def mga_to_gff3(mga_output, genome): qualifiers={ "Source": "MGA", "ID": "%s.cds_%s" % (current_record.id, gene_id), - }, + }, phase=phase, - source="MGA" + source="MGA", ) if rbs_feat is not None: @@ -103,7 +103,7 @@ def mga_to_gff3(mga_output, genome): "ID": "%s.%s" % (current_record.id, gene_id), }, phase=phase, - source="MGA" + source="MGA", ) gene.sub_features = [cds_feat] diff --git a/cpt_convert_xmfa/xmfa.py b/cpt_convert_xmfa/xmfa.py index 0760809..f8cc9f5 100755 --- a/cpt_convert_xmfa/xmfa.py +++ b/cpt_convert_xmfa/xmfa.py @@ -30,8 +30,7 @@ def parse_xmfa(xmfa): - """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 - """ + """Simple XMFA parser until https://github.com/biopython/biopython/pull/544""" current_lcb = [] current_seq = {} for line in xmfa.readlines(): @@ -103,8 +102,7 @@ def to_xmfa(lcbs, handle=sys.stdout): def percent_identity(a, b): - """Calculate % identity, ignoring gaps in the host sequence - """ + """Calculate % identity, ignoring gaps in the host sequence""" match = 0 mismatch = 0 for char_a, char_b in zip(list(a), list(b)): @@ -121,8 +119,7 @@ def percent_identity(a, b): def id_tn_dict(sequences, tmpfile=False): - """Figure out sequence IDs - """ + """Figure out sequence IDs""" label_convert = {} correct_chrom = None if not isinstance(sequences, list): diff --git a/cpt_convert_xmfa/xmfa2tbl.py b/cpt_convert_xmfa/xmfa2tbl.py index c579b0b..3f77b10 100755 --- a/cpt_convert_xmfa/xmfa2tbl.py +++ b/cpt_convert_xmfa/xmfa2tbl.py @@ -12,8 +12,7 @@ def _id_tn_dict(sequences): - """Figure out sequence IDs AND sequence lengths from fasta file - """ + """Figure out sequence IDs AND sequence lengths from fasta file""" label_convert = {} if sequences is not None: if len(sequences) == 1: diff --git a/cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.py b/cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.py index 9c92ce0..c181f9a 100755 --- a/cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.py +++ b/cpt_disruptin_proximity/disruptin_proximity_2_lysis_genes.py @@ -46,16 +46,16 @@ def treeFeatures(features): def read_enzyme_list(enzyme_file=None): enzyme_file.seek(0) domains = [] - #domain_names = [] + # domain_names = [] for line in enzyme_file: if not line.startswith("*"): words = line.split("\t") if len(words) > 3: domains += [words[2]] - #domain_names += [words[0]] + # domain_names += [words[0]] - return (domains[1:]) + return domains[1:] # adapted from intersect_and_adjacent.py @@ -132,9 +132,11 @@ def find_endolysins(rec_ipro, enzyme_domain_ids): # Ignores feature with unwanted key words in the feature name if all(x not in f.qualifiers["Name"][0] for x in unwanted): # If feature is included in the given enzyme domain list, the protein name, domain id, and domain name are stored - domain_description = [str(f.qualifiers['Name'][0])] - if 'signature_desc' in f.qualifiers: - domain_description += [str(f.qualifiers['signature_desc'][0])] + domain_description = [str(f.qualifiers["Name"][0])] 
+ if "signature_desc" in f.qualifiers: + domain_description += [ + str(f.qualifiers["signature_desc"][0]) + ] for i in enzyme_domain_ids: for y in domain_description: @@ -146,7 +148,7 @@ def find_endolysins(rec_ipro, enzyme_domain_ids): target = f.qualifiers["Target"][0] target = target.split(" ") - protein_name = str(target[0]) + '**' + protein_name = str(target[0]) + "**" endo_rec_names += [protein_name] return endo_rec_names, endo_rec_domain_ids @@ -158,11 +160,11 @@ def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window): rec_ipro = gffParse(ipro) recTemp = gffParse(genome)[0] tempFeats = feature_lambda( - recTemp.features, - feature_test_type, - {"types": ["CDS"]}, - subfeatures=True, - ) + recTemp.features, + feature_test_type, + {"types": ["CDS"]}, + subfeatures=True, + ) recTemp.features = tempFeats rec_genome_ini = [recTemp] @@ -175,18 +177,16 @@ def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window): if len(rec_lgc) > 0 and len(rec_tmhmm) > 0 and len(rec_genome_ini) > 0: # find names of the proteins containing endolysin associated domains - endo_names, endo_domain_ids = find_endolysins( - rec_ipro, list(enzyme_domain_ids) - ) + endo_names, endo_domain_ids = find_endolysins(rec_ipro, list(enzyme_domain_ids)) # find names of proteins containing transmembrane domains tmhmm_protein_names = [] for seq in rec_tmhmm: - tmhmm_protein_names += [str(seq.id) + '**'] + tmhmm_protein_names += [str(seq.id) + "**"] lgc_names = [] for seq in rec_lgc: - lgc_names += [str(seq.id) + '**'] + lgc_names += [str(seq.id) + "**"] adjacent_endo = {} adjacent_lgc_to_endo = {} @@ -215,14 +215,14 @@ def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window): # searches for synonyms and if feat.type == "CDS": feat_names = [] - feat_names.append(str(feat.id) + '**') + feat_names.append(str(feat.id) + "**") if "locus_tag" in feat.qualifiers: - feat_names.append(str(feat.qualifiers["locus_tag"][0]) + '**') + feat_names.append(str(feat.qualifiers["locus_tag"][0]) + "**") if "protein_id" in feat.qualifiers: - feat_names.append(str(feat.qualifiers["protein_id"][0]) + '**') + feat_names.append(str(feat.qualifiers["protein_id"][0]) + "**") if "Name" in feat.qualifiers: if len(str(feat.qualifiers["Name"][0])) > 5: - feat_names.append(str(feat.qualifiers["Name"][0]) + '**') + feat_names.append(str(feat.qualifiers["Name"][0]) + "**") # print(str(feat_names)) # print(str(feat.qualifiers)) @@ -233,9 +233,7 @@ def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window): holin_annotations = ["holin"] if "product" in feat.qualifiers: if any( - x - for x in holin_annotations - if (x in str(feat.qualifiers)) + x for x in holin_annotations if (x in str(feat.qualifiers)) ): tm_seqrec += [feat] # check if protein contains a TMD @@ -277,7 +275,7 @@ def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window): adjacent_lgc_to_tm[rec_genome.id] = adjacent_lgc_to_tm_i # print(rec_genome.id) else: - return 0, 0, 0, 0 + return 0, 0, 0, 0 # print(adjacent_endo) return adjacent_endo, adjacent_lgc_to_endo, adjacent_tm, adjacent_lgc_to_tm @@ -333,19 +331,19 @@ def adjacent_lgc(lgc, tmhmm, ipro, genome, enzyme, window): ) if endo == 0: - with open(args.oa, "w") as handle: - handle.write("##gff-version 3") - with open(args.ob, "w") as handle: - handle.write("##gff-version 3") - with open(args.oc, "w") as handle: - handle.write("##gff-version 3") - with open(args.od, "w") as handle: - handle.write("##gff-version 3") - return + with open(args.oa, "w") as handle: + handle.write("##gff-version 3") + with open(args.ob, "w") as handle: + 
handle.write("##gff-version 3") + with open(args.oc, "w") as handle: + handle.write("##gff-version 3") + with open(args.od, "w") as handle: + handle.write("##gff-version 3") + return args.genome.seek(0) rec = list(gffParse(args.genome)) - + with open(args.oa, "w") as handle: for i in range(len(rec)): rec_i = rec[i] diff --git a/cpt_disruptin_proximity/gff3.py b/cpt_disruptin_proximity/gff3.py index d4795d4..48496c3 100755 --- a/cpt_disruptin_proximity/gff3.py +++ b/cpt_disruptin_proximity/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.py b/cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.py index b054d0d..9793670 100755 --- a/cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.py +++ b/cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.py @@ -21,15 +21,15 @@ def disruptin_table(garnier_file, fasta_file): sec_struct = [] # reading the lines from the garnier csv file -# with open(garnier_file,'r') as csvfile: -# garnierreader = csv.reader(csvfile) + # with open(garnier_file,'r') as csvfile: + # garnierreader = csv.reader(csvfile) for row in garnier_file: - if row[0] == 'Sequence: ': + if row[0] == "Sequence: ": names += [row[1]] - elif row[0] in 'HETC': - row = row.split('\t') - sec_struct += [''.join(row)] - + elif row[0] in "HETC": + row = row.split("\t") + sec_struct += ["".join(row)] + record = [] p = [] r = [] @@ -100,19 +100,19 @@ def disruptin_table(garnier_file, fasta_file): args = parser.parse_args() # Set up output location -# f = open(sys.stdout, 'w', newline='') -# writer1 = csv.writer(f) + # f = open(sys.stdout, 'w', newline='') + # writer1 = csv.writer(f) iden, position, residue, charge, hydro, struct = disruptin_table(**vars(args)) for i in range(len(iden)): -# writer1.writerow(['Protein ID']+[iden[i]]) -# writer1.writerow(['Position'] + [format(x, 's') for x in position[i]]) -# writer1.writerow(['Residue'] + [format(x, 's') for x in residue[i]]) -# writer1.writerow(['Charge'] + [format(x, 's') for x in charge[i]]) -# writer1.writerow(['Hydrophobicity'] + [format(x, '.3f') for x in hydro[i]]) -# writer1.writerow(['Secondary Structure'] + [format(x, 's') for x in struct[i]]) -# writer1.writerow(['']) + # writer1.writerow(['Protein ID']+[iden[i]]) + # writer1.writerow(['Position'] + [format(x, 's') for x in position[i]]) + # writer1.writerow(['Residue'] + [format(x, 's') for x in residue[i]]) + # writer1.writerow(['Charge'] + [format(x, 's') for x in charge[i]]) + # writer1.writerow(['Hydrophobicity'] + [format(x, '.3f') for x in hydro[i]]) + # writer1.writerow(['Secondary Structure'] + [format(x, 's') for x in struct[i]]) + # writer1.writerow(['']) print(str(iden[i])) print("Position \t " + "\t".join(position[i])) diff --git a/cpt_easyfig/Easyfig.py b/cpt_easyfig/Easyfig.py index 17f017a..f1c07a0 100755 --- a/cpt_easyfig/Easyfig.py +++ b/cpt_easyfig/Easyfig.py @@ -12389,8 +12389,7 @@ def _saveBitMapNoCompression(self, filename): f.close() def _saveBitMapWithCompression(self, filename): - """ - """ + """ """ # open file f = file(filename, 
"wb") @@ -12602,7 +12601,7 @@ def getArrows(filename, legname): temp[1].append(int(stop)) except: if gotit: - print ("feature could not be processed:\n" + line) + print("feature could not be processed:\n" + line) gotit = False if gotit: aninstance = feature(temp[0], temp[1], feat, strand, None, None) @@ -12650,7 +12649,7 @@ def getArrows(filename, legname): ) outlist.append(aninstance) except: - print ("feature could not be processed:\n" + line) + print("feature could not be processed:\n" + line) if feat == "source": try: lengtht = max([int(start), int(stop)]) @@ -12674,7 +12673,7 @@ def getArrows(filename, legname): ) outlist[-1].colour = artColour except: - print ("Colour could not be processed:\n" + line) + print("Colour could not be processed:\n" + line) elif line[2:].startswith(" /color=") and getFeats: temp = line[26:-1] temp = temp.replace('"', "") @@ -12691,7 +12690,7 @@ def getArrows(filename, legname): ) outlist[-1].colour = artColour except: - print ("Colour could not be processed:\n" + line) + print("Colour could not be processed:\n" + line) elif line[2:].startswith(" /colour=") and getFeats: temp = line[29:-1] temp = temp.replace('"', "") @@ -12708,7 +12707,7 @@ def getArrows(filename, legname): ) outlist[-1].colour = artColour except: - print ("Colour could not be processed:\n" + line) + print("Colour could not be processed:\n" + line) elif line[2:].startswith(" /color=") and getFeats: temp = line[28:-1] temp = temp.replace('"', "") @@ -12724,7 +12723,7 @@ def getArrows(filename, legname): int(artColourF[2]), ) except: - print ("Colour could not be processed:\n" + line) + print("Colour could not be processed:\n" + line) outlist[-1].colour = artColour elif ( line[2:].startswith(" /gene=") @@ -13470,7 +13469,7 @@ def draw( bmp.setPenColor(Color.BLACK) bmp.writeString(legendArrows[index][0], 106, i, 64) else: - print ("wang") + print("wang") index += 1 elif legend == "Two columns": index = 0 @@ -19285,7 +19284,7 @@ def gbk2fasta(genbank, out, mincut, maxcut): if maxcut < 1: maxcut = 1 except: - print ("Annotation slice values not valid.") + print("Annotation slice values not valid.") try: gen = open(genbank) outfile = open(out, "w") @@ -19316,7 +19315,9 @@ def gbk2fasta(genbank, out, mincut, maxcut): if not i in rightchars: isitgood = False if not isitgood: - print ("Annotation file contains invalid characters. Check genbank/EMBL contains no lines starting with > or that fasta file contains only valid nucleotides") + print( + "Annotation file contains invalid characters. 
Check genbank/EMBL contains no lines starting with > or that fasta file contains only valid nucleotides" + ) return 0 if "/" in out: outfile.write(">" + out.split("/")[1] + "\n") @@ -19339,12 +19340,12 @@ def gbk2fasta(genbank, out, mincut, maxcut): seq = seq.replace("qqq", "n" * int(len(seq) / 500)) outfile.write(seq) if len(seq) == 0: - print ("There is no sequence in " + genbank + ".") + print("There is no sequence in " + genbank + ".") return 0 else: return 1 except: - print (genbank + " does not exist.") + print(genbank + " does not exist.") return 0 @@ -19369,10 +19370,10 @@ def getGCcontent(filename, windsize, step, mincut, maxcut): gen.close() seq = seq.upper() except: - print ("Annotation file " + filename + " not valid.") + print("Annotation file " + filename + " not valid.") return None if len(seq) == 0: - print ("Annotation file " + filename + " not valid.") + print("Annotation file " + filename + " not valid.") return None if maxcut == "Max": seq = seq[int(mincut) - 1 :] @@ -19412,10 +19413,10 @@ def getGCskew(filename, windsize, step, mincut, maxcut): gen.close() seq = seq.upper() except: - print ("Annotation file " + filename + " not valid.") + print("Annotation file " + filename + " not valid.") return None if len(seq) == 0: - print ("Annotation file " + filename + " not valid.") + print("Annotation file " + filename + " not valid.") return None if maxcut == "Max": seq = seq[int(mincut) - 1 :] @@ -19460,10 +19461,10 @@ def getCoverage(filename, filename2, mincut, maxcut): seq += "".join(line.split()[:-1]) gen.close() except: - print ("Annotation file " + filename + " not valid.") + print("Annotation file " + filename + " not valid.") return None if len(seq) == 0: - print ("Annotation file " + filename + " not valid.") + print("Annotation file " + filename + " not valid.") return None seq = seq.lower() if maxcut == "Max": @@ -19589,7 +19590,7 @@ def getCustom(filename): thearray[i].append(float(templine[i])) return thearray except: - print (filename + " not valid graph file.") + print(filename + " not valid graph file.") return None @@ -19616,9 +19617,11 @@ def genBlast(inlist, cutlist): + ".easyfig.fa", shell=True, ).wait() - print ("makeblastdb -dbtype nucl -out temp_easyfig/tempdb -in temp_easyfig/" + str( - i + 2 - ) + ".easyfig.fa") + print( + "makeblastdb -dbtype nucl -out temp_easyfig/tempdb -in temp_easyfig/" + + str(i + 2) + + ".easyfig.fa" + ) elif isLegBlastDB(): subprocess.Popen( "formatdb -p F -t tempdb -n temp_easyfig/tempdb -i temp_easyfig/" @@ -19627,7 +19630,7 @@ def genBlast(inlist, cutlist): shell=True, ).wait() else: - print ("Could not find BLAST.") + print("Could not find BLAST.") sys.exit() if isNewBlastn(): subprocess.Popen( @@ -19650,7 +19653,7 @@ def genBlast(inlist, cutlist): shell=True, ).wait() else: - print ("Could not find BLAST.") + print("Could not find BLAST.") sys.exit() outlist.append(inlist[i]) outlist.append("temp_easyfig/" + str(i + 1) + str(i + 2) + ".easyfig.out") @@ -19661,7 +19664,7 @@ def genBlast(inlist, cutlist): def genTBlastX(inlist, cutlist): pwd = os.getcwd() if os.path.exists("temp_easyfig"): - print ("please run from a directory without the folder temp_easyfig") + print("please run from a directory without the folder temp_easyfig") sys.exit() os.mkdir("temp_easyfig") os.chdir("temp_easyfig") @@ -19690,7 +19693,7 @@ def genTBlastX(inlist, cutlist): shell=True, ).wait() else: - print ("Could not find BLAST.") + print("Could not find BLAST.") sys.exit() if isNewTblastx(): subprocess.Popen( @@ -19713,7 +19716,7 @@ def 
genTBlastX(inlist, cutlist): shell=True, ).wait() else: - print ("Could not find BLAST.") + print("Could not find BLAST.") sys.exit() outlist.append(inlist[i]) outlist.append(os.getcwd() + "/" + str(i + 1) + str(i + 2) + ".easyfig.out") @@ -19936,7 +19939,7 @@ def genTBlastX(inlist, cutlist): gfilename = sys.argv[i + 2] lastflag += 1 else: - print (sys.argv[i + 1] + " not a valid graph type") + print(sys.argv[i + 1] + " not a valid graph type") elif sys.argv[i] == "-wind_size": windsize = int(sys.argv[i + 1]) elif sys.argv[i] == "-step": @@ -20000,7 +20003,9 @@ def genTBlastX(inlist, cutlist): elif sys.argv[i + 1] == "both": legend = "Top & Bottom" else: - print ("Legend options are (case sensitive), using None.") + print( + "Legend options are (case sensitive), using None." + ) elif sys.argv[i] == "-leg_name": legname = sys.argv[i + 1] inlist = sys.argv[lastflag + 1 :] @@ -20092,7 +20097,7 @@ def genTBlastX(inlist, cutlist): else: "Please choolse -blastn or -tblastx flags to generate blast files, or use -blast_files to use previously generated files." if filename == None: - print ("Please choose a file to write to (-o tag) and try agian.") + print("Please choose a file to write to (-o tag) and try agian.") sys.exit() if featDict == {} and not nofeat: featDict = {"CDS": ("arrow", (64, 224, 208))} @@ -20162,7 +20167,7 @@ def genTBlastX(inlist, cutlist): ) if (blastit or tblastit) and not keep_blast: shutil.rmtree("temp_easyfig") - print ("Minimum blast hit reported: " + str(x) + "%") + print("Minimum blast hit reported: " + str(x) + "%") elif len(sys.argv) == 1: from Tkinter import * @@ -20202,7 +20207,7 @@ def shiftSelection(self, event): app = App(root) root.mainloop() else: - print ( + print( """ Easyfig.py Written by: Mitchell Sullivan mjsull@gmail.com Supervisor: Dr. 
Scott Beatson University of Queensland 03.12.2010 diff --git a/cpt_export_seq_unique/gff3.py b/cpt_export_seq_unique/gff3.py index d4795d4..48496c3 100755 --- a/cpt_export_seq_unique/gff3.py +++ b/cpt_export_seq_unique/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_export_seq_unique/gff3_extract_sequence.py b/cpt_export_seq_unique/gff3_extract_sequence.py index 22e0ca4..8e66746 100755 --- a/cpt_export_seq_unique/gff3_extract_sequence.py +++ b/cpt_export_seq_unique/gff3_extract_sequence.py @@ -17,7 +17,6 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): if feature_filter == "nice_cds": from gff2gb import gff3_to_genbank as cpt_Gff2Gbk - for rec in cpt_Gff2Gbk(gff3, fasta, 11): seenList = {} @@ -66,8 +65,10 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): else: feat.qualifiers["ID"] = [feat._ID] product = feat.qualifiers.get("product", "") - description = "{1} [Location={0.location};ID={0.qualifiers[ID][0]}]".format( - feat, product + description = ( + "{1} [Location={0.location};ID={0.qualifiers[ID][0]}]".format( + feat, product + ) ) yield [ SeqRecord( @@ -116,9 +117,21 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): description = "" else: if feat.strand == -1: - important_data = {"Location": FeatureLocation(feat.location.start + 1, feat.location.end - feat.phase, feat.strand)} + important_data = { + "Location": FeatureLocation( + feat.location.start + 1, + feat.location.end - feat.phase, + feat.strand, + ) + } else: - important_data = {"Location": FeatureLocation(feat.location.start + 1 + feat.phase, feat.location.end, feat.strand)} + important_data = { + "Location": FeatureLocation( + feat.location.start + 1 + feat.phase, + feat.location.end, + feat.strand, + ) + } if "Name" in feat.qualifiers: important_data["Name"] = feat.qualifiers.get("Name", [""])[0] @@ -130,48 +143,65 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): ] ) ) - #if feat.id == "CPT_Privateer_006.p01": - #print(feat) - #exit() - + # if feat.id == "CPT_Privateer_006.p01": + # print(feat) + # exit() + if isinstance(feat.location, CompoundLocation): - finSeq = "" - if feat.strand == -1: - for x in feat.location.parts: - finSeq += str((rec.seq[feat.location.start: feat.location.end - feat.phase]).reverse_complement()) - else: - for x in feat.location.parts: - finSeq += str(rec.seq[feat.location.start + feat.phase: feat.location.end]) - yield [ - SeqRecord( - finSeq, - id=nid.replace(" ", "-"), - description=description, - ) - ] + finSeq = "" + if feat.strand == -1: + for x in feat.location.parts: + finSeq += str( + ( + rec.seq[ + feat.location.start : feat.location.end + - feat.phase + ] + ).reverse_complement() + ) + else: + for x in feat.location.parts: + finSeq += str( + rec.seq[ + feat.location.start + feat.phase : feat.location.end + ] + ) + yield [ + SeqRecord( + finSeq, + id=nid.replace(" ", "-"), + description=description, + ) + ] elif feat.strand == -1: - yield [ - SeqRecord( - (rec.seq[feat.location.start: feat.location.end - feat.phase]).reverse_complement(), - id=nid.replace(" ", "-"), - description=description, - ) - ] + yield [ + 
SeqRecord( + ( + rec.seq[ + feat.location.start : feat.location.end - feat.phase + ] + ).reverse_complement(), + id=nid.replace(" ", "-"), + description=description, + ) + ] else: - yield [ - SeqRecord( - #feat.extract(rec).seq, - rec.seq[feat.location.start + feat.phase: feat.location.end], - id=nid.replace(" ", "-"), - description=description, - ) - ] + yield [ + SeqRecord( + # feat.extract(rec).seq, + rec.seq[ + feat.location.start + feat.phase : feat.location.end + ], + id=nid.replace(" ", "-"), + description=description, + ) + ] rec.features = newfeats rec.annotations = {} - #gffWrite([rec], sys.stdout) + # gffWrite([rec], sys.stdout) else: seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta")) - + for rec in gffParse(gff3, base_dict=seq_dict): noMatch = True if "Alias" in rec.features[0].qualifiers.keys(): @@ -201,9 +231,21 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): description = "" else: if feat.strand == -1: - important_data = {"Location": FeatureLocation(feat.location.start + 1, feat.location.end - feat.phase, feat.strand)} + important_data = { + "Location": FeatureLocation( + feat.location.start + 1, + feat.location.end - feat.phase, + feat.strand, + ) + } else: - important_data = {"Location": FeatureLocation(feat.location.start + 1 + feat.phase, feat.location.end, feat.strand)} + important_data = { + "Location": FeatureLocation( + feat.location.start + 1 + feat.phase, + feat.location.end, + feat.strand, + ) + } if "Name" in feat.qualifiers: important_data["Name"] = feat.qualifiers.get("Name", [""])[0] @@ -217,40 +259,58 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): ) if isinstance(feat.location, CompoundLocation): - finSeq = "" - if feat.strand == -1: - for x in feat.location.parts: - finSeq += str((rec.seq[x.start: x.end - feat.phase]).reverse_complement()) - else: - for x in feat.location.parts: - finSeq += str(rec.seq[x.start + feat.phase: x.end]) - yield [ - SeqRecord( - Seq(finSeq), - id=id.replace(" ", "-"), - description=description, - ) - ] + finSeq = "" + if feat.strand == -1: + for x in feat.location.parts: + finSeq += str( + ( + rec.seq[x.start : x.end - feat.phase] + ).reverse_complement() + ) + else: + for x in feat.location.parts: + finSeq += str(rec.seq[x.start + feat.phase : x.end]) + yield [ + SeqRecord( + Seq(finSeq), + id=id.replace(" ", "-"), + description=description, + ) + ] else: - if feat.strand == -1: - yield [ - SeqRecord( - seq=Seq(str(rec.seq[feat.location.start: feat.location.end - feat.phase])).reverse_complement(), - id=id.replace(" ", "-"), - description=description, - ) - ] - else: - yield [ - SeqRecord( - #feat.extract(rec).seq, - seq=Seq(str(rec.seq[feat.location.start + feat.phase: feat.location.end])), - id=id.replace(" ", "-"), - description=description, - ) - ] + if feat.strand == -1: + yield [ + SeqRecord( + seq=Seq( + str( + rec.seq[ + feat.location.start : feat.location.end + - feat.phase + ] + ) + ).reverse_complement(), + id=id.replace(" ", "-"), + description=description, + ) + ] + else: + yield [ + SeqRecord( + # feat.extract(rec).seq, + seq=Seq( + str( + rec.seq[ + feat.location.start + + feat.phase : feat.location.end + ] + ) + ), + id=id.replace(" ", "-"), + description=description, + ) + ] if __name__ == "__main__": @@ -267,9 +327,9 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): ) args = parser.parse_args() for seq in main(**vars(args)): - #if isinstance(seq, list): + # if isinstance(seq, list): # for x in seq: # print(type(x.seq)) # SeqIO.write(x, sys.stdout, "fasta") - #else: - 
SeqIO.write(seq, sys.stdout, "fasta") + # else: + SeqIO.write(seq, sys.stdout, "fasta") diff --git a/cpt_find_spanins/findSpanin.py b/cpt_find_spanins/findSpanin.py index 6fd428a..71af657 100755 --- a/cpt_find_spanins/findSpanin.py +++ b/cpt_find_spanins/findSpanin.py @@ -3,10 +3,17 @@ import argparse import os -import re # new -import itertools # new +import re # new +import itertools # new from collections import Counter, OrderedDict -from spaninFuncs import getDescriptions, grabLocs, spaninProximity, splitStrands, tuple_fasta, lineWrapper +from spaninFuncs import ( + getDescriptions, + grabLocs, + spaninProximity, + splitStrands, + tuple_fasta, + lineWrapper, +) ### Requirement Inputs #### INPUT : putative_isp.fa & putative_osp.fa (in that order) @@ -14,9 +21,10 @@ ############################################################################### def write_output(candidates): - """ output file function...maybe not needed """ + """output file function...maybe not needed""" pass + def reconfigure_dict(spanins): """ re organizes dictionary to be more friendly for checks @@ -25,40 +33,43 @@ def reconfigure_dict(spanins): new_spanin_dict = {} for each_spanin_type, data_dict in spanins.items(): - #print(f"{each_spanin_type} == {data_dict}") + # print(f"{each_spanin_type} == {data_dict}") new_spanin_dict[each_spanin_type] = {} - new_spanin_dict[each_spanin_type]['positive'] = {} - new_spanin_dict[each_spanin_type]['negative'] = {} - new_spanin_dict[each_spanin_type]['positive']['coords'] = [] - new_spanin_dict[each_spanin_type]['negative']['coords'] = [] + new_spanin_dict[each_spanin_type]["positive"] = {} + new_spanin_dict[each_spanin_type]["negative"] = {} + new_spanin_dict[each_spanin_type]["positive"]["coords"] = [] + new_spanin_dict[each_spanin_type]["negative"]["coords"] = [] for outter_orf, inner_data in data_dict.items(): list_of_hits = [] for data_content in inner_data: - #print(data_content) + # print(data_content) data_content.insert(0, outter_orf) - #print(f"new data_content -> {data_content}") - #print(data_content) - #list_of_hits += [data_content] - #new_spanin_dict[each_spanin_type] += [data_content] + # print(f"new data_content -> {data_content}") + # print(data_content) + # list_of_hits += [data_content] + # new_spanin_dict[each_spanin_type] += [data_content] if data_content[6] == "+": - #print(f"{each_spanin_type} @ POSITIVE") - new_spanin_dict[each_spanin_type]['positive']['coords'] += [data_content] + # print(f"{each_spanin_type} @ POSITIVE") + new_spanin_dict[each_spanin_type]["positive"]["coords"] += [ + data_content + ] elif data_content[6] == "-": - #print(f"{each_spanin_type} @ NEGATIVE") - new_spanin_dict[each_spanin_type]['negative']['coords'] += [data_content] - #print(new_spanin_dict[each_spanin_type]) - #print(reorganized) - #print(f"{outter_orf} => {inner_data}") - #print(new_spanin_dict) - - #print('\n') - #for k, v in new_spanin_dict.items(): - #print(k) - #print(v) + # print(f"{each_spanin_type} @ NEGATIVE") + new_spanin_dict[each_spanin_type]["negative"]["coords"] += [ + data_content + ] + # print(new_spanin_dict[each_spanin_type]) + # print(reorganized) + # print(f"{outter_orf} => {inner_data}") + # print(new_spanin_dict) + + # print('\n') + # for k, v in new_spanin_dict.items(): + # print(k) + # print(v) return new_spanin_dict - def check_for_uniques(spanins): """ Checks for unique spanins based on spanin_type. 
@@ -67,90 +78,123 @@ def check_for_uniques(spanins): """ pair_dict = {} pair_dict = { - 'pairs' : { - 'location_amount' : [], - 'pair_number' : {}, + "pairs": { + "location_amount": [], + "pair_number": {}, } } for each_spanin_type, spanin_data in spanins.items(): - #print(f"{each_spanin_type} ===> {spanin_data}") + # print(f"{each_spanin_type} ===> {spanin_data}") # early declarations for cases of no results - pos_check = [] # end checks + pos_check = [] # end checks pos_uniques = [] - neg_check = [] # start checks + neg_check = [] # start checks neg_uniques = [] unique_ends = [] pos_amt_unique = 0 neg_amt_unique = 0 amt_positive = 0 amt_negative = 0 - spanin_data['uniques'] = 0 - spanin_data['amount'] = 0 - #spanin_data['positive']['amt_positive'] = 0 - #spanin_data['positive']['pos_amt_unique'] = 0 - #spanin_data['positive']['isp_match'] = [] - #spanin_data['negative']['amt_negative'] = 0 - #spanin_data['negative']['neg_amt_unique'] = 0 - #spanin_data['negative']['isp_match'] = [] - #print(spanin_data) - if spanin_data['positive']['coords']: + spanin_data["uniques"] = 0 + spanin_data["amount"] = 0 + # spanin_data['positive']['amt_positive'] = 0 + # spanin_data['positive']['pos_amt_unique'] = 0 + # spanin_data['positive']['isp_match'] = [] + # spanin_data['negative']['amt_negative'] = 0 + # spanin_data['negative']['neg_amt_unique'] = 0 + # spanin_data['negative']['isp_match'] = [] + # print(spanin_data) + if spanin_data["positive"]["coords"]: # do something... - #print('in other function') - #print(spanin_data['positive']['coords']) - for each_hit in spanin_data['positive']['coords']: + # print('in other function') + # print(spanin_data['positive']['coords']) + for each_hit in spanin_data["positive"]["coords"]: pos_check.append(each_hit[2]) - pair_dict['pairs']['location_amount'].append(each_hit[2]) - pos_uniques = list(set([end_site for end_site in pos_check if pos_check.count(end_site) >= 1])) - #print(pos_check) - #print(pos_uniques) - amt_positive = len(spanin_data['positive']['coords']) + pair_dict["pairs"]["location_amount"].append(each_hit[2]) + pos_uniques = list( + set( + [ + end_site + for end_site in pos_check + if pos_check.count(end_site) >= 1 + ] + ) + ) + # print(pos_check) + # print(pos_uniques) + amt_positive = len(spanin_data["positive"]["coords"]) pos_amt_unique = len(pos_uniques) if amt_positive: - spanin_data['positive']['amt_positive'] = amt_positive - spanin_data['positive']['pos_amt_unique'] = pos_amt_unique - #pair_dict['pairs']['locations'].extend(pos_uniques) + spanin_data["positive"]["amt_positive"] = amt_positive + spanin_data["positive"]["pos_amt_unique"] = pos_amt_unique + # pair_dict['pairs']['locations'].extend(pos_uniques) else: - spanin_data['positive']['amt_positive'] = 0 - spanin_data['positive']['pos_amt_unique'] = 0 - if spanin_data['negative']['coords']: + spanin_data["positive"]["amt_positive"] = 0 + spanin_data["positive"]["pos_amt_unique"] = 0 + if spanin_data["negative"]["coords"]: # do something else... 
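Note on the deduplication in check_for_uniques, both in the positive-strand comprehension above and its negative-strand mirror below: the count(...) >= 1 guard can never reject an element, since any item occurs at least once in its own list, so each comprehension reduces to plain deduplication. A minimal check, with hypothetical coordinates:

    pos_check = [250, 250, 310]  # hypothetical end coordinates
    uniques = set(end for end in pos_check if pos_check.count(end) >= 1)
    assert uniques == set(pos_check)  # the filter is a no-op; set() alone suffices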
- #print('in other function') - #print(spanin_data['negative']['coords']) - for each_hit in spanin_data['negative']['coords']: + # print('in other function') + # print(spanin_data['negative']['coords']) + for each_hit in spanin_data["negative"]["coords"]: neg_check.append(each_hit[1]) - pair_dict['pairs']['location_amount'].append(each_hit[1]) - neg_uniques = list(set([start_site for start_site in neg_check if neg_check.count(start_site) >= 1])) - #print(neg_uniques) - amt_negative = len(spanin_data['negative']['coords']) + pair_dict["pairs"]["location_amount"].append(each_hit[1]) + neg_uniques = list( + set( + [ + start_site + for start_site in neg_check + if neg_check.count(start_site) >= 1 + ] + ) + ) + # print(neg_uniques) + amt_negative = len(spanin_data["negative"]["coords"]) neg_amt_unique = len(neg_uniques) if amt_negative: - spanin_data['negative']['amt_negative'] = amt_negative - spanin_data['negative']['neg_amt_unique'] = neg_amt_unique - #pair_dict['pairs']['locations'].extend(neg_uniques) + spanin_data["negative"]["amt_negative"] = amt_negative + spanin_data["negative"]["neg_amt_unique"] = neg_amt_unique + # pair_dict['pairs']['locations'].extend(neg_uniques) else: - spanin_data['negative']['amt_negative'] = 0 - spanin_data['negative']['neg_amt_unique'] = 0 - spanin_data['uniques'] += (spanin_data['positive']['pos_amt_unique'] + spanin_data['negative']['neg_amt_unique']) - spanin_data['amount'] += (spanin_data['positive']['amt_positive'] + spanin_data['negative']['amt_negative']) - #print(spanin_data['uniques']) - list(set(pair_dict['pairs']['location_amount'])) - pair_dict['pairs']['location_amount'] = dict(Counter(pair_dict['pairs']['location_amount'])) + spanin_data["negative"]["amt_negative"] = 0 + spanin_data["negative"]["neg_amt_unique"] = 0 + spanin_data["uniques"] += ( + spanin_data["positive"]["pos_amt_unique"] + + spanin_data["negative"]["neg_amt_unique"] + ) + spanin_data["amount"] += ( + spanin_data["positive"]["amt_positive"] + + spanin_data["negative"]["amt_negative"] + ) + # print(spanin_data['uniques']) + list(set(pair_dict["pairs"]["location_amount"])) + pair_dict["pairs"]["location_amount"] = dict( + Counter(pair_dict["pairs"]["location_amount"]) + ) for data in pair_dict.values(): - #print(data['locations']) - #print(type(data['locations'])) + # print(data['locations']) + # print(type(data['locations'])) v = 0 - for loc, count in data['location_amount'].items(): - #data['pair_number'] = {loc + for loc, count in data["location_amount"].items(): + # data['pair_number'] = {loc v += 1 - data['pair_number'][loc] = v - #print(dict(Counter(pair_dict['pairs']['locations']))) - #print(pair_dict) - spanins['total_amount'] = spanins['EMBEDDED']['amount'] + spanins['SEPARATED']['amount'] + spanins['OVERLAPPED']['amount'] - spanins['total_unique'] = spanins['EMBEDDED']['uniques'] + spanins['SEPARATED']['uniques'] + spanins['OVERLAPPED']['uniques'] - #spanins['total_unique'] = len(pair_dict['pairs']['pair_number']) + data["pair_number"][loc] = v + # print(dict(Counter(pair_dict['pairs']['locations']))) + # print(pair_dict) + spanins["total_amount"] = ( + spanins["EMBEDDED"]["amount"] + + spanins["SEPARATED"]["amount"] + + spanins["OVERLAPPED"]["amount"] + ) + spanins["total_unique"] = ( + spanins["EMBEDDED"]["uniques"] + + spanins["SEPARATED"]["uniques"] + + spanins["OVERLAPPED"]["uniques"] + ) + # spanins['total_unique'] = len(pair_dict['pairs']['pair_number']) return spanins, pair_dict + if __name__ == "__main__": # Common parameters for both ISP / OSP portion of script 
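To make the bookkeeping above concrete, here is a minimal sketch of the unique-site tally and pair numbering performed by check_for_uniques, using a simplified row layout of (orf_id, start, end, strand); the real coordinate rows carry more fields, and these values are hypothetical:

    from collections import Counter

    rows = [
        ("ORF1", 100, 250, "+"),
        ("ORF2", 120, 250, "+"),  # shares an end site with ORF1
        ("ORF3", 400, 520, "-"),
    ]
    # "+" candidates are keyed by end coordinate, "-" candidates by start.
    pos_sites = [end for (_, start, end, strand) in rows if strand == "+"]
    neg_sites = [start for (_, start, end, strand) in rows if strand == "-"]
    location_amount = Counter(pos_sites + neg_sites)  # Counter({250: 2, 400: 1})
    uniques = len(set(pos_sites)) + len(set(neg_sites))  # 2
    # Stable 1-based numbering per location, as in pair_dict['pairs']['pair_number']
    pair_number = {loc: i for i, loc in enumerate(location_amount, start=1)}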
@@ -213,17 +257,16 @@ def check_for_uniques(spanins): ) # Is this manually updated? args = parser.parse_args() - #### RE-WRITE SPANIN_TYPES = {} - SPANIN_TYPES['EMBEDDED'] = {} - SPANIN_TYPES['OVERLAPPED'] = {} - SPANIN_TYPES['SEPARATED'] = {} - #SPANIN_TYPES = { + SPANIN_TYPES["EMBEDDED"] = {} + SPANIN_TYPES["OVERLAPPED"] = {} + SPANIN_TYPES["SEPARATED"] = {} + # SPANIN_TYPES = { # 'EMBEDDED' : {}, # 'OVERLAPPED' : {}, # 'SEPARATED' : {}, - #} + # } isp = getDescriptions(args.putative_isp_fasta_file) args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r") @@ -234,65 +277,90 @@ def check_for_uniques(spanins): osp_full = tuple_fasta(args.putative_osp_fasta_file) #### location data - location_data = { - 'isp' : [], - 'osp' : [] - } + location_data = {"isp": [], "osp": []} spanins = [isp, osp] for idx, each_spanin_type in enumerate(spanins): for description in each_spanin_type: locations = grabLocs(description) - if idx == 0: # i-spanin - location_data['isp'].append(locations) - elif idx == 1: # o-spanin - location_data['osp'].append(locations) + if idx == 0: # i-spanin + location_data["isp"].append(locations) + elif idx == 1: # o-spanin + location_data["osp"].append(locations) #### Check for types of spanins embedded, overlap, separate = spaninProximity( - isp=location_data['isp'], - osp=location_data['osp'], - max_dist=args.max_isp_osp_distance * 3 - ) - - SPANIN_TYPES['EMBEDDED'] = embedded - SPANIN_TYPES['OVERLAPPED'] = overlap - SPANIN_TYPES['SEPARATED'] = separate - - #for spanin_type, spanin in SPANIN_TYPES.items(): + isp=location_data["isp"], + osp=location_data["osp"], + max_dist=args.max_isp_osp_distance * 3, + ) + + SPANIN_TYPES["EMBEDDED"] = embedded + SPANIN_TYPES["OVERLAPPED"] = overlap + SPANIN_TYPES["SEPARATED"] = separate + + # for spanin_type, spanin in SPANIN_TYPES.items(): # s = 0 # for sequence in spanin.values(): # s += len(sequence) # SPANIN_TYPES[spanin_type]['amount'] = s # SPANIN_TYPES[spanin_type]['unique'] = len(spanin.keys()) - - #check_for_unique_spanins(SPANIN_TYPES) + + # check_for_unique_spanins(SPANIN_TYPES) spanins = reconfigure_dict(SPANIN_TYPES) spanins, pair_dict = check_for_uniques(spanins) - #print(pair_dict) + # print(pair_dict) with args.summary_txt as f: for each_spanin_type, spanin_data in spanins.items(): try: - if each_spanin_type not in ["total_amount","total_unique"]: - #print(each_spanin_type) - #print(each_spanin_type) - f.write("=~~~~~= "+str(each_spanin_type) +" Spanin Candidate Statistics =~~~~~=\n") - f.writelines("Total Candidate Pairs = "+str(spanin_data['amount'])+"\n") - f.writelines("Total Unique Pairs = "+str(spanin_data['uniques'])+"\n") + if each_spanin_type not in ["total_amount", "total_unique"]: + # print(each_spanin_type) + # print(each_spanin_type) + f.write( + "=~~~~~= " + + str(each_spanin_type) + + " Spanin Candidate Statistics =~~~~~=\n" + ) + f.writelines( + "Total Candidate Pairs = " + str(spanin_data["amount"]) + "\n" + ) + f.writelines( + "Total Unique Pairs = " + str(spanin_data["uniques"]) + "\n" + ) if each_spanin_type == "EMBEDDED": - for k, v in SPANIN_TYPES['EMBEDDED'].items(): - #print(k) - f.writelines(""+str(k)+" ==> Amount of corresponding candidate o-spanins(s): "+str(len(v))+"\n") + for k, v in SPANIN_TYPES["EMBEDDED"].items(): + # print(k) + f.writelines( + "" + + str(k) + + " ==> Amount of corresponding candidate o-spanins(s): " + + str(len(v)) + + "\n" + ) if each_spanin_type == "SEPARATED": - for k, v in SPANIN_TYPES['SEPARATED'].items(): - f.writelines(""+str(k)+ " ==> Amount of 
corresponding candidate o-spanins(s): "+str(len(v))+"\n") + for k, v in SPANIN_TYPES["SEPARATED"].items(): + f.writelines( + "" + + str(k) + + " ==> Amount of corresponding candidate o-spanins(s): " + + str(len(v)) + + "\n" + ) if each_spanin_type == "OVERLAPPED": - for k, v in SPANIN_TYPES['OVERLAPPED'].items(): - f.writelines(""+str(k)+" ==> Amount of corresponding candidate o-spanins(s): "+str(len(v))+"\n") + for k, v in SPANIN_TYPES["OVERLAPPED"].items(): + f.writelines( + "" + + str(k) + + " ==> Amount of corresponding candidate o-spanins(s): " + + str(len(v)) + + "\n" + ) except TypeError: continue f.write("\n=~~~~~= Tally from ALL spanin types =~~~~~=\n") - f.writelines("Total Candidates = "+str(spanins['total_amount'])+"\n") - f.writelines("Total Unique Candidate Pairs = "+str(spanins['total_unique'])+"\n") + f.writelines("Total Candidates = " + str(spanins["total_amount"]) + "\n") + f.writelines( + "Total Unique Candidate Pairs = " + str(spanins["total_unique"]) + "\n" + ) args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r") isp_full = tuple_fasta(args.putative_isp_fasta_file) @@ -300,37 +368,39 @@ def check_for_uniques(spanins): args.putative_osp_fasta_file = open(args.putative_osp_fasta_file.name, "r") osp_full = tuple_fasta(args.putative_osp_fasta_file) - #print(isp_full) + # print(isp_full) isp_seqs = [] osp_seqs = [] for isp_tupe in isp_full: - #print(isp_tupe) + # print(isp_tupe) for pisp, posp in embedded.items(): - #print(f"ISP = searching for {pisp} in {isp_tupe[0]}") - if re.search(("("+str(pisp)+")\D"), isp_tupe[0]): - #print(isp_tupe[0]) - #print(peri_count) - peri_count = str.split(isp_tupe[0],"~=")[1] - isp_seqs.append((pisp,isp_tupe[1],peri_count)) - #print(isp_seqs) + # print(f"ISP = searching for {pisp} in {isp_tupe[0]}") + if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]): + # print(isp_tupe[0]) + # print(peri_count) + peri_count = str.split(isp_tupe[0], "~=")[1] + isp_seqs.append((pisp, isp_tupe[1], peri_count)) + # print(isp_seqs) for osp_tupe in osp_full: for pisp, posp in embedded.items(): for data in posp: - #print(f"OSP = searching for {data[3]} in {osp_tupe[0]}, coming from this object: {data}") - if re.search(("("+str(data[3])+")\D"), osp_tupe[0]): - peri_count = str.split(osp_tupe[0],"~=")[1] - osp_seqs.append((data[3],osp_tupe[1],peri_count)) + # print(f"OSP = searching for {data[3]} in {osp_tupe[0]}, coming from this object: {data}") + if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]): + peri_count = str.split(osp_tupe[0], "~=")[1] + osp_seqs.append((data[3], osp_tupe[1], peri_count)) with args.embedded_txt as f: f.write("================ embedded spanin candidates =================\n") - f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n") + f.write( + "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n" + ) if embedded != {}: - #print(embedded) + # print(embedded) for pisp, posp in embedded.items(): - #print(f"{pisp} - {posp}") + # print(f"{pisp} - {posp}") f.write(pisp + "\n") for each_posp in posp: - #print(posp) + # print(posp) f.write( "\t{}\t{}\t{}\t{}\t{}\t{}\t".format( each_posp[1], @@ -342,11 +412,19 @@ def check_for_uniques(spanins): ) ) if each_posp[6] == "+": - if each_posp[2] in pair_dict['pairs']['pair_number'].keys(): - f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n") + if each_posp[2] in pair_dict["pairs"]["pair_number"].keys(): + f.write( + "" + + str(pair_dict["pairs"]["pair_number"][each_posp[2]]) + + "\n" + ) elif each_posp[6] == 
"-": - if each_posp[1] in pair_dict['pairs']['pair_number'].keys(): - f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n") + if each_posp[1] in pair_dict["pairs"]["pair_number"].keys(): + f.write( + "" + + str(pair_dict["pairs"]["pair_number"][each_posp[1]]) + + "\n" + ) else: f.write("nothing found") @@ -354,11 +432,19 @@ def check_for_uniques(spanins): f.write("\n================= embedded candidate sequences ================\n") f.write("======================= isp ==========================\n\n") for isp_data in isp_seqs: - #print(isp_data) - f.write(">isp_orf::{}-peri_count~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1]))) + # print(isp_data) + f.write( + ">isp_orf::{}-peri_count~={}\n{}\n".format( + isp_data[0], isp_data[2], lineWrapper(isp_data[1]) + ) + ) f.write("\n======================= osp ========================\n\n") for osp_data in osp_seqs: - f.write(">osp_orf::{}-peri_count~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1]))) + f.write( + ">osp_orf::{}-peri_count~={}\n{}\n".format( + osp_data[0], osp_data[2], lineWrapper(osp_data[1]) + ) + ) args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r") isp_full = tuple_fasta(args.putative_isp_fasta_file) @@ -369,24 +455,24 @@ def check_for_uniques(spanins): isp_seqs = [] osp_seqs = [] for isp_tupe in isp_full: - peri_count = str.split(isp_tupe[0],"~=")[1] + peri_count = str.split(isp_tupe[0], "~=")[1] for pisp, posp in overlap.items(): - if re.search(("("+str(pisp)+")\D"), isp_tupe[0]): - peri_count = str.split(isp_tupe[0],"~=")[1] - isp_seqs.append((pisp,isp_tupe[1],peri_count)) + if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]): + peri_count = str.split(isp_tupe[0], "~=")[1] + isp_seqs.append((pisp, isp_tupe[1], peri_count)) for osp_tupe in osp_full: for pisp, posp in overlap.items(): for data in posp: - if re.search(("("+str(data[3])+")\D"), osp_tupe[0]): - peri_count = str.split(osp_tupe[0],"~=")[1] - osp_seqs.append((data[3],osp_tupe[1],peri_count)) - + if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]): + peri_count = str.split(osp_tupe[0], "~=")[1] + osp_seqs.append((data[3], osp_tupe[1], peri_count)) - with args.overlap_txt as f: f.write("================ overlap spanin candidates =================\n") - f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n") + f.write( + "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n" + ) if overlap != {}: for pisp, posp in overlap.items(): f.write(pisp + "\n") @@ -402,24 +488,40 @@ def check_for_uniques(spanins): ) ) if each_posp[6] == "+": - if each_posp[2] in pair_dict['pairs']['pair_number'].keys(): - #print('ovl ; +') - f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n") + if each_posp[2] in pair_dict["pairs"]["pair_number"].keys(): + # print('ovl ; +') + f.write( + "" + + str(pair_dict["pairs"]["pair_number"][each_posp[2]]) + + "\n" + ) elif each_posp[6] == "-": - if each_posp[1] in pair_dict['pairs']['pair_number'].keys(): - f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n") + if each_posp[1] in pair_dict["pairs"]["pair_number"].keys(): + f.write( + "" + + str(pair_dict["pairs"]["pair_number"][each_posp[1]]) + + "\n" + ) else: f.write("nothing found") with open(args.overlap_txt.name, "a") as f: - #print(isp_seqs) + # print(isp_seqs) f.write("\n================= overlap candidate sequences ================\n") f.write("======================= isp ==========================\n\n") for isp_data in isp_seqs: - 
f.write(">isp_orf::{}-pericount~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1]))) + f.write( + ">isp_orf::{}-pericount~={}\n{}\n".format( + isp_data[0], isp_data[2], lineWrapper(isp_data[1]) + ) + ) f.write("\n======================= osp ========================\n\n") for osp_data in osp_seqs: - f.write(">osp_orf::{}-pericount~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1]))) + f.write( + ">osp_orf::{}-pericount~={}\n{}\n".format( + osp_data[0], osp_data[2], lineWrapper(osp_data[1]) + ) + ) args.putative_isp_fasta_file = open(args.putative_isp_fasta_file.name, "r") isp_full = tuple_fasta(args.putative_isp_fasta_file) @@ -430,20 +532,22 @@ def check_for_uniques(spanins): osp_seqs = [] for isp_tupe in isp_full: for pisp, posp in separate.items(): - if re.search(("("+str(pisp)+")\D"), isp_tupe[0]): - peri_count = str.split(isp_tupe[0],"~=")[1] - isp_seqs.append((pisp,isp_tupe[1],peri_count)) - #print(isp_seqs) + if re.search(("(" + str(pisp) + ")\D"), isp_tupe[0]): + peri_count = str.split(isp_tupe[0], "~=")[1] + isp_seqs.append((pisp, isp_tupe[1], peri_count)) + # print(isp_seqs) for osp_tupe in osp_full: for pisp, posp in separate.items(): for data in posp: - if re.search(("("+str(data[3])+")\D"), osp_tupe[0]): - peri_count = str.split(osp_tupe[0],"~=")[1] - osp_seqs.append((data[3],osp_tupe[1],peri_count)) + if re.search(("(" + str(data[3]) + ")\D"), osp_tupe[0]): + peri_count = str.split(osp_tupe[0], "~=")[1] + osp_seqs.append((data[3], osp_tupe[1], peri_count)) with args.separate_txt as f: f.write("================ separated spanin candidates =================\n") - f.write("isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n") + f.write( + "isp\tisp_start\tisp_end\tosp\tosp_start\tosp_end\tstrand\tpair_number\n" + ) if separate != {}: for pisp, posp in separate.items(): f.write(pisp + "\n") @@ -459,11 +563,19 @@ def check_for_uniques(spanins): ) ) if each_posp[6] == "+": - if each_posp[2] in pair_dict['pairs']['pair_number'].keys(): - f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[2]])+"\n") + if each_posp[2] in pair_dict["pairs"]["pair_number"].keys(): + f.write( + "" + + str(pair_dict["pairs"]["pair_number"][each_posp[2]]) + + "\n" + ) elif each_posp[6] == "-": - if each_posp[1] in pair_dict['pairs']['pair_number'].keys(): - f.write(""+str(pair_dict['pairs']['pair_number'][each_posp[1]])+"\n") + if each_posp[1] in pair_dict["pairs"]["pair_number"].keys(): + f.write( + "" + + str(pair_dict["pairs"]["pair_number"][each_posp[1]]) + + "\n" + ) else: f.write("nothing found") @@ -471,7 +583,15 @@ def check_for_uniques(spanins): f.write("\n================= separated candidate sequences ================\n") f.write("======================= isp ==========================\n\n") for isp_data in isp_seqs: - f.write(">isp_orf::{}-pericount~={}\n{}\n".format(isp_data[0],isp_data[2],lineWrapper(isp_data[1]))) + f.write( + ">isp_orf::{}-pericount~={}\n{}\n".format( + isp_data[0], isp_data[2], lineWrapper(isp_data[1]) + ) + ) f.write("\n======================= osp ========================\n\n") for osp_data in osp_seqs: - f.write(">osp_orf::{}-pericount~={}\n{}\n".format(osp_data[0],osp_data[2],lineWrapper(osp_data[1]))) + f.write( + ">osp_orf::{}-pericount~={}\n{}\n".format( + osp_data[0], osp_data[2], lineWrapper(osp_data[1]) + ) + ) diff --git a/cpt_find_spanins/spaninFuncs.py b/cpt_find_spanins/spaninFuncs.py index 35d627a..bbf5870 100755 --- a/cpt_find_spanins/spaninFuncs.py +++ b/cpt_find_spanins/spaninFuncs.py @@ -18,9 +18,9 
@@ def check_back_end_snorkels(seq, tmsize): """ - Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. - --> seq : should be the sequence fed from the "search_region" portion of the sequence - --> tmsize : size of the potential TMD being investigated + Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. + --> seq : should be the sequence fed from the "search_region" portion of the sequence + --> tmsize : size of the potential TMD being investigated """ found = [] if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]): @@ -42,10 +42,10 @@ def check_back_end_snorkels(seq, tmsize): def prep_a_gff3(fa, spanin_type, org): """ - Function parses an input detailed 'fa' file and outputs a 'gff3' file - ---> fa = input .fa file - ---> output = output a returned list of data, easily portable to a gff3 next - ---> spanin_type = 'isp' or 'osp' + Function parses an input detailed 'fa' file and outputs a 'gff3' file + ---> fa = input .fa file + ---> output = output a returned list of data, easily portable to a gff3 next + ---> spanin_type = 'isp' or 'osp' """ with org as f: header = f.readline() @@ -76,17 +76,21 @@ def prep_a_gff3(fa, spanin_type, org): source = "cpt.py|putative-*.py" # column 2 score = "." # column 6 phase = "." # column 8 - attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1] # column 9 - sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]] + attributes = ( + "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1] + ) # column 9 + sequence = [ + [orgacc, source, methodtype, start, end, score, strand, phase, attributes] + ] data += sequence return data def write_gff3(data, output="results.gff3"): """ - Parses results from prep_a_gff3 into a gff3 file - ---> input : list from prep_a_gff3 - ---> output : gff3 file + Parses results from prep_a_gff3 into a gff3 file + ---> input : list from prep_a_gff3 + ---> output : gff3 file """ data = data filename = output @@ -109,14 +113,23 @@ def write_gff3(data, output="results.gff3"): f.close() -def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206): - """ - Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD - ---> pair : Input of tuple with description and AA sequence (str) - ---> minimum : How close from the initial start codon a TMD can be within - ---> maximum : How far from the initial start codon a TMD can be within - ---> TMDmin : The minimum size that a transmembrane can be (default = 10) - ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20) +def find_tmd( + pair, + minimum=10, + maximum=30, + TMDmin=10, + TMDmax=20, + isp_mode=False, + peri_min=18, + peri_max=206, +): + """ + Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD + ---> pair : Input of tuple with description and AA sequence (str) + ---> minimum : How close from the initial start codon a TMD can be within + ---> maximum : How far from the initial start codon a TMD can be within + ---> TMDmin : The minimum size that a transmembrane can be (default = 10) + ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20) """ # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S'] tmd = [] @@ -125,55 +138,62 @@ def find_tmd(pair, 
minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, if maximum > len(s): maximum = len(s) search_region = s[minimum - 1 : maximum + 1] - #print(f"this is the search region: {search_region}") + # print(f"this is the search region: {search_region}") # print(search_region) # for trouble shooting - for tmsize in range(TMDmin, TMDmax+1, 1): - #print(f"this is the current tmsize we're trying: {tmsize}") + for tmsize in range(TMDmin, TMDmax + 1, 1): + # print(f"this is the current tmsize we're trying: {tmsize}") # print('==============='+str(tmsize)+'================') # print for troubleshooting - pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}" # searches for these hydrophobic residues tmsize total times - #print(pattern) - #print(f"sending to regex: {search_region}") + pattern = ( + "[PFIWLVMYCATGS]{" + str(tmsize) + "}" + ) # searches for these hydrophobic residues tmsize total times + # print(pattern) + # print(f"sending to regex: {search_region}") if re.search( - ("[K]"), search_region[1:8]): # grabbing one below with search region, so I want to grab one ahead here when I query. - store_search = re.search(("[K]"), search_region[1:8]) # storing regex object + ("[K]"), search_region[1:8] + ): # grabbing one below with search region, so I want to grab one ahead here when I query. + store_search = re.search( + ("[K]"), search_region[1:8] + ) # storing regex object where_we_are = store_search.start() # finding where we got the hit if re.search( ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] ) and re.search( ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1] ): # hydrophobic neighbor - #try: - g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group() + # try: + g = re.search( + ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] + ).group() backend = check_back_end_snorkels(search_region, tmsize) if backend == "match": if isp_mode: g = re.search((pattern), search_region).group() - end_of_tmd = re.search((g), s).end()+1 + end_of_tmd = re.search((g), s).end() + 1 amt_peri = len(s) - end_of_tmd if peri_min <= amt_peri <= peri_max: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) tmd.append(new_pair) else: tmd.append(pair) else: continue - #else: - #print("I'm continuing out of snorkel loop") - #print(f"{search_region}") - #continue + # else: + # print("I'm continuing out of snorkel loop") + # print(f"{search_region}") + # continue if re.search((pattern), search_region): - #print(f"found match: {}") - #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") - #try: + # print(f"found match: {}") + # print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") + # try: if isp_mode: g = re.search((pattern), search_region).group() - end_of_tmd = re.search((g), s).end()+1 + end_of_tmd = re.search((g), s).end() + 1 amt_peri = len(s) - end_of_tmd if peri_min <= amt_peri <= peri_max: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) tmd.append(new_pair) else: tmd.append(pair) @@ -183,13 +203,15 @@ def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, return tmd -def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False): +def find_lipobox( + pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False +): """ - Function that takes an input tuple, and will return 
pairs of sequences to their description that have a lipoobox - ---> minimum - min distance from start codon to first AA of lipobox - ---> maximum - max distance from start codon to first AA of lipobox - ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy - + Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox + ---> minimum - min distance from start codon to first AA of lipobox + ---> maximum - max distance from start codon to first AA of lipobox + ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy + """ if regex == 1: pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl @@ -199,19 +221,23 @@ def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, rege candidates = [] s = str(pair[1]) # print(s) # trouble shooting - search_region = s[minimum-1 : maximum + 5] # properly slice the input... add 4 to catch if it hangs off at max input + search_region = s[ + minimum - 1 : maximum + 5 + ] # properly slice the input... add 4 to catch if it hangs off at max input # print(search_region) # trouble shooting - patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"] + patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"] for pattern in patterns: - #print(pattern) # trouble shooting + # print(pattern) # trouble shooting if re.search((pattern), search_region): # lipobox must be WITHIN the range... # searches the sequence with the input RegEx AND omits if - g = re.search((pattern), search_region).group() # find the exact group match + g = re.search( + (pattern), search_region + ).group() # find the exact group match amt_peri = len(s) - re.search((g), s).end() + 1 - if min_after <= amt_peri <= max_after: # find the lipobox end region + if min_after <= amt_peri <= max_after: # find the lipobox end region if osp_mode: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) candidates.append(new_pair) else: candidates.append(pair) @@ -221,9 +247,9 @@ def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, rege def tuple_fasta(fasta_file): """ - #### INPUT: Fasta File - #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence - #### + #### INPUT: Fasta File + #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence + #### """ fasta = SeqIO.parse(fasta_file, "fasta") descriptions = [] @@ -281,10 +307,10 @@ def splitStrands(text, strand="+"): def parse_a_range(pair, start, end): """ - Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range - ---> data : fasta tuple data - ---> start : start range to keep - ---> end : end range to keep (will need to + 1) + Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range + ---> data : fasta tuple data + ---> start : start range to keep + ---> end : end range to keep (will need to + 1) """ matches = [] for each_pair in pair: @@ -310,12 +336,18 @@ def grabLocs(text): Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module from cpt.py """ - start = re.search(("[\d]+\.\."), text).group(0) # Start of the sequence ; looks for [numbers].. 
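The location-parsing regexes reflowed in this hunk can be exercised in isolation. A small illustration against a hypothetical ORF description (assuming the upstream format "ORF<N> ... <start>..<end> ... [1] or [-1]"):

    import re

    text = "ORF012 [234..567] [1]"  # hypothetical description string
    start = re.search(r"[\d]+\.\.", text).group(0)  # "234.."
    end = re.search(r"\.\.[\d]+", text).group(0)    # "..567"
    orf = re.search(r"(ORF)[\d]+", text).group(0)   # "ORF012"
    strand = "+" if re.search(r"(\[1\])", text) else "-"
    print(int(start.split("..")[0]), int(end.split("..")[1]), orf, strand)
    # -> 234 567 ORF012 +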
- end = re.search(("\.\.[\d]+"), text).group(0) # End of the sequence ; Looks for ..[numbers] - orf = re.search(("(ORF)[\d]+"), text).group(0) # Looks for ORF and the numbers that are after it - if re.search(("(\[1\])"), text): # stores strand + start = re.search(("[\d]+\.\."), text).group( + 0 + ) # Start of the sequence ; looks for [numbers].. + end = re.search(("\.\.[\d]+"), text).group( + 0 + ) # End of the sequence ; Looks for ..[numbers] + orf = re.search(("(ORF)[\d]+"), text).group( + 0 + ) # Looks for ORF and the numbers that are after it + if re.search(("(\[1\])"), text): # stores strand strand = "+" - elif re.search(("(\[-1\])"), text): # stores strand + elif re.search(("(\[-1\])"), text): # stores strand strand = "-" start = int(start.split("..")[0]) end = int(end.split("..")[1]) @@ -329,7 +361,7 @@ def spaninProximity(isp, osp, max_dist=30): _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_ Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary ( * 3) - I modified this on 07.30.2020 to bypass the pick + or - strand. To + I modified this on 07.30.2020 to bypass the pick + or - strand. To INPUT: list of OSP and ISP candidates OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list """ @@ -358,13 +390,27 @@ def spaninProximity(isp, osp, max_dist=30): elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]: ### OVERLAP / SEPARATE ### if (iseq[1] - oseq[0]) < 6: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] separate[iseq[2]] += [combo] else: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] overlap[iseq[2]] += [combo] elif iseq[1] <= oseq[0] <= iseq[1] + max_dist: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] separate[iseq[2]] += [combo] else: continue @@ -383,13 +429,27 @@ def spaninProximity(isp, osp, max_dist=30): embedded[iseq[2]] += [combo] elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]: if (oseq[1] - iseq[0]) < 6: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] separate[iseq[2]] += [combo] else: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] overlap[iseq[2]] += [combo] elif iseq[0] - 10 < oseq[1] < iseq[0]: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] separate[iseq[2]] += [combo] else: continue @@ -402,7 +462,8 @@ def spaninProximity(isp, osp, max_dist=30): def check_for_usp(): - " pass " + "pass" + ############################################### TEST RANGE ######################################################################### #################################################################################################################################### @@ -454,7 +515,7 @@ def check_for_usp(): pairs = zip(test_desc, test_seq) lipo = [] for each_pair in pairs: - #print(each_pair) + # print(each_pair) # try: try: lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8) diff --git a/cpt_fix_aragorn/fix-aragorn-gff3.py 
b/cpt_fix_aragorn/fix-aragorn-gff3.py index 10194bd..febce7f 100755 --- a/cpt_fix_aragorn/fix-aragorn-gff3.py +++ b/cpt_fix_aragorn/fix-aragorn-gff3.py @@ -13,19 +13,22 @@ def fixed_feature(rec): for idx, feature in enumerate( feature_lambda( - rec.features, feature_test_type, {"types": ["tRNA", "tmRNA"]}, subfeatures=True + rec.features, + feature_test_type, + {"types": ["tRNA", "tmRNA"]}, + subfeatures=True, ) ): - + fid = "%s-%03d" % (feature.type, 1 + idx) try: name = [feature.type + "-" + feature.qualifiers["Codon"][0]] except KeyError: - name = [feature.qualifiers['product'][0]] + name = [feature.qualifiers["product"][0]] try: - origSource = feature.qualifiers["source"][0] + origSource = feature.qualifiers["source"][0] except: - origSource = "." + origSource = "." gene = gffSeqFeature( location=feature.location, type="gene", diff --git a/cpt_fix_aragorn/gff3.py b/cpt_fix_aragorn/gff3.py index d4795d4..48496c3 100755 --- a/cpt_fix_aragorn/gff3.py +++ b/cpt_fix_aragorn/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_fix_sixpack/gff3.py b/cpt_fix_sixpack/gff3.py index d4795d4..48496c3 100755 --- a/cpt_fix_sixpack/gff3.py +++ b/cpt_fix_sixpack/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_gbk_adjacent/adjacent_features.py b/cpt_gbk_adjacent/adjacent_features.py index ae8d2c6..5aaf672 100755 --- a/cpt_gbk_adjacent/adjacent_features.py +++ b/cpt_gbk_adjacent/adjacent_features.py @@ -64,10 +64,9 @@ def extract_features( continue if "codon_start" in feat.qualifiers: - offset = 1 - int(feat.qualifiers["codon_start"][0]) + offset = 1 - int(feat.qualifiers["codon_start"][0]) else: - offset = 0 - + offset = 0 temp = gbk.seq[feat.location.start : feat.location.end] if feat.location.strand == -1: @@ -90,7 +89,9 @@ def extract_features( "++++++++" ] # Junk value for genesOnly flag - if (gSeq == fSeq) and (protID == feat.qualifiers["protein_id"][0] or forceSeqID == False): + if (gSeq == fSeq) and ( + protID == feat.qualifiers["protein_id"][0] or forceSeqID == False + ): goBack = num - 1 goAhead = num + 1 numBack = behind @@ -120,10 +121,9 @@ def extract_features( backList.reverse() if feat.location.strand == -1: - tmpList = aheadList - aheadList = backList - backList = tmpList - + tmpList = aheadList + aheadList = backList + backList = tmpList for item in backList: addition = "" diff --git a/cpt_gbk_compare/gbk_compare.py b/cpt_gbk_compare/gbk_compare.py index 1c61555..96d492e 100644 --- a/cpt_gbk_compare/gbk_compare.py +++ b/cpt_gbk_compare/gbk_compare.py @@ -23,28 +23,38 @@ def addArr(arrA, arrB): res = [] for x in range(0, min(len(arrA), len(arrB))): - res.append(arrA[x] + arrB[x]) + res.append(arrA[x] + arrB[x]) return res -def 
get_arguments(): - parser = argparse.ArgumentParser(description='Compare GenBank annotations', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('annotation_1', type=str, - help='First annotated genome in Genbank format') - parser.add_argument('annotation_2', type=str, - help='Second annotated genome in Genbank format') +def get_arguments(): + parser = argparse.ArgumentParser( + description="Compare GenBank annotations", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) - parser.add_argument('--match_identity_threshold', type=float, default=0.7, - help='Two genes must have at least this identity to be considerd the same (0.0 to 1.0)') - parser.add_argument('--allowed_skipped_genes', type=int, default=10, - help='This many missing genes are allowed when aligning the annotations') - parser.add_argument("--addNotes", action="store_true", help="Add Note fields") + parser.add_argument( + "annotation_1", type=str, help="First annotated genome in Genbank format" + ) + parser.add_argument( + "annotation_2", type=str, help="Second annotated genome in Genbank format" + ) parser.add_argument( - "-sumOut", type=argparse.FileType("w"), help="Summary out file" + "--match_identity_threshold", + type=float, + default=0.7, + help="Two genes must have at least this identity to be considerd the same (0.0 to 1.0)", + ) + parser.add_argument( + "--allowed_skipped_genes", + type=int, + default=10, + help="This many missing genes are allowed when aligning the annotations", ) + parser.add_argument("--addNotes", action="store_true", help="Add Note fields") + + parser.add_argument("-sumOut", type=argparse.FileType("w"), help="Summary out file") args = parser.parse_args() return args @@ -53,35 +63,49 @@ def main(): args = get_arguments() # Load in the CDS features from the two assemblies. - old = SeqIO.parse(args.annotation_1, 'genbank') - new = SeqIO.parse(args.annotation_2, 'genbank') + old = SeqIO.parse(args.annotation_1, "genbank") + new = SeqIO.parse(args.annotation_2, "genbank") old_record = next(old) new_record = next(new) old_features, new_features = [], [] for f in old_record.features: - if f.type == 'CDS': + if f.type == "CDS": old_features.append(f) for f in new_record.features: - if f.type == 'CDS': + if f.type == "CDS": new_features.append(f) - args.sumOut.write('Features in First Genbank\'s assembly:\t' + str(len(old_features)) + "\n") - args.sumOut.write('Features in Second Genbank\'s assembly:\t' + str(len(new_features)) + "\n\n") + args.sumOut.write( + "Features in First Genbank's assembly:\t" + str(len(old_features)) + "\n" + ) + args.sumOut.write( + "Features in Second Genbank's assembly:\t" + str(len(new_features)) + "\n\n" + ) # Align the features to each other. 
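The nearest-first search order constructed just below (candidate (old, new) gene-skip counts, tried smallest total first so the alignment resynchronizes with as few skipped genes as possible) can be seen with a tiny standalone example, with allowed_skipped_genes shrunk to 2 for readability:

    import itertools

    allowed = 2  # stands in for args.allowed_skipped_genes
    offsets = sorted(
        itertools.product(range(allowed + 1), range(allowed + 1)),
        key=lambda x: x[0] + x[1],
    )
    print(offsets)
    # [(0, 0), (0, 1), (1, 0), (0, 2), (1, 1), (2, 0), (1, 2), (2, 1), (2, 2)]

sorted() accepts the product iterator directly; the list() wrapper in the code below is equivalent.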
- offsets = sorted(list(itertools.product(range(args.allowed_skipped_genes + 1), - range(args.allowed_skipped_genes + 1))), - key=lambda x: x[0]+x[1]) + offsets = sorted( + list( + itertools.product( + range(args.allowed_skipped_genes + 1), + range(args.allowed_skipped_genes + 1), + ) + ), + key=lambda x: x[0] + x[1], + ) old_i, new_i = 0, 0 exactRec = 0 - inexactRec = [0, 0 ,0] + inexactRec = [0, 0, 0] hypoRec = [0, 0, 0] newCount = 0 oldCount = 0 if args.addNotes: - print("First Record CDS Product\tSimilarity\tSecond Record CDS Product\tPercent Identity\tLength Difference\tFirst Gbk's CDS Location\tSecond Gbk's CDS Location\tHypothetical Status\tFirst Record's Notes\tSecond Record's Notes\n") + print( + "First Record CDS Product\tSimilarity\tSecond Record CDS Product\tPercent Identity\tLength Difference\tFirst Gbk's CDS Location\tSecond Gbk's CDS Location\tHypothetical Status\tFirst Record's Notes\tSecond Record's Notes\n" + ) else: - print("First Record CDS Product\tSimilarity\tSecond Record CDS Product\tPercent Identity\tLength Difference\tFirst Gbk's CDS Location\tSecond Gbk's CDS Location\tHypothetical Status\n") + print( + "First Record CDS Product\tSimilarity\tSecond Record CDS Product\tPercent Identity\tLength Difference\tFirst Gbk's CDS Location\tSecond Gbk's CDS Location\tHypothetical Status\n" + ) while True: if old_i >= len(old_features) and new_i >= len(new_features): break @@ -96,7 +120,13 @@ def main(): except IndexError: new_feature = None try: - match, identity, length_diff = compare_features(old_feature, new_feature, old_record, new_record, args.match_identity_threshold) + match, identity, length_diff = compare_features( + old_feature, + new_feature, + old_record, + new_record, + args.match_identity_threshold, + ) except TypeError: break if match: @@ -107,15 +137,25 @@ def main(): print_in_new_not_old(new_features[new_i + j]) newCount += 1 if identity == 1.0: - exactRec += 1 - res1, res2 = print_match(old_features[old_i + old_offset], new_features[new_i + new_offset], identity, length_diff, args.addNotes) + exactRec += 1 + res1, res2 = print_match( + old_features[old_i + old_offset], + new_features[new_i + new_offset], + identity, + length_diff, + args.addNotes, + ) inexactRec = addArr(inexactRec, res1) hypoRec = addArr(hypoRec, res2) old_i += old_offset new_i += new_offset break else: - sys.stderr.write("Exceeded allowed number of skipped genes (" + str(args.allowed_skipped_genes) + "), unable to maintain alignment and continue comparison.\n") + sys.stderr.write( + "Exceeded allowed number of skipped genes (" + + str(args.allowed_skipped_genes) + + "), unable to maintain alignment and continue comparison.\n" + ) exit(2) if old_feature is None and new_feature is None: @@ -124,108 +164,134 @@ def main(): old_i += 1 new_i += 1 - args.sumOut.write('Exact Match:\t' + str(exactRec) + "\n\n") + args.sumOut.write("Exact Match:\t" + str(exactRec) + "\n\n") - args.sumOut.write('Inexact Match:\t' + str(inexactRec[0] + inexactRec[1] + inexactRec[2]) + "\n") - args.sumOut.write(' Same length:\t' + str(inexactRec[0]) + "\n") - args.sumOut.write(' Second Gbk Seq longer:\t' + str(inexactRec[2]) + "\n") - args.sumOut.write(' First Gbk Seq longer:\t' + str(inexactRec[1]) + "\n\n") - - args.sumOut.write('In Second Gbk but not in first:\t' + str(newCount) + "\n") - args.sumOut.write('In First Gbk but not in second:\t' + str(oldCount) + "\n\n") + args.sumOut.write( + "Inexact Match:\t" + str(inexactRec[0] + inexactRec[1] + inexactRec[2]) + "\n" + ) + args.sumOut.write(" Same length:\t" + 
str(inexactRec[0]) + "\n") + args.sumOut.write(" Second Gbk Seq longer:\t" + str(inexactRec[2]) + "\n") + args.sumOut.write(" First Gbk Seq longer:\t" + str(inexactRec[1]) + "\n\n") + + args.sumOut.write("In Second Gbk but not in first:\t" + str(newCount) + "\n") + args.sumOut.write("In First Gbk but not in second:\t" + str(oldCount) + "\n\n") + + args.sumOut.write( + "Hypothetical Annotation Change:\t" + str(hypoRec[1] + hypoRec[2]) + "\n" + ) + args.sumOut.write("Hypothetical:\t" + str(hypoRec[0] + hypoRec[2]) + "\n") - args.sumOut.write('Hypothetical Annotation Change:\t' + str(hypoRec[1] + hypoRec[2]) + "\n") - args.sumOut.write('Hypothetical:\t' + str(hypoRec[0] + hypoRec[2]) + "\n") - def print_match(f1, f2, identity, length_diff, outNotes): - #print('', flush=True) - line = f1.qualifiers['product'][0] + "\t" + # print('', flush=True) + line = f1.qualifiers["product"][0] + "\t" matchArr = [0, 0, 0] hypoArr = [0, 0, 0] if identity == 1.0: -# print('Exact match') - line += 'Exact match\t' + f2.qualifiers['product'][0] + "\t100.0\tSame Length\t" + # print('Exact match') + line += "Exact match\t" + f2.qualifiers["product"][0] + "\t100.0\tSame Length\t" else: -# print('Inexact match (' + '%.2f' % (identity * 100.0) + '% ID, ', end='') - line += 'Inexact match\t' + f2.qualifiers['product'][0] + "\t%.2f\t" % (identity * 100.0) + # print('Inexact match (' + '%.2f' % (identity * 100.0) + '% ID, ', end='') + line += ( + "Inexact match\t" + + f2.qualifiers["product"][0] + + "\t%.2f\t" % (identity * 100.0) + ) if length_diff == 0: -# print('same length)') - line +="Same Length\t" + # print('same length)') + line += "Same Length\t" matchArr[0] += 1 elif length_diff > 0: -# print('old seq longer)') - line +="First Gbk Seq Longer\t" + # print('old seq longer)') + line += "First Gbk Seq Longer\t" matchArr[1] += 1 elif length_diff < 0: -# print('new seq longer)') - line +="Second Gbk Seq Longer\t" + # print('new seq longer)') + line += "Second Gbk Seq Longer\t" matchArr[2] += 1 -# print(' old: ', end='') -# print_feature_one_line(f1) + # print(' old: ', end='') + # print_feature_one_line(f1) line += print_feature_one_line(f1) + "\t" -# print(' new: ', end='') -# print_feature_one_line(f2) + # print(' new: ', end='') + # print_feature_one_line(f2) line += print_feature_one_line(f2) + "\t" - p1 = f1.qualifiers['product'][0].lower() - p2 = f2.qualifiers['product'][0].lower() - if 'hypothetical' in p1 and 'hypothetical' in p2: -# print(' still hypothetical') + p1 = f1.qualifiers["product"][0].lower() + p2 = f2.qualifiers["product"][0].lower() + if "hypothetical" in p1 and "hypothetical" in p2: + # print(' still hypothetical') line += "Hypothetical\t" hypoArr[0] += 1 - elif 'hypothetical' in p1 and 'hypothetical' not in p2: -# print(' no longer hypothetical') + elif "hypothetical" in p1 and "hypothetical" not in p2: + # print(' no longer hypothetical') line += "No Longer Hypothetical\t" hypoArr[1] += 1 - elif 'hypothetical' not in p1 and 'hypothetical' in p2: -# print(' became hypothetical') + elif "hypothetical" not in p1 and "hypothetical" in p2: + # print(' became hypothetical') line += "Became Hypothetical\t" hypoArr[2] += 1 else: line += "'Hypothetical' not in either Gbk's product tag" - + if outNotes: - line += "\t" - if "note" in f1.qualifiers.keys(): - for x in f1.qualifiers["note"]: - line += x line += "\t" - else: - line += "N/A\t" - if "note" in f2.qualifiers.keys(): - for x in f2.qualifiers["note"]: - line += x - else: - line += "N/A" + line += "\t" + if "note" in f1.qualifiers.keys(): + for x in 
f1.qualifiers["note"]: + line += x + line += "\t" + else: + line += "N/A\t" + if "note" in f2.qualifiers.keys(): + for x in f2.qualifiers["note"]: + line += x + else: + line += "N/A" print(line) return matchArr, hypoArr -def print_in_old_not_new(f): # rename file outputs - line = f.qualifiers['product'][0] + "\tIn First Gbk but not Second\tN/A\t0.00\t" + str(f.location.end - f.location.start) + "\t" + print_feature_one_line(f) + "\tN/A\tN/A" -# print('') -# print('In old but not in new:') -# print(' ', end='') -# print_feature_one_line(f) + +def print_in_old_not_new(f): # rename file outputs + line = ( + f.qualifiers["product"][0] + + "\tIn First Gbk but not Second\tN/A\t0.00\t" + + str(f.location.end - f.location.start) + + "\t" + + print_feature_one_line(f) + + "\tN/A\tN/A" + ) + # print('') + # print('In old but not in new:') + # print(' ', end='') + # print_feature_one_line(f) print(line) -def print_in_new_not_old(f): # rename file outputs - line = "N/A\tIn Second Gbk but not First\t" + f.qualifiers['product'][0] + "\t0.00\t" + str(f.location.end - f.location.start) + "\tN/A\t" + print_feature_one_line(f) + "\tN/A" - #print('') - #print('In new but not in old:') - #print(' ', end='') - #print_feature_one_line(f) +def print_in_new_not_old(f): # rename file outputs + line = ( + "N/A\tIn Second Gbk but not First\t" + + f.qualifiers["product"][0] + + "\t0.00\t" + + str(f.location.end - f.location.start) + + "\tN/A\t" + + print_feature_one_line(f) + + "\tN/A" + ) + # print('') + # print('In new but not in old:') + # print(' ', end='') + # print_feature_one_line(f) print(line) def print_feature_one_line(f): - #f_str = f.qualifiers['product'][0] + # f_str = f.qualifiers['product'][0] f_str = "" - strand = '+' if f.location.strand == 1 else '-' - f_str += '(' + str(f.location.start) + '-' + str(f.location.end) + ' ' + strand + ', ' - f_str += str(f.location.end - f.location.start) + ' bp)' - return(f_str) + strand = "+" if f.location.strand == 1 else "-" + f_str += ( + "(" + str(f.location.start) + "-" + str(f.location.end) + " " + strand + ", " + ) + f_str += str(f.location.end - f.location.start) + " bp)" + return f_str def compare_features(f1, f2, r1, r2, match_identity_threshold): @@ -241,5 +307,5 @@ def compare_features(f1, f2, r1, r2, match_identity_threshold): return match, identity, length_diff -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/cpt_gbk_to_5col/BIO_FIX_TOPO.py b/cpt_gbk_to_5col/BIO_FIX_TOPO.py index 846f9cc..4111d5a 100755 --- a/cpt_gbk_to_5col/BIO_FIX_TOPO.py +++ b/cpt_gbk_to_5col/BIO_FIX_TOPO.py @@ -2,10 +2,9 @@ def record_end(self, content): - """Clean up when we've finished the record. - """ - #from Bio import Alphabet - #from Bio.Alphabet import IUPAC + """Clean up when we've finished the record.""" + # from Bio import Alphabet + # from Bio.Alphabet import IUPAC from Bio.Seq import Seq, UnknownSeq # Try and append the version number to the accession for the full id @@ -24,8 +23,8 @@ def record_end(self, content): # first, determine the alphabet # we default to an generic alphabet if we don't have a # seq type or have strange sequence information. 
- - #seq_alphabet = Alphabet.generic_alphabet + + # seq_alphabet = Alphabet.generic_alphabet # now set the sequence sequence = "".join(self._seq_data) @@ -77,9 +76,9 @@ def record_end(self, content): self.data.annotations["topology"] = "linear" """ if not sequence and self.__expected_size: - self.data.seq = UnknownSeq(self._expected_size)#, seq_alphabet) + self.data.seq = UnknownSeq(self._expected_size) # , seq_alphabet) else: - self.data.seq = Seq(sequence)#, seq_alphabet) + self.data.seq = Seq(sequence) # , seq_alphabet) Bio.GenBank._FeatureConsumer.record_end = record_end diff --git a/cpt_gbk_to_gff/gbk_to_gff3.py b/cpt_gbk_to_gff/gbk_to_gff3.py index 5b71eed..e8c0183 100755 --- a/cpt_gbk_to_gff/gbk_to_gff3.py +++ b/cpt_gbk_to_gff/gbk_to_gff3.py @@ -10,24 +10,43 @@ bottomFeatTypes = ["exon", "RBS", "CDS"] + def makeGffFeat(inFeat, num, recName, identifier): - if inFeat.type == "RBS" or (inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"): - inFeat.type = "Shine_Dalgarno_sequence" + if inFeat.type == "RBS" or ( + inFeat.type == "regulatory" + and "regulatory_class" in inFeat.qualifiers.keys() + and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site" + ): + inFeat.type = "Shine_Dalgarno_sequence" if "codon_start" in inFeat.qualifiers.keys(): - shift = int(inFeat.qualifiers["codon_start"][0]) - 1 + shift = int(inFeat.qualifiers["codon_start"][0]) - 1 else: - shift = "." + shift = "." if identifier in inFeat.qualifiers.keys(): - name = inFeat.qualifiers[identifier][0] + "." + inFeat.type - if num > 0: - name += "." + str(num) + name = inFeat.qualifiers[identifier][0] + "." + inFeat.type + if num > 0: + name += "." + str(num) else: - name = recName + "." + inFeat.type + "." + str(num) - - outFeat = gffSeqFeature(inFeat.location, inFeat.type, '', inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff") - outFeat.qualifiers["ID"] = [name] + name = recName + "." + inFeat.type + "." + str(num) + + outFeat = gffSeqFeature( + inFeat.location, + inFeat.type, + "", + inFeat.strand, + name, + inFeat.qualifiers, + None, + None, + None, + shift, + 0, + "GbkToGff", + ) + outFeat.qualifiers["ID"] = [name] return outFeat + def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile): ofh = sys.stdout @@ -51,224 +70,408 @@ def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile): geneNum = 0 autoGeneNum = 0 for feat in rec.features: - if identifier not in feat.qualifiers.keys(): #Allow metadata features and other features with no ID (Output warning?) - AJC - if feat.type in bottomFeatTypes: - seekingParent.append([feat, [], []]) # [Feature, all parent candidates, strongest parent candidates] + if ( + identifier not in feat.qualifiers.keys() + ): # Allow metadata features and other features with no ID (Output warning?) 
- AJC + if feat.type in bottomFeatTypes: + seekingParent.append( + [feat, [], []] + ) # [Feature, all parent candidates, strongest parent candidates] + continue + elif feat.type not in topTypeDict.keys(): + topTypeDict[feat.type] = 1 + else: + topTypeDict[feat.type] += 1 + outFeats.append( + makeGffFeat(feat, topTypeDict[feat.type], recID, identifier) + ) continue - elif feat.type not in topTypeDict.keys(): - topTypeDict[feat.type] = 1 - else: - topTypeDict[feat.type] += 1 - outFeats.append(makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)) - continue elif feat.qualifiers[identifier][0] not in locBucket.keys(): - locBucket[feat.qualifiers[identifier][0]] = [] + locBucket[feat.qualifiers[identifier][0]] = [] locBucket[feat.qualifiers[identifier][0]].append(feat) for locus in locBucket.keys(): - minLoc = locBucket[locus][0].location.start - maxLoc = locBucket[locus][0].location.end - for feat in locBucket[locus]: - minLoc = min(minLoc, feat.location.start) - maxLoc = max(maxLoc, feat.location.end) - for x in seekingParent: - if x[0].location.start >= minLoc and x[0].location.end <= maxLoc: - x[1].append(locus) - if x[0].location.start == minLoc or x[0].location.end == maxLoc: - x[2].append(locus) + minLoc = locBucket[locus][0].location.start + maxLoc = locBucket[locus][0].location.end + for feat in locBucket[locus]: + minLoc = min(minLoc, feat.location.start) + maxLoc = max(maxLoc, feat.location.end) + for x in seekingParent: + if x[0].location.start >= minLoc and x[0].location.end <= maxLoc: + x[1].append(locus) + if x[0].location.start == minLoc or x[0].location.end == maxLoc: + x[2].append(locus) - for x in seekingParent: #Reformat to [Feature, Locus, Unused/Free] - if len(x[2]) == 1: - finList = "" - if len(x[1]) > 1: - for loc in x[1]: - if loc != x[2][0]: - finList += loc + ", " - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + finList[0:-2] + "." - else: - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived." - if "Notes" not in x[0].qualifiers.keys(): - x[0].qualifiers["Notes"] = [] - x[0].qualifiers["Notes"].append(finList) - x[1] = x[2][0] - elif len(x[2]) > 1: - candidate = x[2][0] #Arbitrarily choose first one - finList = "" - strongList = "" - for loc in x[2]: - if loc != candidate: - finList += loc + ", " - strongList += loc + ", " - for loc in x[1]: - if loc not in x[2]: - finList += loc + ", " - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " + finList[0:-2] + " (Equally strong candidate(s): " + strongList[0:-2] + ")." - if "Notes" not in x[0].qualifiers.keys(): - x[0].qualifiers["Notes"] = [] - x[0].qualifiers["Notes"].append(finList) - x[1] = candidate - elif len(x[1]) == 1: - x[1] = x[1][0] - if "Notes" not in x[0].qualifiers.keys(): - x[0].qualifiers["Notes"] = [] - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived." - x[0].qualifiers["Notes"].append(finList) - elif len(x[1]) > 1: - candidate = x[1][0] #Arbitrarily choose first one - finList = "" - for loc in x[1]: - if loc != candidate: - finList += loc + ", " - finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + finList[0:-2] + "." 
- if "Notes" not in x[0].qualifiers.keys(): - x[0].qualifiers["Notes"] = [] - x[0].qualifiers["Notes"].append(finList) - x[1] = candidate - else: - if makeGene: - sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "], creating standalone gene.\n") - autoGeneNum += 1 - x[0].source = "GbkToGff" - x[0].score = 0 - x[0].shift = 0 - if "ID" not in x[0].qualifiers.keys(): - x[0].qualifiers["ID"] = [recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)] - tempName = recID + ".derived_Gene." + str(autoGeneNum) - tempQuals = {"ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]} - tempGene = gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'gene', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") - if makeMRNA: - tempName = recID + ".derived_mRNA." + str(autoGeneNum) - tempQuals = {"ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} - tempGene.sub_features.append(gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'mRNA', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")) - tempGene.sub_features[-1].sub_features.append(x[0]) - else: - tempGene.sub_features.append(x[0]) - - - outFeats.append(tempGene) - else: - sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "].\n") - if x[0].type not in topTypeDict.keys(): - topTypeDict[x[0].type] = 1 - else: - topTypeDict[x[0].type] += 1 - outFeats.append(makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier)) - - for locus in locBucket.keys(): - if len(locBucket[locus]) == 1: # No heirarchy to be made - outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier)) - continue - topFeat = None - midFeat = None - bottomFeats = [] - typeDict = {} - minLoc = locBucket[locus][0].location.start - maxLoc = locBucket[locus][0].location.end - geneNum += 1 - for feat in locBucket[locus]: - # If we want to make our own top-level feat? 
- minLoc = min(minLoc, feat.location.start) - maxLoc = max(maxLoc, feat.location.end) - - # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot - if feat.type in ['gene']: - if not topFeat: - topFeat = feat - # Else handle multiple top features - elif feat.type in ['mRNA', 'tRNA', 'rRNA']: - if not midFeat: - midFeat = feat - # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats) - else: - if feat.type not in typeDict.keys(): - typeDict[feat.type] = 1 - else: - typeDict[feat.type] += 1 - bottomFeats.append(feat) - - for x in seekingParent: - if type(x[1]) != "list" and locus == x[1]: - x[0].qualifiers[identifier] = [locus] - bottomFeats.append(x[0]) - if x[0].type not in typeDict.keys(): - typeDict[x[0].type] = 1 - else: - typeDict[x[0].type] += 1 - - - - - - #if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds - - for x in typeDict.keys(): # If only 1, set it to 0 so we don't append a number to the name - if typeDict[x] == 1: # Else, set to 1 so that we count up as we encounter the features - typeDict[x] = 0 - else: - typeDict[x] = 1 - - if not topFeat: - if makeGene: - if midFeat: - possibleStrand = midFeat.strand - else: - possibleStrand = bottomFeats[0].strand - tempName = recID + ".gene." + str(geneNum) - tempQuals = {identifier : [locus], "ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]} - topFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, possibleStrand), 'gene', '', possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") + for x in seekingParent: # Reformat to [Feature, Locus, Unused/Free] + if len(x[2]) == 1: + finList = "" + if len(x[1]) > 1: + for loc in x[1]: + if loc != x[2][0]: + finList += loc + ", " + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + + finList[0:-2] + + "." + ) + else: + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived." + ) + if "Notes" not in x[0].qualifiers.keys(): + x[0].qualifiers["Notes"] = [] + x[0].qualifiers["Notes"].append(finList) + x[1] = x[2][0] + elif len(x[2]) > 1: + candidate = x[2][0] # Arbitrarily choose first one + finList = "" + strongList = "" + for loc in x[2]: + if loc != candidate: + finList += loc + ", " + strongList += loc + ", " + for loc in x[1]: + if loc not in x[2]: + finList += loc + ", " + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " + + finList[0:-2] + + " (Equally strong candidate(s): " + + strongList[0:-2] + + ")." + ) + if "Notes" not in x[0].qualifiers.keys(): + x[0].qualifiers["Notes"] = [] + x[0].qualifiers["Notes"].append(finList) + x[1] = candidate + elif len(x[1]) == 1: + x[1] = x[1][0] + if "Notes" not in x[0].qualifiers.keys(): + x[0].qualifiers["Notes"] = [] + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived." + ) + x[0].qualifiers["Notes"].append(finList) + elif len(x[1]) > 1: + candidate = x[1][0] # Arbitrarily choose first one + finList = "" + for loc in x[1]: + if loc != candidate: + finList += loc + ", " + finList = ( + str(x[0].type) + + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + + finList[0:-2] + + "." 
+ ) + if "Notes" not in x[0].qualifiers.keys(): + x[0].qualifiers["Notes"] = [] + x[0].qualifiers["Notes"].append(finList) + x[1] = candidate else: - sys.stderr.write("Unable to create a feature heirarchy at location [%d, %d] with features: \n" % (minLoc, maxLoc)) - for x in locBucket[locus]: - sys.stderr.write(str(x)) - sys.stderr.write('\n') - failed = 1 - continue + if makeGene: + sys.stderr.write( + "Warning: Unable to find potential parent for feature with no " + + identifier + + " of type " + + str(x[0].type) + + " at location [" + + str(x[0].location.start + 1) + + ", " + + str(x[0].location.end) + + "], creating standalone gene.\n" + ) + autoGeneNum += 1 + x[0].source = "GbkToGff" + x[0].score = 0 + x[0].shift = 0 + if "ID" not in x[0].qualifiers.keys(): + x[0].qualifiers["ID"] = [ + recID + ".standalone_" + x[0].type + "." + str(autoGeneNum) + ] + tempName = recID + ".derived_Gene." + str(autoGeneNum) + tempQuals = { + "ID": [tempName], + "Notes": [ + "Gene feature automatically generated by Gbk to GFF conversion" + ], + } + tempGene = gffSeqFeature( + FeatureLocation( + x[0].location.start, x[0].location.end, x[0].location.strand + ), + "gene", + "", + x[0].strand, + tempName, + tempQuals, + None, + None, + None, + ".", + 0, + "GbkToGff", + ) + if makeMRNA: + tempName = recID + ".derived_mRNA." + str(autoGeneNum) + tempQuals = { + "ID": [tempName], + "Notes": [ + "mRNA feature automatically generated by Gbk to GFF conversion" + ], + } + tempGene.sub_features.append( + gffSeqFeature( + FeatureLocation( + x[0].location.start, + x[0].location.end, + x[0].location.strand, + ), + "mRNA", + "", + x[0].strand, + tempName, + tempQuals, + None, + None, + None, + ".", + 0, + "GbkToGff", + ) + ) + tempGene.sub_features[-1].sub_features.append(x[0]) + else: + tempGene.sub_features.append(x[0]) - outFeats.append(makeGffFeat(topFeat, 0, recID, identifier)) - if not midFeat and topFeat.type == "gene" and makeMRNA: - if identifier in topFeat.qualifiers.keys(): - tempName = topFeat.qualifiers[identifier][0] + ".mRNA" - tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} - else: - tempName = outFeats[-1].ID + ".mRNA" - tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} - midFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, topFeat.strand), 'mRNA', '', topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") - - if midFeat: # Again, need a new if statement if we want to handle multiple mid-tier features - outFeats[-1].sub_features.append(makeGffFeat(midFeat, 0, recID, identifier)) - outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] - for x in bottomFeats: - typeDict[x.type] += 1 - outFeats[-1].sub_features[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier)) - outFeats[-1].sub_features[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].sub_features[-1].id] - else: # No midFeat, append bottom feats directly to top feats - for x in bottomFeats: - typeDict[x.type] += 1 - outFeats[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier)) - outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] - - outRec.append(SeqRecord(rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations)) - SeqIO.write([outRec[-1]], 
fastaFile, "fasta") - gffWrite(outRec, ofh) - exit(failed) # 0 if all features handled, 1 if unable to handle some + outFeats.append(tempGene) + else: + sys.stderr.write( + "Warning: Unable to find potential parent for feature with no " + + identifier + + " of type " + + str(x[0].type) + + " at location [" + + str(x[0].location.start + 1) + + ", " + + str(x[0].location.end) + + "].\n" + ) + if x[0].type not in topTypeDict.keys(): + topTypeDict[x[0].type] = 1 + else: + topTypeDict[x[0].type] += 1 + outFeats.append( + makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier) + ) + for locus in locBucket.keys(): + if len(locBucket[locus]) == 1: # No heirarchy to be made + outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier)) + continue + topFeat = None + midFeat = None + bottomFeats = [] + typeDict = {} + minLoc = locBucket[locus][0].location.start + maxLoc = locBucket[locus][0].location.end + geneNum += 1 + for feat in locBucket[locus]: + # If we want to make our own top-level feat? + minLoc = min(minLoc, feat.location.start) + maxLoc = max(maxLoc, feat.location.end) -if __name__ == '__main__': - parser = argparse.ArgumentParser( description='Biopython solution to Gbk to GFF conversion') + # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot + if feat.type in ["gene"]: + if not topFeat: + topFeat = feat + # Else handle multiple top features + elif feat.type in ["mRNA", "tRNA", "rRNA"]: + if not midFeat: + midFeat = feat + # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats) + else: + if feat.type not in typeDict.keys(): + typeDict[feat.type] = 1 + else: + typeDict[feat.type] += 1 + bottomFeats.append(feat) - parser.add_argument('inFile', type=argparse.FileType("r"), help='Path to an input GBK file' ) - parser.add_argument('--makeMRNA', action="store_true", required=False, help="Automatically create mRNA features") - parser.add_argument('--makeGene', action="store_true", required=False, help="Automatically create missing Gene features") - parser.add_argument('--identifier', type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from") - parser.add_argument('--fastaFile', type=argparse.FileType("w"), help='Fasta output for sequences' ) - parser.add_argument('--outFile', type=argparse.FileType("w"), help='GFF feature output' ) - args = parser.parse_args() - main(**vars(args)) + for x in seekingParent: + if type(x[1]) != "list" and locus == x[1]: + x[0].qualifiers[identifier] = [locus] + bottomFeats.append(x[0]) + if x[0].type not in typeDict.keys(): + typeDict[x[0].type] = 1 + else: + typeDict[x[0].type] += 1 + # if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds + for ( + x + ) in ( + typeDict.keys() + ): # If only 1, set it to 0 so we don't append a number to the name + if ( + typeDict[x] == 1 + ): # Else, set to 1 so that we count up as we encounter the features + typeDict[x] = 0 + else: + typeDict[x] = 1 + if not topFeat: + if makeGene: + if midFeat: + possibleStrand = midFeat.strand + else: + possibleStrand = bottomFeats[0].strand + tempName = recID + ".gene." 
+ str(geneNum) + tempQuals = { + identifier: [locus], + "ID": [tempName], + "Notes": [ + "Gene feature automatically generated by Gbk to GFF conversion" + ], + } + topFeat = gffSeqFeature( + FeatureLocation(minLoc, maxLoc, possibleStrand), + "gene", + "", + possibleStrand, + tempName, + tempQuals, + None, + None, + None, + ".", + 0, + "GbkToGff", + ) + else: + sys.stderr.write( + "Unable to create a feature hierarchy at location [%d, %d] with features: \n" + % (minLoc, maxLoc) + ) + for x in locBucket[locus]: + sys.stderr.write(str(x)) + sys.stderr.write("\n") + failed = 1 + continue + outFeats.append(makeGffFeat(topFeat, 0, recID, identifier)) + if not midFeat and topFeat.type == "gene" and makeMRNA: + if identifier in topFeat.qualifiers.keys(): + tempName = topFeat.qualifiers[identifier][0] + ".mRNA" + tempQuals = { + identifier: topFeat.qualifiers[identifier], + "ID": [tempName], + "Notes": [ + "mRNA feature automatically generated by Gbk to GFF conversion" + ], + } + else: + tempName = outFeats[-1].ID + ".mRNA" + tempQuals = { + identifier: topFeat.qualifiers[identifier], + "ID": [tempName], + "Notes": [ + "mRNA feature automatically generated by Gbk to GFF conversion" + ], + } + midFeat = gffSeqFeature( + FeatureLocation(minLoc, maxLoc, topFeat.strand), + "mRNA", + "", + topFeat.strand, + tempName, + tempQuals, + None, + None, + None, + ".", + 0, + "GbkToGff", + ) + if ( + midFeat + ): # Again, need a new if statement if we want to handle multiple mid-tier features + outFeats[-1].sub_features.append( + makeGffFeat(midFeat, 0, recID, identifier) + ) + outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] + for x in bottomFeats: + typeDict[x.type] += 1 + outFeats[-1].sub_features[-1].sub_features.append( + makeGffFeat(x, typeDict[x.type], recID, identifier) + ) + outFeats[-1].sub_features[-1].sub_features[-1].qualifiers[ + "Parent" + ] = [outFeats[-1].sub_features[-1].id] + else: # No midFeat, append bottom feats directly to top feats + for x in bottomFeats: + typeDict[x.type] += 1 + outFeats[-1].sub_features.append( + makeGffFeat(x, typeDict[x.type], recID, identifier) + ) + outFeats[-1].sub_features[-1].qualifiers["Parent"] = [ + outFeats[-1].id + ] + + outRec.append( + SeqRecord( + rec.seq, + recID, + rec.name, + rec.description, + rec.dbxrefs, + sorted(outFeats, key=lambda x: x.location.start), + rec.annotations, + rec.letter_annotations, + ) + ) + SeqIO.write([outRec[-1]], fastaFile, "fasta") + gffWrite(outRec, ofh) + exit(failed) # 0 if all features handled, 1 if unable to handle some +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Biopython solution to Gbk to GFF conversion" + ) + parser.add_argument( + "inFile", type=argparse.FileType("r"), help="Path to an input GBK file" + ) + parser.add_argument( + "--makeMRNA", + action="store_true", + required=False, + help="Automatically create mRNA features", + ) + parser.add_argument( + "--makeGene", + action="store_true", + required=False, + help="Automatically create missing Gene features", + ) + parser.add_argument( + "--identifier", + type=str, + default="locus_tag", + required=False, + help="Qualifier to derive ID property from", + ) + parser.add_argument( + "--fastaFile", type=argparse.FileType("w"), help="Fasta output for sequences" + ) + parser.add_argument( + "--outFile", type=argparse.FileType("w"), help="GFF feature output" + ) + args = parser.parse_args() + main(**vars(args)) diff --git a/cpt_genome_editor/genome_editor.py b/cpt_genome_editor/genome_editor.py index 
0f2599b..c110a15 100755 --- a/cpt_genome_editor/genome_editor.py +++ b/cpt_genome_editor/genome_editor.py @@ -38,7 +38,7 @@ def mutate(gff3, fasta, changes, customSeqs, new_id): covered = 0 for feat in rec.features: if "ID" in feat.qualifiers.keys(): - topFeats[feat.qualifiers["ID"][0]] = feat.location.start + topFeats[feat.qualifiers["ID"][0]] = feat.location.start for change in changes: if "," in change: (start, end, strand) = change.split(",") @@ -83,18 +83,18 @@ def mutate(gff3, fasta, changes, customSeqs, new_id): dbxrefs=True, ) tmp_req = convertSeqRec(tmp_req)[0] + def update_location(feature, shiftS): - feature.location = FeatureLocation(feature.location.start + shiftS, feature.location.end + shiftS, feature.strand) + feature.location = FeatureLocation( + feature.location.start + shiftS, + feature.location.end + shiftS, + feature.strand, + ) for i in feature.sub_features: - i = update_location(i, shiftS) + i = update_location(i, shiftS) return feature - - - #for feature in tmp_req.features: - - - + # for feature in tmp_req.features: chain.append( [ @@ -116,15 +116,19 @@ def update_location(feature, shiftS): # subfeatures, which means you will only get top-level features. startInd = len(new_record.features) new_record.features += tmp_req.features - + for i in new_record.features[startInd:]: - i.location = FeatureLocation(i.location.start + covered, i.location.end + covered, i.location.strand) + i.location = FeatureLocation( + i.location.start + covered, + i.location.end + covered, + i.location.strand, + ) if "ID" not in i.qualifiers.keys(): - continue + continue diffS = i.location.start - topFeats[i.qualifiers["ID"][0]] subFeats = i.sub_features for j in subFeats: - j = update_location(j, diffS) + j = update_location(j, diffS) else: new_record.seq += custom_seqs[change].seq yield new_record, chain diff --git a/cpt_genome_editor/gff3.py b/cpt_genome_editor/gff3.py index d4795d4..48496c3 100755 --- a/cpt_genome_editor/gff3.py +++ b/cpt_genome_editor/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_gff_add_parent/gff3.py b/cpt_gff_add_parent/gff3.py index d4795d4..48496c3 100755 --- a/cpt_gff_add_parent/gff3.py +++ b/cpt_gff_add_parent/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_gff_add_parent/gff3_add_parents_to_cds.py b/cpt_gff_add_parent/gff3_add_parents_to_cds.py index c15a254..33388f1 100755 --- a/cpt_gff_add_parent/gff3_add_parents_to_cds.py +++ b/cpt_gff_add_parent/gff3_add_parents_to_cds.py @@ -3,7 +3,8 @@ import logging import argparse from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature -#from Bio.SeqFeature import SeqFeature + +# from Bio.SeqFeature import SeqFeature from gff3 import feature_lambda, 
feature_test_type logging.basicConfig(level=logging.INFO) @@ -26,7 +27,11 @@ def fixed_feature(rec): mRNA = gffSeqFeature( location=feature.location, type="mRNA", - qualifiers={"source": ["cpt.fixModel"], "ID": ["%s.mRNA" % fid], "Parent": gene.qualifiers["ID"]}, + qualifiers={ + "source": ["cpt.fixModel"], + "ID": ["%s.mRNA" % fid], + "Parent": gene.qualifiers["ID"], + }, ) feature.qualifiers["ID"] = [fid + ".CDS"] feature.qualifiers["Parent"] = mRNA.qualifiers["ID"] diff --git a/cpt_gff_apollo_prep/gff3.py b/cpt_gff_apollo_prep/gff3.py index d4795d4..48496c3 100755 --- a/cpt_gff_apollo_prep/gff3.py +++ b/cpt_gff_apollo_prep/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_gff_apollo_prep/gff3_prep_for_apollo.py b/cpt_gff_apollo_prep/gff3_prep_for_apollo.py index 7fe01b6..6a6bad1 100755 --- a/cpt_gff_apollo_prep/gff3_prep_for_apollo.py +++ b/cpt_gff_apollo_prep/gff3_prep_for_apollo.py @@ -11,29 +11,28 @@ log = logging.getLogger(__name__) ALLOWED_FEATURES = [ - "mRNA", - "exon", - "transposable_element", - "tRNA", - "transcript", - "terminator", - "Shine_Dalgarno_Sequence", - "pseudogene", - "stop_codon_read_through", - "repeat_region", - "CDS", - "gene", - "rRNA", - "ncRNA", - "snRNA", - "snoRNA", - "miRNA", - ] + "mRNA", + "exon", + "transposable_element", + "tRNA", + "transcript", + "terminator", + "Shine_Dalgarno_Sequence", + "pseudogene", + "stop_codon_read_through", + "repeat_region", + "CDS", + "gene", + "rRNA", + "ncRNA", + "snRNA", + "snoRNA", + "miRNA", +] SPECIAL_REMOVED_FEATURES = ["gene_component_region", "sequence_difference"] - def add_exons(features): for gene in feature_lambda( features, feature_test_type, {"type": "gene"}, subfeatures=True @@ -44,19 +43,25 @@ def add_exons(features): exon_strand = None cds_list = [] - #for mRNA in gene.sub_features: + # for mRNA in gene.sub_features: # for x in mRNA.sub_features: # x.qualifiers["Parent"] = [gene.id] # gene.sub_features.append(x) - - for exon in feature_lambda(gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False,recurse=False): - #if the gene contains an exon, skip. + + for exon in feature_lambda( + gene.sub_features, + feature_test_type, + {"type": "exon"}, + subfeatures=False, + recurse=False, + ): + # if the gene contains an exon, skip. 
continue hasMRNA = False for x in gene.sub_features: - if x.type == "mRNA": - hasMRNA = True - mRNA = x + if x.type == "mRNA": + hasMRNA = True + mRNA = x """ if not hasMRNA: mRNA = gffSeqFeature( @@ -77,7 +82,7 @@ def add_exons(features): for x in clean_gene.sub_features: if x.type != "mRNA": x.qualifiers["Parent"] = [mRNA.id] """ - + # check for CDS child features of the gene, do not go a further step (this should skip any CDS children of exon child features) for cds in feature_lambda( gene.sub_features, @@ -85,7 +90,7 @@ def add_exons(features): {"type": "CDS"}, subfeatures=False, recurse=False, - ): + ): # check all CDS features for min/max boundaries if exon_start is None: exon_start = cds.location.start @@ -100,22 +105,22 @@ def add_exons(features): new_exon = gffSeqFeature( location=FeatureLocation(exon_start, exon_end), type="exon", - source = "cpt.prepApollo", + source="cpt.prepApollo", qualifiers={ "ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]], "Parent": [clean_gene.id], "ApolloExon": ["True"], }, sub_features=[], - strand=exon_strand + strand=exon_strand, ) for cds in cds_list: cds.qualifiers["Parent"] = new_exon.qualifiers["ID"] new_exon.sub_features.append(cds) - #gene.sub_features.append(new_exon) + # gene.sub_features.append(new_exon) # get all the other children of gene that AREN'T a CDS including the new exon clean_gene.sub_features.append(copy.deepcopy(new_exon)) - #clean_gene.sub_features.append(gffSeqFeature(location=FeatureLocation(exon_start, exon_end, exon_strand), type="exon", source = "cpt.prepApollo", qualifiers={"ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]], "Parent": clean_gene.qualifiers["ID"]}, sub_features=[], strand=exon_strand)) + # clean_gene.sub_features.append(gffSeqFeature(location=FeatureLocation(exon_start, exon_end, exon_strand), type="exon", source = "cpt.prepApollo", qualifiers={"ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]], "Parent": clean_gene.qualifiers["ID"]}, sub_features=[], strand=exon_strand)) """ for sf in feature_lambda( gene.sub_features, @@ -133,13 +138,16 @@ def add_exons(features): # return the cleaned gene with new exon yield clean_gene + def process_features(features): # change RBS to 'Shine_Dalgarno_sequence' - for rbs in feature_lambda(features, feature_test_type, {'type': "RBS"}): + for rbs in feature_lambda(features, feature_test_type, {"type": "RBS"}): rbs.type = "Shine_Dalgarno_sequence" # Filter top level features - for feature in feature_lambda(features, feature_test_type, {"types": ALLOWED_FEATURES}, subfeatures=True): + for feature in feature_lambda( + features, feature_test_type, {"types": ALLOWED_FEATURES}, subfeatures=True + ): cleaned_subfeatures = [] for sf in feature.sub_features: if sf.type in SPECIAL_REMOVED_FEATURES: @@ -147,13 +155,18 @@ def process_features(features): continue else: cleaned_subfeatures.append(sf) - feature.sub_features = copy.deepcopy(cleaned_subfeatures) + feature.sub_features = copy.deepcopy(cleaned_subfeatures) yield feature + def gff_filter(gff3): for rec in gffParse(gff3): - cleaned_features = sorted(list(process_features(rec.features)), key=lambda x: x.location.start) - rec.features = sorted(list(add_exons(cleaned_features)), key=lambda x: x.location.start) + cleaned_features = sorted( + list(process_features(rec.features)), key=lambda x: x.location.start + ) + rec.features = sorted( + list(add_exons(cleaned_features)), key=lambda x: x.location.start + ) rec.annotations = {} gffWrite([rec], sys.stdout) diff --git a/cpt_gff_extract_seq/gff3.py b/cpt_gff_extract_seq/gff3.py 
index d4795d4..48496c3 100755 --- a/cpt_gff_extract_seq/gff3.py +++ b/cpt_gff_extract_seq/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_gff_extract_seq/gff3_extract_sequence.py b/cpt_gff_extract_seq/gff3_extract_sequence.py index 22e0ca4..8e66746 100755 --- a/cpt_gff_extract_seq/gff3_extract_sequence.py +++ b/cpt_gff_extract_seq/gff3_extract_sequence.py @@ -17,7 +17,6 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): if feature_filter == "nice_cds": from gff2gb import gff3_to_genbank as cpt_Gff2Gbk - for rec in cpt_Gff2Gbk(gff3, fasta, 11): seenList = {} @@ -66,8 +65,10 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): else: feat.qualifiers["ID"] = [feat._ID] product = feat.qualifiers.get("product", "") - description = "{1} [Location={0.location};ID={0.qualifiers[ID][0]}]".format( - feat, product + description = ( + "{1} [Location={0.location};ID={0.qualifiers[ID][0]}]".format( + feat, product + ) ) yield [ SeqRecord( @@ -116,9 +117,21 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): description = "" else: if feat.strand == -1: - important_data = {"Location": FeatureLocation(feat.location.start + 1, feat.location.end - feat.phase, feat.strand)} + important_data = { + "Location": FeatureLocation( + feat.location.start + 1, + feat.location.end - feat.phase, + feat.strand, + ) + } else: - important_data = {"Location": FeatureLocation(feat.location.start + 1 + feat.phase, feat.location.end, feat.strand)} + important_data = { + "Location": FeatureLocation( + feat.location.start + 1 + feat.phase, + feat.location.end, + feat.strand, + ) + } if "Name" in feat.qualifiers: important_data["Name"] = feat.qualifiers.get("Name", [""])[0] @@ -130,48 +143,65 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): ] ) ) - #if feat.id == "CPT_Privateer_006.p01": - #print(feat) - #exit() - + # if feat.id == "CPT_Privateer_006.p01": + # print(feat) + # exit() + if isinstance(feat.location, CompoundLocation): - finSeq = "" - if feat.strand == -1: - for x in feat.location.parts: - finSeq += str((rec.seq[feat.location.start: feat.location.end - feat.phase]).reverse_complement()) - else: - for x in feat.location.parts: - finSeq += str(rec.seq[feat.location.start + feat.phase: feat.location.end]) - yield [ - SeqRecord( - finSeq, - id=nid.replace(" ", "-"), - description=description, - ) - ] + finSeq = "" + if feat.strand == -1: + for x in feat.location.parts: + # slice each part of the compound location (not the whole span) + finSeq += str( + ( + rec.seq[ + x.start : x.end - feat.phase + ] + ).reverse_complement() + ) + else: + for x in feat.location.parts: + finSeq += str( + rec.seq[ + x.start + feat.phase : x.end + ] + ) + yield [ + SeqRecord( + finSeq, + id=nid.replace(" ", "-"), + description=description, + ) + ] elif feat.strand == -1: - yield [ - SeqRecord( - (rec.seq[feat.location.start: feat.location.end - feat.phase]).reverse_complement(), - id=nid.replace(" ", "-"), - description=description, - ) - ] + yield [ + SeqRecord( + ( + rec.seq[ + feat.location.start : feat.location.end - feat.phase + ] + ).reverse_complement(), + id=nid.replace(" ", "-"), + 
description=description, + ) + ] else: - yield [ - SeqRecord( - #feat.extract(rec).seq, - rec.seq[feat.location.start + feat.phase: feat.location.end], - id=nid.replace(" ", "-"), - description=description, - ) - ] + yield [ + SeqRecord( + # feat.extract(rec).seq, + rec.seq[ + feat.location.start + feat.phase : feat.location.end + ], + id=nid.replace(" ", "-"), + description=description, + ) + ] rec.features = newfeats rec.annotations = {} - #gffWrite([rec], sys.stdout) + # gffWrite([rec], sys.stdout) else: seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta")) - + for rec in gffParse(gff3, base_dict=seq_dict): noMatch = True if "Alias" in rec.features[0].qualifiers.keys(): @@ -201,9 +231,21 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): description = "" else: if feat.strand == -1: - important_data = {"Location": FeatureLocation(feat.location.start + 1, feat.location.end - feat.phase, feat.strand)} + important_data = { + "Location": FeatureLocation( + feat.location.start + 1, + feat.location.end - feat.phase, + feat.strand, + ) + } else: - important_data = {"Location": FeatureLocation(feat.location.start + 1 + feat.phase, feat.location.end, feat.strand)} + important_data = { + "Location": FeatureLocation( + feat.location.start + 1 + feat.phase, + feat.location.end, + feat.strand, + ) + } if "Name" in feat.qualifiers: important_data["Name"] = feat.qualifiers.get("Name", [""])[0] @@ -217,40 +259,58 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): ) if isinstance(feat.location, CompoundLocation): - finSeq = "" - if feat.strand == -1: - for x in feat.location.parts: - finSeq += str((rec.seq[x.start: x.end - feat.phase]).reverse_complement()) - else: - for x in feat.location.parts: - finSeq += str(rec.seq[x.start + feat.phase: x.end]) - yield [ - SeqRecord( - Seq(finSeq), - id=id.replace(" ", "-"), - description=description, - ) - ] + finSeq = "" + if feat.strand == -1: + for x in feat.location.parts: + finSeq += str( + ( + rec.seq[x.start : x.end - feat.phase] + ).reverse_complement() + ) + else: + for x in feat.location.parts: + finSeq += str(rec.seq[x.start + feat.phase : x.end]) + yield [ + SeqRecord( + Seq(finSeq), + id=id.replace(" ", "-"), + description=description, + ) + ] else: - if feat.strand == -1: - yield [ - SeqRecord( - seq=Seq(str(rec.seq[feat.location.start: feat.location.end - feat.phase])).reverse_complement(), - id=id.replace(" ", "-"), - description=description, - ) - ] - else: - yield [ - SeqRecord( - #feat.extract(rec).seq, - seq=Seq(str(rec.seq[feat.location.start + feat.phase: feat.location.end])), - id=id.replace(" ", "-"), - description=description, - ) - ] + if feat.strand == -1: + yield [ + SeqRecord( + seq=Seq( + str( + rec.seq[ + feat.location.start : feat.location.end + - feat.phase + ] + ) + ).reverse_complement(), + id=id.replace(" ", "-"), + description=description, + ) + ] + else: + yield [ + SeqRecord( + # feat.extract(rec).seq, + seq=Seq( + str( + rec.seq[ + feat.location.start + + feat.phase : feat.location.end + ] + ) + ), + id=id.replace(" ", "-"), + description=description, + ) + ] if __name__ == "__main__": @@ -267,9 +327,9 @@ def main(fasta, gff3, feature_filter=None, nodesc=False): ) args = parser.parse_args() for seq in main(**vars(args)): - #if isinstance(seq, list): + # if isinstance(seq, list): # for x in seq: # print(type(x.seq)) # SeqIO.write(x, sys.stdout, "fasta") - #else: - SeqIO.write(seq, sys.stdout, "fasta") + # else: + SeqIO.write(seq, sys.stdout, "fasta") diff --git a/cpt_gff_rebase/gff3.py 
b/cpt_gff_rebase/gff3.py index d4795d4..48496c3 100755 --- a/cpt_gff_rebase/gff3.py +++ b/cpt_gff_rebase/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_gff_split/gff3_splitgff.py b/cpt_gff_split/gff3_splitgff.py index 87768d2..22da152 100755 --- a/cpt_gff_split/gff3_splitgff.py +++ b/cpt_gff_split/gff3_splitgff.py @@ -27,9 +27,9 @@ for record in gffParse(args.data): gffWrite([record], args.gff) record.description = "" - + if isinstance(record.seq, str): - record.seq = Seq(record.seq) - + record.seq = Seq(record.seq) + SeqIO.write([record], args.fasta, "fasta") sys.exit() diff --git a/cpt_gff_to_gbk/gff2gb.py b/cpt_gff_to_gbk/gff2gb.py index 5e7675f..9d5528b 100755 --- a/cpt_gff_to_gbk/gff2gb.py +++ b/cpt_gff_to_gbk/gff2gb.py @@ -11,7 +11,8 @@ import itertools import logging from Bio import SeqIO -#from Bio.Alphabet import generic_dna + +# from Bio.Alphabet import generic_dna from Bio.SeqFeature import CompoundLocation, FeatureLocation from CPT_GFFParser import gffParse, gffWrite from gff3 import ( @@ -48,7 +49,7 @@ def rename_key(ds, k_f, k_t): def gff3_to_genbank(gff_file, fasta_file, transltbl): - fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))#, generic_dna)) + fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta")) # , generic_dna)) gff_iter = gffParse(gff_file, fasta_input) for record in gff_iter: @@ -63,11 +64,11 @@ def handle_non_gene_features(features): {"type": "gene"}, subfeatures=False, invert=True, - recurse=True, # used to catch RBS from new apollo runs (used to be False) + recurse=True, # used to catch RBS from new apollo runs (used to be False) ): if feature.type in ( - "terminator", - "tRNA", + "terminator", + "tRNA", "Shine_Dalgarno_sequence", "sequence_feature", "recombination_feature", @@ -75,15 +76,12 @@ "binding_site", ): yield feature - elif feature.type in ( - "CDS", - ): + elif feature.type in ("CDS",): pass else: yield feature - def fminmax(feature): fmin = None fmax = None @@ -127,7 +125,7 @@ def fix_gene_qualifiers(name, feature, fid): if is_uuid(sf.qualifiers["Name"][0]): del sf.qualifiers["Name"] except KeyError: - continue # might should go back to pass, I have not put thought into this still + continue # may need to revert to pass; this has not been fully thought through # If it is the RBS exon (mis-labelled by apollo as 'exon') if sf.type == "exon" and len(sf) < 10: @@ -213,11 +211,11 @@ def fix_frameshifted(features): mRNA_1 = fixed_mrnas[1] if not noRBS: - mRNA_0.sub_features = [rbss[0], merge_a] - mRNA_1.sub_features = other + [rbss[1]] + mRNA_0.sub_features = [rbss[0], merge_a] + mRNA_1.sub_features = other + [rbss[1]] else: - mRNA_0.sub_features = [merge_a] - mRNA_1.sub_features = other + mRNA_0.sub_features = [merge_a] + mRNA_1.sub_features = other mRNA_0 = fix_gene_boundaries(mRNA_0) mRNA_1 = fix_gene_boundaries(mRNA_1) @@ -293,17 +291,16 @@ def remove_useless_features(features): # We use the full GO term, but it should be less than that. 
if f.type == "Shine_Dalgarno_sequence": f.type = "RBS" - - + if f.type == "sequence_feature": f.type = "misc_feature" - + if f.type == "recombination_feature": f.type = "misc_recomb" - + if f.type == "sequence_alteration": f.type = "variation" - + if f.type == "binding_site": f.type = "misc_binding" @@ -368,18 +365,18 @@ def handle_record(record, transltbl): # Wipe out the parent gene's data, leaving only a locus_tag feature.qualifiers = {"locus_tag": "CPT_%s_%03d" % (record.id, fid)} - + # Patch our features back in (even if they're non-gene features) replacement_feats.append(feature) - + replacement_feats = fix_frameshifts(replacement_feats) - #exit(0) + # exit(0) flat_features = feature_lambda( replacement_feats, lambda x: True, {}, subfeatures=True ) - + flat_features = remove_useless_features(flat_features) - + # Meat of our modifications for flat_feat in flat_features: # Try and figure out a name. We gave conflicting instructions, so @@ -409,7 +406,7 @@ def handle_record(record, transltbl): del flat_feat.qualifiers["Product"] elif flat_feat.type == "RBS": if "locus_tag" not in flat_feat.qualifiers.keys(): - continue + continue elif flat_feat.type == "terminator": flat_feat.type = "regulatory" @@ -452,5 +449,5 @@ def handle_record(record, transltbl): for record in gff3_to_genbank(**vars(args)): record.annotations["molecule_type"] = "DNA" - #record.seq.alphabet = generic_dna + # record.seq.alphabet = generic_dna SeqIO.write([record], sys.stdout, "genbank") diff --git a/cpt_gff_to_gbk/gff3.py b/cpt_gff_to_gbk/gff3.py index d4795d4..48496c3 100755 --- a/cpt_gff_to_gbk/gff3.py +++ b/cpt_gff_to_gbk/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_helical_wheel/generateHelicalWheel.py b/cpt_helical_wheel/generateHelicalWheel.py index bfa83be..fff4029 100644 --- a/cpt_helical_wheel/generateHelicalWheel.py +++ b/cpt_helical_wheel/generateHelicalWheel.py @@ -1,86 +1,131 @@ -## +## import argparse from plotWheels.helical_wheel import helical_wheel if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate Helical Wheel") - parser.add_argument("--sequence",dest="sequence",type=str) - parser.add_argument("--seqRange",dest="seqRange",type=int,default=1) - parser.add_argument("--t_size",dest="t_size",type=int,default=32) - parser.add_argument("--rotation",dest="rotation",type=int,default=90) - parser.add_argument("--numbering",action="store_true",help="numbering for helical wheel") - parser.add_argument("--output",dest="output",type=argparse.FileType("wb"), default="_helicalwheel.png")#dest="output",default="_helicalwheel.png") + parser.add_argument("--sequence", dest="sequence", type=str) + parser.add_argument("--seqRange", dest="seqRange", type=int, default=1) + parser.add_argument("--t_size", dest="t_size", type=int, default=32) + parser.add_argument("--rotation", dest="rotation", type=int, default=90) + parser.add_argument( + "--numbering", action="store_true", help="numbering for helical wheel" + ) + parser.add_argument( + "--output", + dest="output", + type=argparse.FileType("wb"), + default="_helicalwheel.png", + ) # 
dest="output",default="_helicalwheel.png") #### circle colors - parser.add_argument("--f_A",dest="f_A", default="#ffcc33") - parser.add_argument("--f_C",dest="f_C",default="#b5b5b5") - parser.add_argument("--f_D",dest="f_D",default="#db270f") - parser.add_argument("--f_E",dest="f_E",default="#db270f") - parser.add_argument("--f_F",dest="f_F",default="#ffcc33") - parser.add_argument("--f_G",dest="f_G",default="#b5b5b5") - parser.add_argument("--f_H",dest="f_H",default="#12d5fc") - parser.add_argument("--f_I",dest="f_I",default="#ffcc33") - parser.add_argument("--f_K",dest="f_K",default="#12d5fc") - parser.add_argument("--f_L",dest="f_L",default="#ffcc33") - parser.add_argument("--f_M",dest="f_M",default="#ffcc33") - parser.add_argument("--f_N",dest="f_N",default="#b5b5b5") - parser.add_argument("--f_P",dest="f_P",default="#ffcc33") - parser.add_argument("--f_Q",dest="f_Q",default="#b5b5b5") - parser.add_argument("--f_R",dest="f_R",default="#12d5fc") - parser.add_argument("--f_S",dest="f_S",default="#b5b5b5") - parser.add_argument("--f_T",dest="f_T",default="#b5b5b5") - parser.add_argument("--f_V",dest="f_V",default="#ffcc33") - parser.add_argument("--f_W",dest="f_W",default="#ffcc33") - parser.add_argument("--f_Y",dest="f_Y",default="#b5b5b5") + parser.add_argument("--f_A", dest="f_A", default="#ffcc33") + parser.add_argument("--f_C", dest="f_C", default="#b5b5b5") + parser.add_argument("--f_D", dest="f_D", default="#db270f") + parser.add_argument("--f_E", dest="f_E", default="#db270f") + parser.add_argument("--f_F", dest="f_F", default="#ffcc33") + parser.add_argument("--f_G", dest="f_G", default="#b5b5b5") + parser.add_argument("--f_H", dest="f_H", default="#12d5fc") + parser.add_argument("--f_I", dest="f_I", default="#ffcc33") + parser.add_argument("--f_K", dest="f_K", default="#12d5fc") + parser.add_argument("--f_L", dest="f_L", default="#ffcc33") + parser.add_argument("--f_M", dest="f_M", default="#ffcc33") + parser.add_argument("--f_N", dest="f_N", default="#b5b5b5") + parser.add_argument("--f_P", dest="f_P", default="#ffcc33") + parser.add_argument("--f_Q", dest="f_Q", default="#b5b5b5") + parser.add_argument("--f_R", dest="f_R", default="#12d5fc") + parser.add_argument("--f_S", dest="f_S", default="#b5b5b5") + parser.add_argument("--f_T", dest="f_T", default="#b5b5b5") + parser.add_argument("--f_V", dest="f_V", default="#ffcc33") + parser.add_argument("--f_W", dest="f_W", default="#ffcc33") + parser.add_argument("--f_Y", dest="f_Y", default="#b5b5b5") ### text colors - parser.add_argument("--t_A",dest="t_A",default="k") - parser.add_argument("--t_C",dest="t_C",default="k") - parser.add_argument("--t_D",dest="t_D",default="w") - parser.add_argument("--t_E",dest="t_E",default="w") - parser.add_argument("--t_F",dest="t_F",default="k") - parser.add_argument("--t_G",dest="t_G",default="k") - parser.add_argument("--t_H",dest="t_H",default="k") - parser.add_argument("--t_I",dest="t_I",default="k") - parser.add_argument("--t_K",dest="t_K",default="k") - parser.add_argument("--t_L",dest="t_L",default="k") - parser.add_argument("--t_M",dest="t_M",default="k") - parser.add_argument("--t_N",dest="t_N",default="k") - parser.add_argument("--t_P",dest="t_P",default="k") - parser.add_argument("--t_Q",dest="t_Q",default="k") - parser.add_argument("--t_R",dest="t_R",default="k") - parser.add_argument("--t_S",dest="t_S",default="k") - parser.add_argument("--t_T",dest="t_T",default="k") - parser.add_argument("--t_V",dest="t_V",default="k") - parser.add_argument("--t_W",dest="t_W",default="k") - 
parser.add_argument("--t_Y",dest="t_Y",default="k") + parser.add_argument("--t_A", dest="t_A", default="k") + parser.add_argument("--t_C", dest="t_C", default="k") + parser.add_argument("--t_D", dest="t_D", default="w") + parser.add_argument("--t_E", dest="t_E", default="w") + parser.add_argument("--t_F", dest="t_F", default="k") + parser.add_argument("--t_G", dest="t_G", default="k") + parser.add_argument("--t_H", dest="t_H", default="k") + parser.add_argument("--t_I", dest="t_I", default="k") + parser.add_argument("--t_K", dest="t_K", default="k") + parser.add_argument("--t_L", dest="t_L", default="k") + parser.add_argument("--t_M", dest="t_M", default="k") + parser.add_argument("--t_N", dest="t_N", default="k") + parser.add_argument("--t_P", dest="t_P", default="k") + parser.add_argument("--t_Q", dest="t_Q", default="k") + parser.add_argument("--t_R", dest="t_R", default="k") + parser.add_argument("--t_S", dest="t_S", default="k") + parser.add_argument("--t_T", dest="t_T", default="k") + parser.add_argument("--t_V", dest="t_V", default="k") + parser.add_argument("--t_W", dest="t_W", default="k") + parser.add_argument("--t_Y", dest="t_Y", default="k") args = parser.parse_args() - - #print(type(args.output)) + # print(type(args.output)) - f_colors = [args.f_A,args.f_C,args.f_D,args.f_E,args.f_F,args.f_G,args.f_H,args.f_I,args.f_K, - args.f_L,args.f_M,args.f_N,args.f_P,args.f_Q,args.f_R,args.f_S,args.f_T,args.f_V, - args.f_W,args.f_Y] + f_colors = [ + args.f_A, + args.f_C, + args.f_D, + args.f_E, + args.f_F, + args.f_G, + args.f_H, + args.f_I, + args.f_K, + args.f_L, + args.f_M, + args.f_N, + args.f_P, + args.f_Q, + args.f_R, + args.f_S, + args.f_T, + args.f_V, + args.f_W, + args.f_Y, + ] + + t_colors = [ + args.t_A, + args.t_C, + args.t_D, + args.t_E, + args.t_F, + args.t_G, + args.t_H, + args.t_I, + args.t_K, + args.t_L, + args.t_M, + args.t_N, + args.t_P, + args.t_Q, + args.t_R, + args.t_S, + args.t_T, + args.t_V, + args.t_W, + args.t_Y, + ] - t_colors = [args.t_A,args.t_C,args.t_D,args.t_E,args.t_F,args.t_G,args.t_H,args.t_I,args.t_K, - args.t_L,args.t_M,args.t_N,args.t_P,args.t_Q,args.t_R,args.t_S,args.t_T,args.t_V, - args.t_W,args.t_Y] - colors = [f_colors, t_colors] tmp_file = "./tmp.png" - helical_wheel(sequence=args.sequence, - colorcoding=colors[0], - text_color=colors[1], - seqRange=args.seqRange, - t_size=args.t_size, - rot=args.rotation, - numbering=args.numbering, - filename=tmp_file - ) - + helical_wheel( + sequence=args.sequence, + colorcoding=colors[0], + text_color=colors[1], + seqRange=args.seqRange, + t_size=args.t_size, + rot=args.rotation, + numbering=args.numbering, + filename=tmp_file, + ) + with open("tmp.png", "rb") as f: for line in f: args.output.write(line) diff --git a/cpt_helical_wheel/plotWheels/core.py b/cpt_helical_wheel/plotWheels/core.py index 2e68701..56e9051 100644 --- a/cpt_helical_wheel/plotWheels/core.py +++ b/cpt_helical_wheel/plotWheels/core.py @@ -33,7 +33,7 @@ class BaseSequence(object): """Base class for sequence classes in the module :mod:`modlamp.sequences`. It contains amino acid probabilities for different sequence generation classes. 
- + The following amino acid probabilities are used: (extracted from the `APD3 `_, March 17, 2016) @@ -61,7 +61,7 @@ class BaseSequence(object): W 0.05 0.0155 0.0201275 0.05555555 Y 0.05 0.0244 0.0290275 0.05555555 === ==== ====== ========= ========== - + """ def __init__(self, seqnum, lenmin=7, lenmax=28): @@ -71,7 +71,7 @@ def __init__(self, seqnum, lenmin=7, lenmax=28): :param lenmax: maximal length of the generated sequences :return: attributes :py:attr:`seqnum`, :py:attr:`lenmin` and :py:attr:`lenmax`. :Example: - + >>> b = BaseSequence(10, 7, 28) >>> b.seqnum 10 @@ -87,96 +87,604 @@ def __init__(self, seqnum, lenmin=7, lenmax=28): self.seqnum = int(seqnum) # AA classes: - self.AA_hyd = ['G', 'A', 'L', 'I', 'V'] - self.AA_basic = ['K', 'R'] - self.AA_acidic = ['D', 'E'] - self.AA_aroma = ['W', 'Y', 'F'] - self.AA_polar = ['S', 'T', 'Q', 'N'] + self.AA_hyd = ["G", "A", "L", "I", "V"] + self.AA_basic = ["K", "R"] + self.AA_acidic = ["D", "E"] + self.AA_aroma = ["W", "Y", "F"] + self.AA_polar = ["S", "T", "Q", "N"] # AA labels: - self.AAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] + self.AAs = [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + ] # AA probability from the APD3 database: - self.prob_AMP = [0.0766, 0.071, 0.026, 0.0264, 0.0405, 0.1172, 0.021, 0.061, 0.0958, 0.0838, 0.0123, 0.0386, - 0.0463, 0.0251, 0.0545, 0.0613, 0.0455, 0.0572, 0.0155, 0.0244] + self.prob_AMP = [ + 0.0766, + 0.071, + 0.026, + 0.0264, + 0.0405, + 0.1172, + 0.021, + 0.061, + 0.0958, + 0.0838, + 0.0123, + 0.0386, + 0.0463, + 0.0251, + 0.0545, + 0.0613, + 0.0455, + 0.0572, + 0.0155, + 0.0244, + ] # AA probability from the APD2 database without Cys and Met (synthesis reasons) - self.prob_AMPnoCM = [0.081228, 0., 0.030627, 0.031027, 0.045128, 0.121828, 0.025627, 0.065628, 0.100428, - 0.088428, 0., 0.043228, 0.050928, 0.029728, 0.059128, 0.065927, 0.050128, 0.061828, - 0.020128, 0.029028] + self.prob_AMPnoCM = [ + 0.081228, + 0.0, + 0.030627, + 0.031027, + 0.045128, + 0.121828, + 0.025627, + 0.065628, + 0.100428, + 0.088428, + 0.0, + 0.043228, + 0.050928, + 0.029728, + 0.059128, + 0.065927, + 0.050128, + 0.061828, + 0.020128, + 0.029028, + ] # equal AA probabilities: - self.prob = [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, - 0.05, 0.05, 0.05, 0.05] + self.prob = [ + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + 0.05, + ] # equal AA probabilities but 0 for Cys and Met: - self.prob_randnoCM = [0.05555555555, 0.0, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, - 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, 0.0, 0.05555555555, - 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, - 0.05555555555, 0.05555555555] + self.prob_randnoCM = [ + 0.05555555555, + 0.0, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.0, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + 0.05555555555, + ] # AA probability from the linear CancerPPD peptides: - self.prob_ACP = [0.14526966, 0., 0.00690031, 0.00780824, 0.06991102, 0.04957327, 0.01725077, 0.05647358, - 0.27637552, 0.17759216, 
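A note on the probability tables being reflowed in BaseSequence: each prob_* vector is index-aligned with self.AAs (20 one-letter codes) and is a probability distribution over residues, so it should sum to 1. A quick sanity check, assuming plotWheels.core imports cleanly:

import numpy as np
from plotWheels.core import BaseSequence  # assumed importable for this check

b = BaseSequence(1)
assert len(b.prob_AMP) == len(b.AAs) == 20  # one probability per residue
assert abs(sum(b.prob_AMP) - 1.0) < 1e-3    # sums to 1 (APD3 frequencies)
# Example: draw a 12-mer with the APD3-derived AMP composition.
seq = "".join(np.random.choice(b.AAs, size=12, p=b.prob_AMP))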
0.00998729, 0.00798983, 0.01307427, 0.00381333, 0.02941711, - 0.02651171, 0.0154349, 0.04013074, 0.0406755, 0.00581079] + self.prob_ACP = [ + 0.14526966, + 0.0, + 0.00690031, + 0.00780824, + 0.06991102, + 0.04957327, + 0.01725077, + 0.05647358, + 0.27637552, + 0.17759216, + 0.00998729, + 0.00798983, + 0.01307427, + 0.00381333, + 0.02941711, + 0.02651171, + 0.0154349, + 0.04013074, + 0.0406755, + 0.00581079, + ] # AA probabilities for perfect amphipathic helix of different arc sizes - self.prob_amphihel = [[0.04545455, 0., 0.04545454, 0.04545455, 0., 0.04545455, 0.04545455, 0., 0.25, 0., 0., - 0.04545454, 0.04545455, 0.04545454, 0.25, 0.04545454, 0.04545454, 0., 0., 0.04545454], - [0., 0., 0., 0., 0.16666667, 0., 0., 0.16666667, 0., 0.16666667, 0., 0., 0., 0., 0., 0., - 0., 0.16666667, 0.16666667, (1. - 0.16666667 * 5)]] + self.prob_amphihel = [ + [ + 0.04545455, + 0.0, + 0.04545454, + 0.04545455, + 0.0, + 0.04545455, + 0.04545455, + 0.0, + 0.25, + 0.0, + 0.0, + 0.04545454, + 0.04545455, + 0.04545454, + 0.25, + 0.04545454, + 0.04545454, + 0.0, + 0.0, + 0.04545454, + ], + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.16666667, + 0.0, + 0.0, + 0.16666667, + 0.0, + 0.16666667, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.16666667, + 0.16666667, + (1.0 - 0.16666667 * 5), + ], + ] # helical ACP AA probabilities, depending on the position of the AA in the helix. - self.prob_ACPhel = np.array([[0.0483871, 0., 0., 0.0483871, 0.01612903, 0.12903226, 0.03225807, 0.09677419, - 0.19354839, 0.5, 0.0483871, 0.11290323, 0.1, 0.18518519, 0.07843137, 0.12, - 0.17073172, 0.16666667], - [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.01612903, 0., 0., 0., 0., 0., - 0.02439024, - 0.19444444], - [0., 0.01612903, 0., 0.27419355, 0.01612903, 0., 0., 0.01612903, 0., 0., 0., 0., - 0., - 0., 0., 0., 0., 0.], - [0., 0., 0., 0., 0., 0., 0., 0.06451613, 0., 0.01612903, 0.0483871, 0.01612903, 0., - 0.01851852, 0., 0., 0., 0.], - [0.16129032, 0.0483871, 0.30645161, 0., 0.0483871, 0., 0., 0.01612903, 0., - 0.01612903, - 0., 0.09677419, 0.06666667, 0.01851852, 0., 0.02, 0.14634146, 0.], - [0.64516129, 0., 0.17741936, 0.14516129, 0., 0.01612903, 0.25806452, 0.11290323, - 0.06451613, 0.08064516, 0.22580645, 0.03225807, 0.06666667, 0.2037037, 0.1372549, - 0.1, 0., 0.05555556], - [0., 0., 0., 0.01612903, 0., 0., 0.01612903, 0., 0.03225807, 0., 0., 0.20967742, - 0., - 0., 0., 0.16, 0., 0.], - [0.0483871, 0.11290323, 0.01612903, 0.08064516, 0.33870968, 0.27419355, 0., - 0.0483871, 0.14516129, 0.06451613, 0.03225807, 0.06451613, 0.18333333, 0., 0., - 0.1, 0.26829268, 0.], - [0., 0.03225807, 0.01612903, 0.12903226, 0.12903226, 0., 0.38709677, 0.33870968, - 0.0483871, 0.03225807, 0.41935484, 0.08064516, 0., 0.03703704, 0.29411765, - 0.04, 0.02439024, 0.02777778], - [0.0483871, 0.70967742, 0.12903226, 0.0483871, 0.09677419, 0.32258064, 0.20967742, - 0.06451613, 0.11290323, 0.06451613, 0.03225807, 0.03225807, 0.28333333, - 0.24074074, - 0.03921569, 0.28, 0.07317073, 0.22222222], - [0., 0.01612903, 0.01612903, 0.0483871, 0.01612903, 0.03225807, 0., 0., 0., 0., - 0., 0., 0.03333333, 0., 0.01960784, 0.02, 0., 0.], - [0., 0.01612903, 0., 0., 0., 0., 0., 0., 0.01612903, 0., 0.03225807, 0., 0., 0., - 0.01960784, 0.02, 0., 0.], - [0., 0., 0.14516129, 0.01612903, 0.03225807, 0.01612903, 0., 0., 0., 0., - 0.01612903, 0., 0., 0.12962963, 0.17647059, 0., 0., 0.], - [0., 0., 0.01612903, 0.01612903, 0., 0., 0.01612903, 0., 0.01612903, 0., 0., - 0.01612903, 0., 0.01851852, 0., 0., 0., 0.], - [0., 0.01612903, 0.01612903, 0., 0.01612903, 0., 0.01612903, 0., 
0.01612903, - 0.01612903, 0.01612903, 0.01612903, 0., 0.01851852, 0.01960784, 0., 0.04878049, - 0.], - [0.01612903, 0., 0.01612903, 0.12903226, 0.03225807, 0.03225807, 0.0483871, - 0.17741936, 0., 0.03225807, 0.09677419, 0.0483871, 0.01666667, 0., 0.15686274, - 0.1, 0., 0.05555556], - [0.01612903, 0.01612903, 0., 0.01612903, 0.0483871, 0.01612903, 0., 0.01612903, 0., - 0.01612903, 0.01612903, 0.11290323, 0., 0.01851852, 0.03921569, 0.02, 0., - 0.05555556], - [0.01612903, 0.01612903, 0.01612903, 0.01612903, 0.20967742, 0.16129032, - 0.01612903, - 0.0483871, 0.33870968, 0.16129032, 0., 0.14516129, 0.25, 0.11111111, 0.01960784, - 0.02, 0.21951219, 0.22222222], - [0., 0., 0.12903226, 0.01612903, 0., 0., 0., 0., 0.01612903, 0., 0., 0., 0., 0., - 0., - 0., 0.02439024, 0.], - [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.01612903, 0., 0., 0., 0., 0., 0.]]) + self.prob_ACPhel = np.array( + [ + [ + 0.0483871, + 0.0, + 0.0, + 0.0483871, + 0.01612903, + 0.12903226, + 0.03225807, + 0.09677419, + 0.19354839, + 0.5, + 0.0483871, + 0.11290323, + 0.1, + 0.18518519, + 0.07843137, + 0.12, + 0.17073172, + 0.16666667, + ], + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.02439024, + 0.19444444, + ], + [ + 0.0, + 0.01612903, + 0.0, + 0.27419355, + 0.01612903, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ], + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.06451613, + 0.0, + 0.01612903, + 0.0483871, + 0.01612903, + 0.0, + 0.01851852, + 0.0, + 0.0, + 0.0, + 0.0, + ], + [ + 0.16129032, + 0.0483871, + 0.30645161, + 0.0, + 0.0483871, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.01612903, + 0.0, + 0.09677419, + 0.06666667, + 0.01851852, + 0.0, + 0.02, + 0.14634146, + 0.0, + ], + [ + 0.64516129, + 0.0, + 0.17741936, + 0.14516129, + 0.0, + 0.01612903, + 0.25806452, + 0.11290323, + 0.06451613, + 0.08064516, + 0.22580645, + 0.03225807, + 0.06666667, + 0.2037037, + 0.1372549, + 0.1, + 0.0, + 0.05555556, + ], + [ + 0.0, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.03225807, + 0.0, + 0.0, + 0.20967742, + 0.0, + 0.0, + 0.0, + 0.16, + 0.0, + 0.0, + ], + [ + 0.0483871, + 0.11290323, + 0.01612903, + 0.08064516, + 0.33870968, + 0.27419355, + 0.0, + 0.0483871, + 0.14516129, + 0.06451613, + 0.03225807, + 0.06451613, + 0.18333333, + 0.0, + 0.0, + 0.1, + 0.26829268, + 0.0, + ], + [ + 0.0, + 0.03225807, + 0.01612903, + 0.12903226, + 0.12903226, + 0.0, + 0.38709677, + 0.33870968, + 0.0483871, + 0.03225807, + 0.41935484, + 0.08064516, + 0.0, + 0.03703704, + 0.29411765, + 0.04, + 0.02439024, + 0.02777778, + ], + [ + 0.0483871, + 0.70967742, + 0.12903226, + 0.0483871, + 0.09677419, + 0.32258064, + 0.20967742, + 0.06451613, + 0.11290323, + 0.06451613, + 0.03225807, + 0.03225807, + 0.28333333, + 0.24074074, + 0.03921569, + 0.28, + 0.07317073, + 0.22222222, + ], + [ + 0.0, + 0.01612903, + 0.01612903, + 0.0483871, + 0.01612903, + 0.03225807, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.03333333, + 0.0, + 0.01960784, + 0.02, + 0.0, + 0.0, + ], + [ + 0.0, + 0.01612903, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.03225807, + 0.0, + 0.0, + 0.0, + 0.01960784, + 0.02, + 0.0, + 0.0, + ], + [ + 0.0, + 0.0, + 0.14516129, + 0.01612903, + 0.03225807, + 0.01612903, + 0.0, + 0.0, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.0, + 0.12962963, + 0.17647059, + 0.0, + 0.0, + 0.0, + ], + [ + 0.0, + 0.0, + 0.01612903, + 0.01612903, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.01612903, + 0.0, + 
0.0, + 0.01612903, + 0.0, + 0.01851852, + 0.0, + 0.0, + 0.0, + 0.0, + ], + [ + 0.0, + 0.01612903, + 0.01612903, + 0.0, + 0.01612903, + 0.0, + 0.01612903, + 0.0, + 0.01612903, + 0.01612903, + 0.01612903, + 0.01612903, + 0.0, + 0.01851852, + 0.01960784, + 0.0, + 0.04878049, + 0.0, + ], + [ + 0.01612903, + 0.0, + 0.01612903, + 0.12903226, + 0.03225807, + 0.03225807, + 0.0483871, + 0.17741936, + 0.0, + 0.03225807, + 0.09677419, + 0.0483871, + 0.01666667, + 0.0, + 0.15686274, + 0.1, + 0.0, + 0.05555556, + ], + [ + 0.01612903, + 0.01612903, + 0.0, + 0.01612903, + 0.0483871, + 0.01612903, + 0.0, + 0.01612903, + 0.0, + 0.01612903, + 0.01612903, + 0.11290323, + 0.0, + 0.01851852, + 0.03921569, + 0.02, + 0.0, + 0.05555556, + ], + [ + 0.01612903, + 0.01612903, + 0.01612903, + 0.01612903, + 0.20967742, + 0.16129032, + 0.01612903, + 0.0483871, + 0.33870968, + 0.16129032, + 0.0, + 0.14516129, + 0.25, + 0.11111111, + 0.01960784, + 0.02, + 0.21951219, + 0.22222222, + ], + [ + 0.0, + 0.0, + 0.12903226, + 0.01612903, + 0.0, + 0.0, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.02439024, + 0.0, + ], + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.01612903, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ], + ] + ) def save_fasta(self, filename, names=False): """Method to save generated sequences in a ``.FASTA`` formatted file. @@ -185,7 +693,7 @@ def save_fasta(self, filename, names=False): :param names: {bool} whether sequence names from :py:attr:`names` should be saved as sequence identifiers :return: a FASTA formatted file containing the generated sequences :Example: - + >>> b = BaseSequence(2) >>> b.sequences = ['KLLSLSLALDLLS', 'KLPERTVVNSSDF'] >>> b.names = ['Sequence1', 'Sequence2'] @@ -219,14 +727,14 @@ def mutate_AA(self, nr, prob): while cnt < nr: # mutate "nr" AA seq[random.choice(range(len(seq)))] = random.choice(self.AAs) cnt += 1 - self.sequences[s] = ''.join(seq) + self.sequences[s] = "".join(seq) def filter_duplicates(self): """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences` :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names. :Example: - + >>> b = BaseSequence(4) >>> b.sequences = ['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK'] >>> b.filter_duplicates() @@ -236,11 +744,15 @@ def filter_duplicates(self): .. versionadded:: v2.2.5 """ if not self.names: - self.names = ['Seq_' + str(i) for i in range(len(self.sequences))] - df = pd.DataFrame(list(zip(self.sequences, self.names)), columns=['Sequences', 'Names']) - df = df.drop_duplicates('Sequences', 'first') # keep first occurrence of duplicate - self.sequences = df['Sequences'].get_values().tolist() - self.names = df['Names'].get_values().tolist() + self.names = ["Seq_" + str(i) for i in range(len(self.sequences))] + df = pd.DataFrame( + list(zip(self.sequences, self.names)), columns=["Sequences", "Names"] + ) + df = df.drop_duplicates( + "Sequences", "first" + ) # keep first occurrence of duplicate + self.sequences = df["Sequences"].get_values().tolist() + self.names = df["Names"].get_values().tolist() def keep_natural_aa(self): """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character @@ -249,15 +761,35 @@ def keep_natural_aa(self): :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered accordingly (if present). 
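A note on prob_ACPhel, whose reflow ends above: the array is 20 rows (residues, in self.AAs order) by 18 columns (helix positions), so each column reads as a per-position distribution over residues. A shape and normalization check under that reading:

import numpy as np
from plotWheels.core import BaseSequence  # assumed importable for this check

b = BaseSequence(1)
assert b.prob_ACPhel.shape == (20, 18)  # residues x helix positions
# Each position's probabilities over the 20 residues should sum to ~1.
assert np.allclose(b.prob_ACPhel.sum(axis=0), 1.0, atol=1e-2)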
:Example: - + >>> b = BaseSequence(2) >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL'] >>> b.keep_natural_aa() >>> b.sequences ['GLFDIVKKVVGALGSL'] """ - natural_aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', - 'Y'] + natural_aa = [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + ] seqs = [] names = [] @@ -266,7 +798,7 @@ def keep_natural_aa(self): seq = list(s.upper()) if all(c in natural_aa for c in seq): seqs.append(s.upper()) - if hasattr(self, 'names') and self.names: + if hasattr(self, "names") and self.names: names.append(self.names[i]) self.sequences = seqs @@ -279,7 +811,7 @@ def filter_aa(self, amino_acids): :param amino_acids: {list} amino acids to be filtered :return: filtered list of sequences names in the corresponding attributes. :Example: - + >>> b = BaseSequence(3) >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ'] >>> b.filter_aa(['C']) @@ -287,14 +819,14 @@ def filter_aa(self, amino_acids): ['AAALLLIIIKKK', 'LLVVIIFFFQQ'] """ - pattern = re.compile('|'.join(amino_acids)) + pattern = re.compile("|".join(amino_acids)) seqs = [] names = [] for i, s in enumerate(self.sequences): if not pattern.search(s): seqs.append(s) - if hasattr(self, 'names') and self.names: + if hasattr(self, "names") and self.names: names.append(self.names[i]) self.sequences = seqs @@ -338,9 +870,9 @@ def __init__(self, seqs): self.sequences = [seqs.strip()] self.names = [] elif os.path.isfile(seqs): - if seqs.endswith('.fasta'): # read .fasta file + if seqs.endswith(".fasta"): # read .fasta file self.sequences, self.names = read_fasta(seqs) - elif seqs.endswith('.csv'): # read .csv file with sequences every line + elif seqs.endswith(".csv"): # read .csv file with sequences every line with open(seqs) as f: self.sequences = list() cntr = 0 @@ -348,15 +880,18 @@ def __init__(self, seqs): for line in f: if line.isupper(): self.sequences.append(line.strip()) - self.names.append('seq_' + str(cntr)) + self.names.append("seq_" + str(cntr)) cntr += 1 else: print("Sorry, currently only .fasta or .csv files can be read!") else: - print("%s does not exist, is not a valid list of AA sequences or is not a valid sequence string" % seqs) + print( + "%s does not exist, is not a valid list of AA sequences or is not a valid sequence string" + % seqs + ) self.descriptor = np.array([[]]) - self.target = np.array([], dtype='int') + self.target = np.array([], dtype="int") self.scaler = None self.featurenames = [] @@ -382,7 +917,7 @@ def save_fasta(self, filename, names=False): else: save_fasta(filename, self.sequences) - def count_aa(self, scale='relative', average=False, append=False): + def count_aa(self, scale="relative", average=False, append=False): """Method for producing the amino acid distribution for the given sequences as a descriptor :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA @@ -440,7 +975,7 @@ def count_ngrams(self, n): ngrams[k] = v self.descriptor = ngrams - def feature_scaling(self, stype='standard', fit=True): + def feature_scaling(self, stype="standard", fit=True): """Method for feature scaling of the calculated descriptor matrix. 
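A note on filter_aa above: joining residue codes with "|" builds an alternation like C|M, which works for single letters; a character class is the more direct idiom and stays safe if a stray metacharacter ever reaches the list. A sketch of the equivalent pattern:

import re

amino_acids = ["C", "M"]  # example input: drop Cys- or Met-containing sequences
pattern = re.compile("[" + re.escape("".join(amino_acids)) + "]")
assert pattern.search("ACDEF")          # contains C -> filtered out
assert not pattern.search("ADEFGHIKL")  # clean -> kept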
:param stype: {'standard' or 'minmax'} type of scaling to be used @@ -454,10 +989,10 @@ def feature_scaling(self, stype='standard', fit=True): >>> D.feature_scaling(type='minmax',fit=True) array([[0.56818182],[1.],[0.5853447],[0.],[0.47714988]]) """ - if stype in ['standard', 'minmax']: - if stype == 'standard': + if stype in ["standard", "minmax"]: + if stype == "standard": self.scaler = StandardScaler() - elif stype == 'minmax': + elif stype == "minmax": self.scaler = MinMaxScaler() if fit: @@ -521,14 +1056,14 @@ def random_selection(self, num): sel = np.random.choice(len(self.sequences), size=num, replace=False) self.sequences = np.array(self.sequences)[sel].tolist() - if hasattr(self, 'descriptor') and self.descriptor.size: + if hasattr(self, "descriptor") and self.descriptor.size: self.descriptor = self.descriptor[sel] - if hasattr(self, 'names') and self.names: + if hasattr(self, "names") and self.names: self.names = np.array(self.names)[sel].tolist() - if hasattr(self, 'target') and self.target.size: + if hasattr(self, "target") and self.target.size: self.target = self.target[sel] - def minmax_selection(self, iterations, distmetric='euclidean', seed=0): + def minmax_selection(self, iterations, distmetric="euclidean", seed=0): """Method to select a specified number of sequences according to the minmax algorithm. :param iterations: {int} Number of sequences to retrieve. @@ -547,8 +1082,10 @@ def minmax_selection(self, iterations, distmetric='euclidean', seed=0): # Randomly selecting first peptide into the sele np.random.seed(seed) idx = int(np.random.random_integers(0, len(pool), 1)) - sele = pool[idx:idx + 1, :] - minmaxidx.append(int(*np.where(np.all(self.descriptor == pool[idx:idx + 1, :], axis=1)))) + sele = pool[idx : idx + 1, :] + minmaxidx.append( + int(*np.where(np.all(self.descriptor == pool[idx : idx + 1, :], axis=1))) + ) # Deleting peptide in selection from pool pool = np.delete(pool, idx, axis=0) @@ -566,16 +1103,22 @@ def minmax_selection(self, iterations, distmetric='euclidean', seed=0): maxidx = int(maxidx[minmax]) # Adding it to selection and removing from pool - sele = np.append(sele, pool[maxidx:maxidx + 1, :], axis=0) + sele = np.append(sele, pool[maxidx : maxidx + 1, :], axis=0) pool = np.delete(pool, maxidx, axis=0) - minmaxidx.append(int(*np.where(np.all(self.descriptor == pool[maxidx:maxidx + 1, :], axis=1)))) + minmaxidx.append( + int( + *np.where( + np.all(self.descriptor == pool[maxidx : maxidx + 1, :], axis=1) + ) + ) + ) self.sequences = np.array(self.sequences)[minmaxidx].tolist() - if hasattr(self, 'descriptor') and self.descriptor.size: + if hasattr(self, "descriptor") and self.descriptor.size: self.descriptor = self.descriptor[minmaxidx] - if hasattr(self, 'names') and self.names: + if hasattr(self, "names") and self.names: self.names = np.array(self.names)[minmaxidx].tolist() - if hasattr(self, 'target') and self.target.size: + if hasattr(self, "target") and self.target.size: self.target = self.descriptor[minmaxidx] def filter_sequences(self, sequences): @@ -599,20 +1142,22 @@ def filter_sequences(self, sequences): ['ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL'] """ indices = list() - if isinstance(sequences, str): # check if sequences is only one sequence string and convert it to a list + if isinstance( + sequences, str + ): # check if sequences is only one sequence string and convert it to a list sequences = [sequences] for s in sequences: # get indices of queried sequences indices.append(self.sequences.index(s)) self.sequences = 
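A note on minmax_selection above: np.random.random_integers has been deprecated since NumPy 1.11 and is gone from current NumPy, and its upper bound is inclusive, so idx can equal len(pool) and slice an empty row. np.random.randint fixes both, as in this sketch:

import numpy as np

pool = np.zeros((10, 3))  # stand-in for the descriptor pool
# Deprecated, and off by one at the top end:
#   idx = int(np.random.random_integers(0, len(pool), 1))
idx = int(np.random.randint(0, len(pool)))  # exclusive upper bound: always a valid row
sele = pool[idx : idx + 1, :]
assert sele.shape == (1, 3)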
np.delete(np.array(self.sequences), indices, 0).tolist() - if hasattr(self, 'descriptor') and self.descriptor.size: + if hasattr(self, "descriptor") and self.descriptor.size: self.descriptor = np.delete(self.descriptor, indices, 0) - if hasattr(self, 'names') and self.names: + if hasattr(self, "names") and self.names: self.names = np.delete(np.array(self.names), indices, 0).tolist() - if hasattr(self, 'target') and self.target.size: + if hasattr(self, "target") and self.target.size: self.target = np.delete(self.target, indices, 0) - def filter_values(self, values, operator='=='): + def filter_values(self, values, operator="=="): """Method to filter the descriptor matrix in the attribute :py:attr:`descriptor` for a given list of values (same size as the number of features in the descriptor matrix!) The operator option tells the method whether to filter for values equal, lower, higher ect. to the given values in the *values* array. @@ -632,26 +1177,28 @@ def filter_values(self, values, operator='=='): """ dim = self.descriptor.shape[1] for d in range(dim): # for all the features in self.descriptor - if operator == '==': + if operator == "==": indices = np.where(self.descriptor[:, d] == values[d])[0] - elif operator == '<': + elif operator == "<": indices = np.where(self.descriptor[:, d] < values[d])[0] - elif operator == '>': + elif operator == ">": indices = np.where(self.descriptor[:, d] > values[d])[0] - elif operator == '<=': + elif operator == "<=": indices = np.where(self.descriptor[:, d] <= values[d])[0] - elif operator == '>=': + elif operator == ">=": indices = np.where(self.descriptor[:, d] >= values[d])[0] else: - raise KeyError('available operators: ``==``, ``<``, ``>``, ``<=``and ``>=``') + raise KeyError( + "available operators: ``==``, ``<``, ``>``, ``<=``and ``>=``" + ) # filter descriptor matrix, sequence list and names list according to obtained indices self.sequences = np.array(self.sequences)[indices].tolist() - if hasattr(self, 'descriptor') and self.descriptor.size: + if hasattr(self, "descriptor") and self.descriptor.size: self.descriptor = self.descriptor[indices] - if hasattr(self, 'names') and self.names: + if hasattr(self, "names") and self.names: self.names = np.array(self.names)[indices].tolist() - if hasattr(self, 'target') and self.target.size: + if hasattr(self, "target") and self.target.size: self.target = self.target[indices] def filter_aa(self, amino_acids): @@ -669,7 +1216,7 @@ def filter_aa(self, amino_acids): ['AAALLLIIIKKK', 'LLVVIIFFFQQ'] """ - pattern = re.compile('|'.join(amino_acids)) + pattern = re.compile("|".join(amino_acids)) seqs = [] desc = [] names = [] @@ -678,17 +1225,17 @@ def filter_aa(self, amino_acids): for i, s in enumerate(self.sequences): if not pattern.search(s): seqs.append(s) - if hasattr(self, 'descriptor') and self.descriptor.size: + if hasattr(self, "descriptor") and self.descriptor.size: desc.append(self.descriptor[i]) - if hasattr(self, 'names') and self.names: + if hasattr(self, "names") and self.names: names.append(self.names[i]) - if hasattr(self, 'target') and self.target.size: + if hasattr(self, "target") and self.target.size: target.append(self.target[i]) self.sequences = seqs self.names = names self.descriptor = np.array(desc) - self.target = np.array(target, dtype='int') + self.target = np.array(target, dtype="int") def filter_duplicates(self): """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences` @@ -704,18 +1251,22 @@ def filter_duplicates(self): .. 
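A note on filter_values above: the five-way if/elif over comparison strings (whose KeyError message is also missing a space before "and") collapses to a lookup table of functions from the standard operator module. A sketch of that form:

import operator
import numpy as np

OPS = {"==": operator.eq, "<": operator.lt, ">": operator.gt,
       "<=": operator.le, ">=": operator.ge}

def matching_rows(column, value, op="=="):
    """Indices of rows in one descriptor column satisfying the comparison."""
    if op not in OPS:
        raise KeyError("available operators: ==, <, >, <= and >=")
    return np.where(OPS[op](column, value))[0]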
versionadded:: v2.2.5 """ if not self.names: - self.names = ['Seq_' + str(i) for i in range(len(self.sequences))] + self.names = ["Seq_" + str(i) for i in range(len(self.sequences))] if not self.target: self.target = [0] * len(self.sequences) if not self.descriptor: self.descriptor = np.zeros(len(self.sequences)) - df = pd.DataFrame(np.array([self.sequences, self.names, self.descriptor, self.target]).T, - columns=['Sequences', 'Names', 'Descriptor', 'Target']) - df = df.drop_duplicates('Sequences', 'first') # keep first occurrence of duplicate - self.sequences = df['Sequences'].get_values().tolist() - self.names = df['Names'].get_values().tolist() - self.descriptor = df['Descriptor'].get_values() - self.target = df['Target'].get_values() + df = pd.DataFrame( + np.array([self.sequences, self.names, self.descriptor, self.target]).T, + columns=["Sequences", "Names", "Descriptor", "Target"], + ) + df = df.drop_duplicates( + "Sequences", "first" + ) # keep first occurrence of duplicate + self.sequences = df["Sequences"].get_values().tolist() + self.names = df["Names"].get_values().tolist() + self.descriptor = df["Descriptor"].get_values() + self.target = df["Target"].get_values() def keep_natural_aa(self): """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character @@ -732,8 +1283,28 @@ def keep_natural_aa(self): ['GLFDIVKKVVGALGSL'] """ - natural_aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', - 'Y'] + natural_aa = [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + ] seqs = [] desc = [] @@ -744,19 +1315,21 @@ def keep_natural_aa(self): seq = list(s.upper()) if all(c in natural_aa for c in seq): seqs.append(s.upper()) - if hasattr(self, 'descriptor') and self.descriptor.size: + if hasattr(self, "descriptor") and self.descriptor.size: desc.append(self.descriptor[i]) - if hasattr(self, 'names') and self.names: + if hasattr(self, "names") and self.names: names.append(self.names[i]) - if hasattr(self, 'target') and self.target.size: + if hasattr(self, "target") and self.target.size: target.append(self.target[i]) self.sequences = seqs self.names = names self.descriptor = np.array(desc) - self.target = np.array(target, dtype='int') + self.target = np.array(target, dtype="int") - def load_descriptordata(self, filename, delimiter=",", targets=False, skip_header=0): + def load_descriptordata( + self, filename, delimiter=",", targets=False, skip_header=0 + ): """Method to load any data file with sequences and descriptor values and save it to a new insatnce of the class :class:`modlamp.descriptors.PeptideDescriptor`. @@ -773,11 +1346,11 @@ class :class:`modlamp.descriptors.PeptideDescriptor`. 
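A note on both filter_duplicates methods above: Series.get_values() was deprecated in pandas 0.25 and removed in pandas 1.0, so these reflowed lines still fail on a current pandas install. The drop-in replacements for the calls shown above:

# pandas >= 1.0 equivalents of the .get_values() calls above:
self.sequences = df["Sequences"].tolist()      # was .get_values().tolist()
self.names = df["Names"].tolist()              # was .get_values().tolist()
self.descriptor = df["Descriptor"].to_numpy()  # was .get_values()
self.target = df["Target"].to_numpy()          # was .get_values()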
seqs = np.genfromtxt(filename, delimiter=delimiter, dtype="str") seqs = seqs[:, 0] if targets: - self.target = np.array(data[:, -1], dtype='int') + self.target = np.array(data[:, -1], dtype="int") self.sequences = seqs self.descriptor = data - def save_descriptor(self, filename, delimiter=',', targets=None, header=None): + def save_descriptor(self, filename, delimiter=",", targets=None, header=None): """Method to save the descriptor values to a .csv/.txt file :param filename: filename of the output file @@ -786,8 +1359,8 @@ def save_descriptor(self, filename, delimiter=',', targets=None, header=None): :param header: {str} header to be written at the beginning of the file (if ``None``: feature names are taken) :return: output file with peptide names and descriptor values """ - seqs = np.array(self.sequences, dtype='|S80')[:, np.newaxis] - ids = np.array(self.names, dtype='|S80')[:, np.newaxis] + seqs = np.array(self.sequences, dtype="|S80")[:, np.newaxis] + ids = np.array(self.names, dtype="|S80")[:, np.newaxis] if ids.shape == seqs.shape: names = np.hstack((ids, seqs)) else: @@ -798,9 +1371,9 @@ def save_descriptor(self, filename, delimiter=',', targets=None, header=None): else: data = np.hstack((names, self.descriptor)) if not header: - featurenames = [['Sequence']] + self.featurenames - header = ', '.join([f[0] for f in featurenames]) - np.savetxt(filename, data, delimiter=delimiter, fmt='%s', header=header) + featurenames = [["Sequence"]] + self.featurenames + header = ", ".join([f[0] for f in featurenames]) + np.savetxt(filename, data, delimiter=delimiter, fmt="%s", header=header) def load_scale(scalename): @@ -812,248 +1385,1495 @@ def load_scale(scalename): """ # predefined amino acid scales dictionary scales = { - 'aasi': {'A': [1.89], 'C': [1.73], 'D': [3.13], 'E': [3.14], 'F': [1.53], 'G': [2.67], 'H': [3], 'I': [1.97], - 'K': [2.28], 'L': [1.74], 'M': [2.5], 'N': [2.33], 'P': [0.22], 'Q': [3.05], 'R': [1.91], 'S': [2.14], - 'T': [2.18], 'V': [2.37], 'W': [2], 'Y': [2.01]}, - 'abhprk': {'A': [0, 0, 0, 0, 0, 0], 'C': [0, 0, 0, 0, 0, 0], 'D': [1, 0, 0, 1, 0, 0], 'E': [1, 0, 0, 1, 0, 0], - 'F': [0, 0, 1, 0, 1, 0], 'G': [0, 0, 0, 0, 0, 0], 'H': [0, 0, 0, 1, 1, 0], 'I': [0, 0, 1, 0, 0, 0], - 'K': [0, 1, 0, 1, 0, 0], 'L': [0, 0, 1, 0, 0, 0], 'M': [0, 0, 1, 0, 0, 0], 'N': [0, 0, 0, 1, 0, 0], - 'P': [0, 0, 0, 0, 0, 1], 'Q': [0, 0, 0, 1, 0, 0], 'R': [0, 1, 0, 1, 0, 0], 'S': [0, 0, 0, 1, 0, 0], - 'T': [0, 0, 0, 1, 0, 0], 'V': [0, 0, 1, 0, 0, 0], 'W': [0, 0, 1, 0, 1, 0], 'Y': [0, 0, 0, 1, 1, 0]}, - 'argos': {'I': [0.77], 'F': [1.2], 'V': [0.14], 'L': [2.3], 'W': [0.07], 'M': [2.3], 'A': [0.64], 'G': [-0.48], - 'C': [0.25], 'Y': [-0.41], 'P': [-0.31], 'T': [-0.13], 'S': [-0.25], 'H': [-0.87], 'E': [-0.94], - 'N': [-0.89], 'Q': [-0.61], 'D': [-1], 'K': [-1], 'R': [-0.68]}, - 'bulkiness': {'A': [0.443], 'C': [0.551], 'D': [0.453], 'E': [0.557], 'F': [0.898], 'G': [0], 'H': [0.563], - 'I': [0.985], 'K': [0.674], 'L': [0.985], 'M': [0.703], 'N': [0.516], 'P': [0.768], 'Q': [0.605], - 'R': [0.596], 'S': [0.332], 'T': [0.677], 'V': [0.995], 'W': [1], 'Y': [0.801]}, - 'charge_phys': {'A': [0.], 'C': [-.1], 'D': [-1.], 'E': [-1.], 'F': [0.], 'G': [0.], 'H': [0.1], - 'I': [0.], 'K': [1.], 'L': [0.], 'M': [0.], 'N': [0.], 'P': [0.], 'Q': [0.], - 'R': [1.], 'S': [0.], 'T': [0.], 'V': [0.], 'W': [0.], 'Y': [0.]}, - 'charge_acid': {'A': [0.], 'C': [-.1], 'D': [-1.], 'E': [-1.], 'F': [0.], 'G': [0.], 'H': [1.], - 'I': [0.], 'K': [1.], 'L': [0.], 'M': [0.], 'N': [0.], 'P': [0.], 'Q': [0.], - 'R': [1.], 'S': 
[0.], 'T': [0.], 'V': [0.], 'W': [0.], 'Y': [0.]}, - 'cougar': {'A': [0.25, 0.62, 1.89], 'C': [0.208, 0.29, 1.73], 'D': [0.875, -0.9, 3.13], - 'E': [0.833, -0.74, 3.14], 'F': [0.042, 1.2, 1.53], 'G': [1, 0.48, 2.67], 'H': [0.083, -0.4, 3], - 'I': [0.667, 1.4, 1.97], 'K': [0.708, -1.5, 2.28], 'L': [0.292, 1.1, 1.74], 'M': [0, 0.64, 2.5], - 'N': [0.667, -0.78, 2.33], 'P': [0.875, 0.12, 0.22], 'Q': [0.792, -0.85, 3.05], - 'R': [0.958, -2.5, 1.91], 'S': [0.875, -0.18, 2.14], 'T': [0.583, -0.05, 2.18], - 'V': [0.375, 1.1, 2.37], 'W': [0.042, 0.81, 2], 'Y': [0.5, 0.26, 2.01]}, - 'eisenberg': {'I': [1.4], 'F': [1.2], 'V': [1.1], 'L': [1.1], 'W': [0.81], 'M': [0.64], 'A': [0.62], - 'G': [0.48], 'C': [0.29], 'Y': [0.26], 'P': [0.12], 'T': [-0.05], 'S': [-0.18], 'H': [-0.4], - 'E': [-0.74], 'N': [-0.78], 'Q': [-0.85], 'D': [-0.9], 'K': [-1.5], 'R': [-2.5]}, - 'ez': {'A': [-0.29, 10.22, 4.67], 'C': [0.95, 13.69, 5.77], 'D': [1.19, 14.25, 8.98], 'E': [1.3, 14.66, 4.16], - 'F': [-0.8, 19.67, 7.12], 'G': [-0.01, 13.86, 6], 'H': [0.75, 12.26, 2.77], 'I': [-0.56, 14.34, 10.69], - 'K': [1.66, 11.11, 2.09], 'L': [-0.64, 17.34, 8.61], 'M': [-0.28, 18.04, 7.13], 'N': [0.89, 12.78, 6.28], - 'P': [0.83, 18.09, 3.53], 'Q': [1.21, 10.46, 2.59], 'R': [1.55, 9.34, 4.68], 'S': [0.1, 13.86, 6], - 'T': [0.01, 13.86, 6], 'V': [-0.47, 11.35, 4.97], 'W': [-0.85, 11.65, 7.2], 'Y': [-0.42, 13.04, 6.2]}, - 'flexibility': {'A': [0.25], 'C': [0.208], 'D': [0.875], 'E': [0.833], 'F': [0.042], 'G': [1], 'H': [0.083], - 'I': [0.667], 'K': [0.708], 'L': [0.292], 'M': [0.], 'N': [0.667], 'P': [0.875], 'Q': [0.792], - 'R': [0.958], 'S': [0.875], 'T': [0.583], 'V': [0.375], 'W': [0.042], 'Y': [0.5]}, - 'grantham': {'A': [0, 8.1, 31], 'C': [2.75, 5.5, 55], 'D': [1.38, 13.0, 54], 'E': [0.92, 12.3, 83], - 'F': [0, 5.2, 132], 'G': [0.74, 9.0, 3], 'H': [0.58, 10.4, 96], 'I': [0, 5.2, 111], - 'K': [0.33, 11.3, 119], 'L': [0, 4.9, 111], 'M': [0, 5.7, 105], 'N': [1.33, 11.6, 56], - 'P': [0.39, 8.0, 32.5], 'Q': [0.89, 10.5, 85], 'R': [0.65, 10.5, 124], 'S': [1.42, 9.2, 32], - 'T': [0.71, 8.6, 61], 'V': [0, 5.9, 84], 'W': [0.13, 5.4, 170], 'Y': [0.20, 6.2, 136]}, - 'gravy': {'I': [4.5], 'V': [4.2], 'L': [3.8], 'F': [2.8], 'C': [2.5], 'M': [1.9], 'A': [1.8], 'G': [-0.4], - 'T': [-0.7], 'W': [-0.9], 'S': [-0.8], 'Y': [-1.3], 'P': [-1.6], 'H': [-3.2], 'E': [-3.5], - 'Q': [-3.5], 'D': [-3.5], 'N': [-3.5], 'K': [-3.9], 'R': [-4.5]}, - 'hopp-woods': {'A': [-0.5], 'C': [-1], 'D': [3], 'E': [3], 'F': [-2.5], 'G': [0], 'H': [-0.5], 'I': [-1.8], - 'K': [3], 'L': [-1.8], 'M': [-1.3], 'N': [0.2], 'P': [0], 'Q': [0.2], 'R': [3], 'S': [0.3], - 'T': [-0.4], 'V': [-1.5], 'W': [-3.4], 'Y': [-2.3]}, - 'isaeci': {'A': [62.9, 0.05], 'C': [78.51, 0.15], 'D': [18.46, 1.25], 'E': [30.19, 1.31], 'F': [189.42, 0.14], - 'G': [19.93, 0.02], 'H': [87.38, 0.56], 'I': [149.77, 0.09], 'K': [102.78, 0.53], 'L': [154.35, 0.1], - 'M': [132.22, 0.34], 'N': [19.53, 1.36], 'P': [122.35, 0.16], 'Q': [17.87, 1.31], 'R': [52.98, 1.69], - 'S': [19.75, 0.56], 'T': [59.44, 0.65], 'V': [120.91, 0.07], 'W': [179.16, 1.08], - 'Y': [132.16, 0.72]}, - 'janin': {'I': [1.2], 'F': [0.87], 'V': [1], 'L': [0.87], 'W': [0.59], 'M': [0.73], 'A': [0.59], 'G': [0.59], - 'C': [1.4], 'Y': [-0.4], 'P': [-0.26], 'T': [-0.12], 'S': [0.02], 'H': [0.02], 'E': [-0.83], - 'N': [-0.55], 'Q': [-0.83], 'D': [-0.69], 'K': [-2.4], 'R': [-1.8]}, - 'kytedoolittle': {'I': [1.7], 'F': [1.1], 'V': [1.6], 'L': [1.4], 'W': [-0.14], 'M': [0.8], 'A': [0.77], - 'G': [0.03], 'C': [1], 'Y': [-0.27], 'P': [-0.37], 'T': 
[-0.07], 'S': [-0.1], 'H': [-0.91], - 'E': [-1], 'N': [-1], 'Q': [-1], 'D': [-1], 'K': [-1.1], 'R': [-1.3]}, - 'levitt_alpha': {'A': [1.29], 'C': [1.11], 'D': [1.04], 'E': [1.44], 'F': [1.07], 'G': [0.56], 'H': [1.22], - 'I': [0.97], 'K': [1.23], 'L': [1.3], 'M': [1.47], 'N': [0.9], 'P': [0.52], 'Q': [1.27], - 'R': [0.96], 'S': [0.82], 'T': [0.82], 'V': [0.91], 'W': [0.99], 'Y': [0.72]}, - 'mss': {'A': [13.02], 'C': [23.7067], 'D': [22.02], 'E': [20.0233], 'F': [23.5288], 'G': [1.01], 'H': [23.5283], - 'I': [22.3611], 'K': [18.9756], 'L': [19.6944], 'M': [21.92], 'N': [21.8567], 'P': [19.0242], - 'Q': [19.9689], 'R': [19.0434], 'S': [18.3533], 'T': [22.3567], 'V': [21.0267], 'W': [26.1975], - 'Y': [24.1954]}, - 'msw': {'A': [-0.73, 0.2, -0.62], 'C': [-0.66, 0.26, -0.27], 'D': [0.11, -1, -0.96], 'E': [0.24, -0.39, -0.04], - 'F': [0.76, 0.85, -0.34], 'G': [-0.31, -0.28, -0.75], 'H': [0.84, 0.67, -0.78], - 'I': [-0.91, 0.83, -0.25], 'K': [-0.51, 0.08, 0.6], 'L': [-0.74, 0.72, -0.16], 'M': [-0.7, 1, -0.32], - 'N': [0.14, 0.2, -0.66], 'P': [-0.43, 0.73, -0.6], 'Q': [0.3, 1, -0.3], 'R': [-0.22, 0.27, 1], - 'S': [-0.8, 0.61, -1], 'T': [-0.58, 0.85, -0.89], 'V': [-1, 0.79, -0.58], 'W': [1, 0.98, -0.47], - 'Y': [0.97, 0.66, -0.16]}, - 'pepcats': {'A': [1, 0, 0, 0, 0, 0], 'C': [1, 0, 1, 1, 0, 0], 'D': [0, 0, 1, 0, 0, 1], 'E': [0, 0, 1, 0, 0, 1], - 'F': [1, 1, 0, 0, 0, 0], 'G': [0, 0, 0, 0, 0, 0], 'H': [1, 1, 0, 1, 1, 0], 'I': [1, 0, 0, 0, 0, 0], - 'K': [1, 0, 0, 1, 1, 0], 'L': [1, 0, 0, 0, 0, 0], 'M': [1, 0, 1, 0, 0, 0], 'N': [0, 0, 1, 1, 0, 0], - 'P': [1, 0, 0, 0, 0, 0], 'Q': [0, 0, 1, 1, 0, 0], 'R': [1, 0, 0, 1, 1, 0], 'S': [0, 0, 1, 1, 0, 0], - 'T': [0, 0, 1, 1, 0, 0], 'V': [1, 0, 0, 0, 0, 0], 'W': [1, 1, 0, 1, 0, 0], 'Y': [1, 1, 1, 1, 0, 0]}, - 'peparc': {'A': [1, 0, 0, 0, 0], 'C': [0, 1, 0, 0, 0], 'D': [0, 1, 0, 1, 0], 'E': [0, 1, 0, 1, 0], - 'F': [1, 0, 0, 0, 0], 'G': [0, 0, 0, 0, 0], 'H': [0, 1, 1, 0, 0], 'I': [1, 0, 0, 0, 0], - 'K': [0, 1, 1, 0, 0], 'L': [1, 0, 0, 0, 0], 'M': [1, 0, 0, 0, 0], 'N': [0, 1, 0, 0, 0], - 'P': [0, 0, 0, 0, 1], 'Q': [0, 1, 0, 0, 0], 'R': [0, 1, 1, 0, 0], 'S': [0, 1, 0, 0, 0], - 'T': [0, 1, 0, 0, 0], 'V': [1, 0, 0, 0, 0], 'W': [1, 0, 0, 0, 0], 'Y': [1, 0, 0, 0, 0]}, - 'polarity': {'A': [0.395], 'C': [0.074], 'D': [1.], 'E': [0.914], 'F': [0.037], 'G': [0.506], 'H': [0.679], - 'I': [0.037], 'K': [0.79], 'L': [0.], 'M': [0.099], 'N': [0.827], 'P': [0.383], 'Q': [0.691], - 'R': [0.691], 'S': [0.531], 'T': [0.457], 'V': [0.123], 'W': [0.062], 'Y': [0.16]}, - 'ppcali': { - 'A': [0.070781, 0.036271, 2.042, 0.083272, 0.69089, 0.15948, -0.80893, 0.24698, 0.86525, 0.68563, -0.24665, - 0.61314, -0.53343, -0.50878, -1.3646, 2.2679, -1.5644, -0.75043, -0.65875], - 'C': [0.61013, -0.93043, -0.85983, -2.2704, 1.5877, -2.0066, -0.30314, 1.2544, -0.2832, -1.2844, -0.73449, - -0.11235, -0.41152, -0.0050164, 0.28307, 0.20522, -0.021084, -0.15627, -0.32689], - 'D': [-1.3215, 0.24063, -0.032754, -0.37863, 1.2051, 1.0001, 2.1827, 0.19212, -0.60529, 0.37639, -0.46451, - -0.46788, 1.4077, -2.1661, 0.72604, -0.12332, -0.8243, -0.082989, 0.053476], - 'E': [-0.87713, 1.4905, 1.0755, 0.35944, 1.567, 0.41365, 1.0944, 0.72634, -0.74957, 0.038939, 0.075057, - 0.78637, -1.4543, 1.6667, -0.097439, -0.24293, 1.7687, 0.36174, -0.11585], - 'F': [1.3557, -0.10336, -0.4309, 0.41269, -0.083356, 0.83783, 0.095381, -0.65222, -0.3119, 0.43293, -1.0011, - -0.66855, -0.10242, 1.2066, 2.6234, 1.9981, -0.25016, 0.71979, 0.21569], - 'G': [-1.0818, -2.1561, 0.77082, -0.92747, -1.0748, 1.7997, -1.3708, 1.279, 
-1.2098, 0.46065, 0.43076, - 0.20037, -0.2302, 0.2646, 0.57149, -0.68432, 0.19341, -0.061606, -0.08071], - 'H': [-0.050161, 0.69246, -0.88397, -0.64601, 0.24622, 0.10487, -1.1317, -2.3661, -0.89918, 0.46391, - -0.62359, 2.5478, -0.34737, -0.52062, 0.17522, -0.88648, -0.4755, 0.023187, -0.28261], - 'I': [1.4829, -0.46435, 0.50189, 0.55724, -0.51535, -0.29914, 0.97236, -0.15793, -0.98246, -0.54347, - 0.97806, 0.37577, 1.618, 0.62323, -0.59359, -0.35483, -0.085017, 0.55825, -2.7542], - 'K': [-0.85344, 1.529, 0.27747, 0.32993, -1.1786, -0.16633, -1.0459, 0.44621, 0.41027, -2.5318, 0.91329, - 0.53385, 0.61417, -1.111, 1.1323, 0.95105, 0.76769, -0.016115, 0.054995], - 'L': [1.2857, 0.039488, 1.5378, 0.87969, -0.21419, 0.40389, -0.20426, -0.14351, 0.61024, -1.1927, -2.2149, - -0.84248, -0.5061, -0.48548, 0.10791, -2.1503, -0.12006, -0.60222, 0.26546], - 'M': [1.137, 0.64388, 0.13724, -0.2988, 1.2288, 0.24981, -1.6427, -0.75868, -0.54902, 1.0571, 1.272, - -1.9104, 0.70919, -0.93575, -0.6314, -0.079654, 1.634, -0.0021923, 0.49825], - 'N': [-1.084, -0.176, -0.47062, -0.92245, -0.32953, 0.74278, 0.34551, -1.4605, 0.25219, -1.2107, -0.59978, - -0.79183, 1.3268, 1.9839, -1.6137, 0.5333, 0.033889, -1.0331, 0.83019], - 'P': [-1.1823, -1.6911, -1.1331, 3.073, 1.1942, -0.93426, -0.72985, -0.042441, -0.19264, -0.21603, -0.1239, - 0.054016, 0.15241, -0.019691, -0.20543, 0.10206, 0.07671, -0.081968, 0.20348], - 'Q': [-0.57747, 0.97452, -0.077547, -0.0033488, 0.17184, -0.52537, -0.27362, -0.1366, 0.2057, -0.013066, - 1.8834, -1.2736, -0.84991, 1.0445, 0.69027, -1.2866, -2.6776, 0.1683, 0.086105], - 'R': [-0.62245, 1.545, -0.61966, 0.19057, -1.7485, -1.3909, -0.47526, 1.3938, -0.84556, 1.7344, -1.6516, - -0.52678, 0.6791, 0.24374, -0.62551, -0.0028271, -0.053884, 0.14926, -0.17232], - 'S': [-0.86409, -0.77147, 0.38542, -0.59389, -0.53313, -0.47585, 0.31966, -0.89716, 1.8029, 0.26431, - -0.23173, -0.37626, -0.47349, -0.42878, -0.47297, -0.079826, 0.57043, 3.2057, -0.18413], - 'T': [-0.33027, -0.57447, 0.18653, -0.28941, -0.62681, -1.0737, 0.80363, -0.59525, 1.8786, 1.3971, 0.63929, - 0.21281, -0.067048, 0.096271, 1.323, -0.36173, 1.2261, -2.2771, -0.65412], - 'V': [1.1675, -0.61554, 0.95405, 0.11662, -0.74473, -1.1482, 1.1309, 0.12079, -0.77171, 0.18597, 0.93442, - 1.201, 0.3826, -0.091573, -0.31269, 0.074367, -0.22946, 0.24322, 2.9836], - 'W': [1.1881, 0.43789, -1.7915, 0.138, 0.43088, 1.6467, -0.11987, 1.7369, 2.0818, 0.33122, 0.31829, 1.1586, - 0.67649, 0.30819, -0.55772, -0.54491, -0.17969, 0.24477, 0.38674], - 'Y': [0.54671, -0.1468, -1.5688, 0.19001, -1.2736, 0.66162, 1.1614, -0.18614, -0.70654, -0.43634, 0.44775, - -0.71366, -2.5907, -1.1649, -1.1576, 0.66572, 0.21019, -0.61016, -0.34844]}, - 'refractivity': {'A': [0.102045615], 'C': [0.841053374], 'D': [0.282153774], 'E': [0.405831178], - 'F': [0.691276746], 'G': [0], 'H': [0.512814484], 'I': [0.448154244], 'K': [0.50058782], - 'L': [0.441570656], 'M': [0.508817305], 'N': [0.282153774], 'P': [0.256995062], - 'Q': [0.405831178], 'R': [0.626851634], 'S': [0.149306372], 'T': [0.258876087], - 'V': [0.327298378], 'W': [1], 'Y': [0.741359041]}, - 't_scale': {'A': [-8.4, -8.01, -3.73, -3.65, -6.12, -1.59, 1.56], - 'C': [-2.44, -1.96, 0.93, -2.35, 1.31, 2.29, -1.52], - 'D': [-6.84, -0.94, 17.68, -0.03, 3.44, 9.07, 4.32], - 'E': [-6.5, 16.2, 17.28, 3.11, -4.75, -2.54, 4.72], - 'F': [21.59, -5.73, 1.03, -3.3, 2.64, -5.02, 1.7], - 'G': [-8.48, -10.37, -5.14, -6.51, -11.84, -3.6, 2.01], - 'H': [15.28, -3.67, 6.72, -6.38, 4.12, -1.55, -2.85], - 'I': [-2.97, 4.64, 
-0.77, 11, 3.26, -4.36, -7.88], - 'K': [2.7, 13.46, -14.03, -2.55, 2.77, 0.15, 3.19], - 'L': [2.61, 5.96, 1.97, 2.59, -4.77, -4.84, -5.44], - 'M': [3.38, 12.43, -4.77, 0.45, -1.55, -0.6, 3.26], - 'N': [-3.11, -1.22, 6.26, -9.38, 9.94, 7.66, -4.81], - 'P': [-5.35, -9.07, -1.52, -8.79, -8.73, 4.29, -9.91], - 'Q': [-5.31, 15.64, 8.44, 1.03, -4.32, -4.4, -0.52], - 'R': [-2.27, 18.9, -18.24, -3.47, 3.03, 6.64, 0.45], - 'S': [-15.88, -11.21, -2.44, -3.61, 3.46, -0.37, 8.98], - 'T': [-17.81, -13.64, -5.19, 10.57, 6.91, -4.43, 3.49], - 'V': [-5.8, -6.15, -2.26, 9.87, 5.28, -1.49, -7.54], - 'W': [21.68, -8.78, -2.53, 15.53, -8.15, 11.98, 3.23], - 'Y': [23.9, -6.47, 0.31, -4.14, 4.08, -7.28, 3.59]}, - 'tm_tend': {'A': [0.38], 'C': [-0.3], 'D': [-3.27], 'E': [-2.9], 'F': [1.98], 'G': [-0.19], 'H': [-1.44], - 'I': [1.97], 'K': [-3.46], 'L': [1.82], 'M': [1.4], 'N': [-1.62], 'P': [-1.44], 'Q': [-1.84], - 'R': [-2.57], 'S': [-0.53], 'T': [-0.32], 'V': [1.46], 'W': [1.53], 'Y': [0.49]}, - 'z3': {'A': [0.07, -1.73, 0.09], 'C': [0.71, -0.97, 4.13], 'D': [3.64, 1.13, 2.36], 'E': [3.08, 0.39, -0.07], - 'F': [-4.92, 1.3, 0.45], 'G': [2.23, -5.36, 0.3], 'H': [2.41, 1.74, 1.11], 'I': [-4.44, -1.68, -1.03], - 'K': [2.84, 1.41, -3.14], 'L': [-4.19, -1.03, -0.98], 'M': [-2.49, -0.27, -0.41], - 'N': [3.22, 1.45, 0.84], 'P': [-1.22, 0.88, 2.23], 'Q': [2.18, 0.53, -1.14], 'R': [2.88, 2.52, -3.44], - 'S': [1.96, -1.63, 0.57], 'T': [0.92, -2.09, -1.4], 'V': [-2.69, -2.53, -1.29], 'W': [-4.75, 3.65, 0.85], - 'Y': [-1.39, 2.32, 0.01]}, - 'z5': {'A': [0.24, -2.32, 0.6, -0.14, 1.3], 'C': [0.84, -1.67, 3.71, 0.18, -2.65], - 'D': [3.98, 0.93, 1.93, -2.46, 0.75], 'E': [3.11, 0.26, -0.11, -3.04, -0.25], - 'F': [-4.22, 1.94, 1.06, 0.54, -0.62], 'G': [2.05, -4.06, 0.36, -0.82, -0.38], - 'H': [2.47, 1.95, 0.26, 3.9, 0.09], 'I': [-3.89, -1.73, -1.71, -0.84, 0.26], - 'K': [2.29, 0.89, -2.49, 1.49, 0.31], 'L': [-4.28, -1.3, -1.49, -0.72, 0.84], - 'M': [-2.85, -0.22, 0.47, 1.94, -0.98], 'N': [3.05, 1.62, 1.04, -1.15, 1.61], - 'P': [-1.66, 0.27, 1.84, 0.7, 2], 'Q': [1.75, 0.5, -1.44, -1.34, 0.66], - 'R': [3.52, 2.5, -3.5, 1.99, -0.17], 'S': [2.39, -1.07, 1.15, -1.39, 0.67], - 'T': [0.75, -2.18, -1.12, -1.46, -0.4], 'V': [-2.59, -2.64, -1.54, -0.85, -0.02], - 'W': [-4.36, 3.94, 0.59, 3.44, -1.59], 'Y': [-2.54, 2.44, 0.43, 0.04, -1.47]} + "aasi": { + "A": [1.89], + "C": [1.73], + "D": [3.13], + "E": [3.14], + "F": [1.53], + "G": [2.67], + "H": [3], + "I": [1.97], + "K": [2.28], + "L": [1.74], + "M": [2.5], + "N": [2.33], + "P": [0.22], + "Q": [3.05], + "R": [1.91], + "S": [2.14], + "T": [2.18], + "V": [2.37], + "W": [2], + "Y": [2.01], + }, + "abhprk": { + "A": [0, 0, 0, 0, 0, 0], + "C": [0, 0, 0, 0, 0, 0], + "D": [1, 0, 0, 1, 0, 0], + "E": [1, 0, 0, 1, 0, 0], + "F": [0, 0, 1, 0, 1, 0], + "G": [0, 0, 0, 0, 0, 0], + "H": [0, 0, 0, 1, 1, 0], + "I": [0, 0, 1, 0, 0, 0], + "K": [0, 1, 0, 1, 0, 0], + "L": [0, 0, 1, 0, 0, 0], + "M": [0, 0, 1, 0, 0, 0], + "N": [0, 0, 0, 1, 0, 0], + "P": [0, 0, 0, 0, 0, 1], + "Q": [0, 0, 0, 1, 0, 0], + "R": [0, 1, 0, 1, 0, 0], + "S": [0, 0, 0, 1, 0, 0], + "T": [0, 0, 0, 1, 0, 0], + "V": [0, 0, 1, 0, 0, 0], + "W": [0, 0, 1, 0, 1, 0], + "Y": [0, 0, 0, 1, 1, 0], + }, + "argos": { + "I": [0.77], + "F": [1.2], + "V": [0.14], + "L": [2.3], + "W": [0.07], + "M": [2.3], + "A": [0.64], + "G": [-0.48], + "C": [0.25], + "Y": [-0.41], + "P": [-0.31], + "T": [-0.13], + "S": [-0.25], + "H": [-0.87], + "E": [-0.94], + "N": [-0.89], + "Q": [-0.61], + "D": [-1], + "K": [-1], + "R": [-0.68], + }, + "bulkiness": { + "A": [0.443], 
+ "C": [0.551], + "D": [0.453], + "E": [0.557], + "F": [0.898], + "G": [0], + "H": [0.563], + "I": [0.985], + "K": [0.674], + "L": [0.985], + "M": [0.703], + "N": [0.516], + "P": [0.768], + "Q": [0.605], + "R": [0.596], + "S": [0.332], + "T": [0.677], + "V": [0.995], + "W": [1], + "Y": [0.801], + }, + "charge_phys": { + "A": [0.0], + "C": [-0.1], + "D": [-1.0], + "E": [-1.0], + "F": [0.0], + "G": [0.0], + "H": [0.1], + "I": [0.0], + "K": [1.0], + "L": [0.0], + "M": [0.0], + "N": [0.0], + "P": [0.0], + "Q": [0.0], + "R": [1.0], + "S": [0.0], + "T": [0.0], + "V": [0.0], + "W": [0.0], + "Y": [0.0], + }, + "charge_acid": { + "A": [0.0], + "C": [-0.1], + "D": [-1.0], + "E": [-1.0], + "F": [0.0], + "G": [0.0], + "H": [1.0], + "I": [0.0], + "K": [1.0], + "L": [0.0], + "M": [0.0], + "N": [0.0], + "P": [0.0], + "Q": [0.0], + "R": [1.0], + "S": [0.0], + "T": [0.0], + "V": [0.0], + "W": [0.0], + "Y": [0.0], + }, + "cougar": { + "A": [0.25, 0.62, 1.89], + "C": [0.208, 0.29, 1.73], + "D": [0.875, -0.9, 3.13], + "E": [0.833, -0.74, 3.14], + "F": [0.042, 1.2, 1.53], + "G": [1, 0.48, 2.67], + "H": [0.083, -0.4, 3], + "I": [0.667, 1.4, 1.97], + "K": [0.708, -1.5, 2.28], + "L": [0.292, 1.1, 1.74], + "M": [0, 0.64, 2.5], + "N": [0.667, -0.78, 2.33], + "P": [0.875, 0.12, 0.22], + "Q": [0.792, -0.85, 3.05], + "R": [0.958, -2.5, 1.91], + "S": [0.875, -0.18, 2.14], + "T": [0.583, -0.05, 2.18], + "V": [0.375, 1.1, 2.37], + "W": [0.042, 0.81, 2], + "Y": [0.5, 0.26, 2.01], + }, + "eisenberg": { + "I": [1.4], + "F": [1.2], + "V": [1.1], + "L": [1.1], + "W": [0.81], + "M": [0.64], + "A": [0.62], + "G": [0.48], + "C": [0.29], + "Y": [0.26], + "P": [0.12], + "T": [-0.05], + "S": [-0.18], + "H": [-0.4], + "E": [-0.74], + "N": [-0.78], + "Q": [-0.85], + "D": [-0.9], + "K": [-1.5], + "R": [-2.5], + }, + "ez": { + "A": [-0.29, 10.22, 4.67], + "C": [0.95, 13.69, 5.77], + "D": [1.19, 14.25, 8.98], + "E": [1.3, 14.66, 4.16], + "F": [-0.8, 19.67, 7.12], + "G": [-0.01, 13.86, 6], + "H": [0.75, 12.26, 2.77], + "I": [-0.56, 14.34, 10.69], + "K": [1.66, 11.11, 2.09], + "L": [-0.64, 17.34, 8.61], + "M": [-0.28, 18.04, 7.13], + "N": [0.89, 12.78, 6.28], + "P": [0.83, 18.09, 3.53], + "Q": [1.21, 10.46, 2.59], + "R": [1.55, 9.34, 4.68], + "S": [0.1, 13.86, 6], + "T": [0.01, 13.86, 6], + "V": [-0.47, 11.35, 4.97], + "W": [-0.85, 11.65, 7.2], + "Y": [-0.42, 13.04, 6.2], + }, + "flexibility": { + "A": [0.25], + "C": [0.208], + "D": [0.875], + "E": [0.833], + "F": [0.042], + "G": [1], + "H": [0.083], + "I": [0.667], + "K": [0.708], + "L": [0.292], + "M": [0.0], + "N": [0.667], + "P": [0.875], + "Q": [0.792], + "R": [0.958], + "S": [0.875], + "T": [0.583], + "V": [0.375], + "W": [0.042], + "Y": [0.5], + }, + "grantham": { + "A": [0, 8.1, 31], + "C": [2.75, 5.5, 55], + "D": [1.38, 13.0, 54], + "E": [0.92, 12.3, 83], + "F": [0, 5.2, 132], + "G": [0.74, 9.0, 3], + "H": [0.58, 10.4, 96], + "I": [0, 5.2, 111], + "K": [0.33, 11.3, 119], + "L": [0, 4.9, 111], + "M": [0, 5.7, 105], + "N": [1.33, 11.6, 56], + "P": [0.39, 8.0, 32.5], + "Q": [0.89, 10.5, 85], + "R": [0.65, 10.5, 124], + "S": [1.42, 9.2, 32], + "T": [0.71, 8.6, 61], + "V": [0, 5.9, 84], + "W": [0.13, 5.4, 170], + "Y": [0.20, 6.2, 136], + }, + "gravy": { + "I": [4.5], + "V": [4.2], + "L": [3.8], + "F": [2.8], + "C": [2.5], + "M": [1.9], + "A": [1.8], + "G": [-0.4], + "T": [-0.7], + "W": [-0.9], + "S": [-0.8], + "Y": [-1.3], + "P": [-1.6], + "H": [-3.2], + "E": [-3.5], + "Q": [-3.5], + "D": [-3.5], + "N": [-3.5], + "K": [-3.9], + "R": [-4.5], + }, + "hopp-woods": { + "A": [-0.5], + "C": 
[-1], + "D": [3], + "E": [3], + "F": [-2.5], + "G": [0], + "H": [-0.5], + "I": [-1.8], + "K": [3], + "L": [-1.8], + "M": [-1.3], + "N": [0.2], + "P": [0], + "Q": [0.2], + "R": [3], + "S": [0.3], + "T": [-0.4], + "V": [-1.5], + "W": [-3.4], + "Y": [-2.3], + }, + "isaeci": { + "A": [62.9, 0.05], + "C": [78.51, 0.15], + "D": [18.46, 1.25], + "E": [30.19, 1.31], + "F": [189.42, 0.14], + "G": [19.93, 0.02], + "H": [87.38, 0.56], + "I": [149.77, 0.09], + "K": [102.78, 0.53], + "L": [154.35, 0.1], + "M": [132.22, 0.34], + "N": [19.53, 1.36], + "P": [122.35, 0.16], + "Q": [17.87, 1.31], + "R": [52.98, 1.69], + "S": [19.75, 0.56], + "T": [59.44, 0.65], + "V": [120.91, 0.07], + "W": [179.16, 1.08], + "Y": [132.16, 0.72], + }, + "janin": { + "I": [1.2], + "F": [0.87], + "V": [1], + "L": [0.87], + "W": [0.59], + "M": [0.73], + "A": [0.59], + "G": [0.59], + "C": [1.4], + "Y": [-0.4], + "P": [-0.26], + "T": [-0.12], + "S": [0.02], + "H": [0.02], + "E": [-0.83], + "N": [-0.55], + "Q": [-0.83], + "D": [-0.69], + "K": [-2.4], + "R": [-1.8], + }, + "kytedoolittle": { + "I": [1.7], + "F": [1.1], + "V": [1.6], + "L": [1.4], + "W": [-0.14], + "M": [0.8], + "A": [0.77], + "G": [0.03], + "C": [1], + "Y": [-0.27], + "P": [-0.37], + "T": [-0.07], + "S": [-0.1], + "H": [-0.91], + "E": [-1], + "N": [-1], + "Q": [-1], + "D": [-1], + "K": [-1.1], + "R": [-1.3], + }, + "levitt_alpha": { + "A": [1.29], + "C": [1.11], + "D": [1.04], + "E": [1.44], + "F": [1.07], + "G": [0.56], + "H": [1.22], + "I": [0.97], + "K": [1.23], + "L": [1.3], + "M": [1.47], + "N": [0.9], + "P": [0.52], + "Q": [1.27], + "R": [0.96], + "S": [0.82], + "T": [0.82], + "V": [0.91], + "W": [0.99], + "Y": [0.72], + }, + "mss": { + "A": [13.02], + "C": [23.7067], + "D": [22.02], + "E": [20.0233], + "F": [23.5288], + "G": [1.01], + "H": [23.5283], + "I": [22.3611], + "K": [18.9756], + "L": [19.6944], + "M": [21.92], + "N": [21.8567], + "P": [19.0242], + "Q": [19.9689], + "R": [19.0434], + "S": [18.3533], + "T": [22.3567], + "V": [21.0267], + "W": [26.1975], + "Y": [24.1954], + }, + "msw": { + "A": [-0.73, 0.2, -0.62], + "C": [-0.66, 0.26, -0.27], + "D": [0.11, -1, -0.96], + "E": [0.24, -0.39, -0.04], + "F": [0.76, 0.85, -0.34], + "G": [-0.31, -0.28, -0.75], + "H": [0.84, 0.67, -0.78], + "I": [-0.91, 0.83, -0.25], + "K": [-0.51, 0.08, 0.6], + "L": [-0.74, 0.72, -0.16], + "M": [-0.7, 1, -0.32], + "N": [0.14, 0.2, -0.66], + "P": [-0.43, 0.73, -0.6], + "Q": [0.3, 1, -0.3], + "R": [-0.22, 0.27, 1], + "S": [-0.8, 0.61, -1], + "T": [-0.58, 0.85, -0.89], + "V": [-1, 0.79, -0.58], + "W": [1, 0.98, -0.47], + "Y": [0.97, 0.66, -0.16], + }, + "pepcats": { + "A": [1, 0, 0, 0, 0, 0], + "C": [1, 0, 1, 1, 0, 0], + "D": [0, 0, 1, 0, 0, 1], + "E": [0, 0, 1, 0, 0, 1], + "F": [1, 1, 0, 0, 0, 0], + "G": [0, 0, 0, 0, 0, 0], + "H": [1, 1, 0, 1, 1, 0], + "I": [1, 0, 0, 0, 0, 0], + "K": [1, 0, 0, 1, 1, 0], + "L": [1, 0, 0, 0, 0, 0], + "M": [1, 0, 1, 0, 0, 0], + "N": [0, 0, 1, 1, 0, 0], + "P": [1, 0, 0, 0, 0, 0], + "Q": [0, 0, 1, 1, 0, 0], + "R": [1, 0, 0, 1, 1, 0], + "S": [0, 0, 1, 1, 0, 0], + "T": [0, 0, 1, 1, 0, 0], + "V": [1, 0, 0, 0, 0, 0], + "W": [1, 1, 0, 1, 0, 0], + "Y": [1, 1, 1, 1, 0, 0], + }, + "peparc": { + "A": [1, 0, 0, 0, 0], + "C": [0, 1, 0, 0, 0], + "D": [0, 1, 0, 1, 0], + "E": [0, 1, 0, 1, 0], + "F": [1, 0, 0, 0, 0], + "G": [0, 0, 0, 0, 0], + "H": [0, 1, 1, 0, 0], + "I": [1, 0, 0, 0, 0], + "K": [0, 1, 1, 0, 0], + "L": [1, 0, 0, 0, 0], + "M": [1, 0, 0, 0, 0], + "N": [0, 1, 0, 0, 0], + "P": [0, 0, 0, 0, 1], + "Q": [0, 1, 0, 0, 0], + "R": [0, 1, 1, 0, 0], + "S": [0, 
1, 0, 0, 0], + "T": [0, 1, 0, 0, 0], + "V": [1, 0, 0, 0, 0], + "W": [1, 0, 0, 0, 0], + "Y": [1, 0, 0, 0, 0], + }, + "polarity": { + "A": [0.395], + "C": [0.074], + "D": [1.0], + "E": [0.914], + "F": [0.037], + "G": [0.506], + "H": [0.679], + "I": [0.037], + "K": [0.79], + "L": [0.0], + "M": [0.099], + "N": [0.827], + "P": [0.383], + "Q": [0.691], + "R": [0.691], + "S": [0.531], + "T": [0.457], + "V": [0.123], + "W": [0.062], + "Y": [0.16], + }, + "ppcali": { + "A": [ + 0.070781, + 0.036271, + 2.042, + 0.083272, + 0.69089, + 0.15948, + -0.80893, + 0.24698, + 0.86525, + 0.68563, + -0.24665, + 0.61314, + -0.53343, + -0.50878, + -1.3646, + 2.2679, + -1.5644, + -0.75043, + -0.65875, + ], + "C": [ + 0.61013, + -0.93043, + -0.85983, + -2.2704, + 1.5877, + -2.0066, + -0.30314, + 1.2544, + -0.2832, + -1.2844, + -0.73449, + -0.11235, + -0.41152, + -0.0050164, + 0.28307, + 0.20522, + -0.021084, + -0.15627, + -0.32689, + ], + "D": [ + -1.3215, + 0.24063, + -0.032754, + -0.37863, + 1.2051, + 1.0001, + 2.1827, + 0.19212, + -0.60529, + 0.37639, + -0.46451, + -0.46788, + 1.4077, + -2.1661, + 0.72604, + -0.12332, + -0.8243, + -0.082989, + 0.053476, + ], + "E": [ + -0.87713, + 1.4905, + 1.0755, + 0.35944, + 1.567, + 0.41365, + 1.0944, + 0.72634, + -0.74957, + 0.038939, + 0.075057, + 0.78637, + -1.4543, + 1.6667, + -0.097439, + -0.24293, + 1.7687, + 0.36174, + -0.11585, + ], + "F": [ + 1.3557, + -0.10336, + -0.4309, + 0.41269, + -0.083356, + 0.83783, + 0.095381, + -0.65222, + -0.3119, + 0.43293, + -1.0011, + -0.66855, + -0.10242, + 1.2066, + 2.6234, + 1.9981, + -0.25016, + 0.71979, + 0.21569, + ], + "G": [ + -1.0818, + -2.1561, + 0.77082, + -0.92747, + -1.0748, + 1.7997, + -1.3708, + 1.279, + -1.2098, + 0.46065, + 0.43076, + 0.20037, + -0.2302, + 0.2646, + 0.57149, + -0.68432, + 0.19341, + -0.061606, + -0.08071, + ], + "H": [ + -0.050161, + 0.69246, + -0.88397, + -0.64601, + 0.24622, + 0.10487, + -1.1317, + -2.3661, + -0.89918, + 0.46391, + -0.62359, + 2.5478, + -0.34737, + -0.52062, + 0.17522, + -0.88648, + -0.4755, + 0.023187, + -0.28261, + ], + "I": [ + 1.4829, + -0.46435, + 0.50189, + 0.55724, + -0.51535, + -0.29914, + 0.97236, + -0.15793, + -0.98246, + -0.54347, + 0.97806, + 0.37577, + 1.618, + 0.62323, + -0.59359, + -0.35483, + -0.085017, + 0.55825, + -2.7542, + ], + "K": [ + -0.85344, + 1.529, + 0.27747, + 0.32993, + -1.1786, + -0.16633, + -1.0459, + 0.44621, + 0.41027, + -2.5318, + 0.91329, + 0.53385, + 0.61417, + -1.111, + 1.1323, + 0.95105, + 0.76769, + -0.016115, + 0.054995, + ], + "L": [ + 1.2857, + 0.039488, + 1.5378, + 0.87969, + -0.21419, + 0.40389, + -0.20426, + -0.14351, + 0.61024, + -1.1927, + -2.2149, + -0.84248, + -0.5061, + -0.48548, + 0.10791, + -2.1503, + -0.12006, + -0.60222, + 0.26546, + ], + "M": [ + 1.137, + 0.64388, + 0.13724, + -0.2988, + 1.2288, + 0.24981, + -1.6427, + -0.75868, + -0.54902, + 1.0571, + 1.272, + -1.9104, + 0.70919, + -0.93575, + -0.6314, + -0.079654, + 1.634, + -0.0021923, + 0.49825, + ], + "N": [ + -1.084, + -0.176, + -0.47062, + -0.92245, + -0.32953, + 0.74278, + 0.34551, + -1.4605, + 0.25219, + -1.2107, + -0.59978, + -0.79183, + 1.3268, + 1.9839, + -1.6137, + 0.5333, + 0.033889, + -1.0331, + 0.83019, + ], + "P": [ + -1.1823, + -1.6911, + -1.1331, + 3.073, + 1.1942, + -0.93426, + -0.72985, + -0.042441, + -0.19264, + -0.21603, + -0.1239, + 0.054016, + 0.15241, + -0.019691, + -0.20543, + 0.10206, + 0.07671, + -0.081968, + 0.20348, + ], + "Q": [ + -0.57747, + 0.97452, + -0.077547, + -0.0033488, + 0.17184, + -0.52537, + -0.27362, + -0.1366, + 0.2057, + -0.013066, + 
1.8834, + -1.2736, + -0.84991, + 1.0445, + 0.69027, + -1.2866, + -2.6776, + 0.1683, + 0.086105, + ], + "R": [ + -0.62245, + 1.545, + -0.61966, + 0.19057, + -1.7485, + -1.3909, + -0.47526, + 1.3938, + -0.84556, + 1.7344, + -1.6516, + -0.52678, + 0.6791, + 0.24374, + -0.62551, + -0.0028271, + -0.053884, + 0.14926, + -0.17232, + ], + "S": [ + -0.86409, + -0.77147, + 0.38542, + -0.59389, + -0.53313, + -0.47585, + 0.31966, + -0.89716, + 1.8029, + 0.26431, + -0.23173, + -0.37626, + -0.47349, + -0.42878, + -0.47297, + -0.079826, + 0.57043, + 3.2057, + -0.18413, + ], + "T": [ + -0.33027, + -0.57447, + 0.18653, + -0.28941, + -0.62681, + -1.0737, + 0.80363, + -0.59525, + 1.8786, + 1.3971, + 0.63929, + 0.21281, + -0.067048, + 0.096271, + 1.323, + -0.36173, + 1.2261, + -2.2771, + -0.65412, + ], + "V": [ + 1.1675, + -0.61554, + 0.95405, + 0.11662, + -0.74473, + -1.1482, + 1.1309, + 0.12079, + -0.77171, + 0.18597, + 0.93442, + 1.201, + 0.3826, + -0.091573, + -0.31269, + 0.074367, + -0.22946, + 0.24322, + 2.9836, + ], + "W": [ + 1.1881, + 0.43789, + -1.7915, + 0.138, + 0.43088, + 1.6467, + -0.11987, + 1.7369, + 2.0818, + 0.33122, + 0.31829, + 1.1586, + 0.67649, + 0.30819, + -0.55772, + -0.54491, + -0.17969, + 0.24477, + 0.38674, + ], + "Y": [ + 0.54671, + -0.1468, + -1.5688, + 0.19001, + -1.2736, + 0.66162, + 1.1614, + -0.18614, + -0.70654, + -0.43634, + 0.44775, + -0.71366, + -2.5907, + -1.1649, + -1.1576, + 0.66572, + 0.21019, + -0.61016, + -0.34844, + ], + }, + "refractivity": { + "A": [0.102045615], + "C": [0.841053374], + "D": [0.282153774], + "E": [0.405831178], + "F": [0.691276746], + "G": [0], + "H": [0.512814484], + "I": [0.448154244], + "K": [0.50058782], + "L": [0.441570656], + "M": [0.508817305], + "N": [0.282153774], + "P": [0.256995062], + "Q": [0.405831178], + "R": [0.626851634], + "S": [0.149306372], + "T": [0.258876087], + "V": [0.327298378], + "W": [1], + "Y": [0.741359041], + }, + "t_scale": { + "A": [-8.4, -8.01, -3.73, -3.65, -6.12, -1.59, 1.56], + "C": [-2.44, -1.96, 0.93, -2.35, 1.31, 2.29, -1.52], + "D": [-6.84, -0.94, 17.68, -0.03, 3.44, 9.07, 4.32], + "E": [-6.5, 16.2, 17.28, 3.11, -4.75, -2.54, 4.72], + "F": [21.59, -5.73, 1.03, -3.3, 2.64, -5.02, 1.7], + "G": [-8.48, -10.37, -5.14, -6.51, -11.84, -3.6, 2.01], + "H": [15.28, -3.67, 6.72, -6.38, 4.12, -1.55, -2.85], + "I": [-2.97, 4.64, -0.77, 11, 3.26, -4.36, -7.88], + "K": [2.7, 13.46, -14.03, -2.55, 2.77, 0.15, 3.19], + "L": [2.61, 5.96, 1.97, 2.59, -4.77, -4.84, -5.44], + "M": [3.38, 12.43, -4.77, 0.45, -1.55, -0.6, 3.26], + "N": [-3.11, -1.22, 6.26, -9.38, 9.94, 7.66, -4.81], + "P": [-5.35, -9.07, -1.52, -8.79, -8.73, 4.29, -9.91], + "Q": [-5.31, 15.64, 8.44, 1.03, -4.32, -4.4, -0.52], + "R": [-2.27, 18.9, -18.24, -3.47, 3.03, 6.64, 0.45], + "S": [-15.88, -11.21, -2.44, -3.61, 3.46, -0.37, 8.98], + "T": [-17.81, -13.64, -5.19, 10.57, 6.91, -4.43, 3.49], + "V": [-5.8, -6.15, -2.26, 9.87, 5.28, -1.49, -7.54], + "W": [21.68, -8.78, -2.53, 15.53, -8.15, 11.98, 3.23], + "Y": [23.9, -6.47, 0.31, -4.14, 4.08, -7.28, 3.59], + }, + "tm_tend": { + "A": [0.38], + "C": [-0.3], + "D": [-3.27], + "E": [-2.9], + "F": [1.98], + "G": [-0.19], + "H": [-1.44], + "I": [1.97], + "K": [-3.46], + "L": [1.82], + "M": [1.4], + "N": [-1.62], + "P": [-1.44], + "Q": [-1.84], + "R": [-2.57], + "S": [-0.53], + "T": [-0.32], + "V": [1.46], + "W": [1.53], + "Y": [0.49], + }, + "z3": { + "A": [0.07, -1.73, 0.09], + "C": [0.71, -0.97, 4.13], + "D": [3.64, 1.13, 2.36], + "E": [3.08, 0.39, -0.07], + "F": [-4.92, 1.3, 0.45], + "G": [2.23, -5.36, 0.3], + "H": 
[2.41, 1.74, 1.11], + "I": [-4.44, -1.68, -1.03], + "K": [2.84, 1.41, -3.14], + "L": [-4.19, -1.03, -0.98], + "M": [-2.49, -0.27, -0.41], + "N": [3.22, 1.45, 0.84], + "P": [-1.22, 0.88, 2.23], + "Q": [2.18, 0.53, -1.14], + "R": [2.88, 2.52, -3.44], + "S": [1.96, -1.63, 0.57], + "T": [0.92, -2.09, -1.4], + "V": [-2.69, -2.53, -1.29], + "W": [-4.75, 3.65, 0.85], + "Y": [-1.39, 2.32, 0.01], + }, + "z5": { + "A": [0.24, -2.32, 0.6, -0.14, 1.3], + "C": [0.84, -1.67, 3.71, 0.18, -2.65], + "D": [3.98, 0.93, 1.93, -2.46, 0.75], + "E": [3.11, 0.26, -0.11, -3.04, -0.25], + "F": [-4.22, 1.94, 1.06, 0.54, -0.62], + "G": [2.05, -4.06, 0.36, -0.82, -0.38], + "H": [2.47, 1.95, 0.26, 3.9, 0.09], + "I": [-3.89, -1.73, -1.71, -0.84, 0.26], + "K": [2.29, 0.89, -2.49, 1.49, 0.31], + "L": [-4.28, -1.3, -1.49, -0.72, 0.84], + "M": [-2.85, -0.22, 0.47, 1.94, -0.98], + "N": [3.05, 1.62, 1.04, -1.15, 1.61], + "P": [-1.66, 0.27, 1.84, 0.7, 2], + "Q": [1.75, 0.5, -1.44, -1.34, 0.66], + "R": [3.52, 2.5, -3.5, 1.99, -0.17], + "S": [2.39, -1.07, 1.15, -1.39, 0.67], + "T": [0.75, -2.18, -1.12, -1.46, -0.4], + "V": [-2.59, -2.64, -1.54, -0.85, -0.02], + "W": [-4.36, 3.94, 0.59, 3.44, -1.59], + "Y": [-2.54, 2.44, 0.43, 0.04, -1.47], + }, } - if scalename == 'all': - d = {'I': [], 'F': [], 'V': [], 'L': [], 'W': [], 'M': [], 'A': [], 'G': [], 'C': [], 'Y': [], 'P': [], - 'T': [], 'S': [], 'H': [], 'E': [], 'N': [], 'Q': [], 'D': [], 'K': [], 'R': []} + if scalename == "all": + d = { + "I": [], + "F": [], + "V": [], + "L": [], + "W": [], + "M": [], + "A": [], + "G": [], + "C": [], + "Y": [], + "P": [], + "T": [], + "S": [], + "H": [], + "E": [], + "N": [], + "Q": [], + "D": [], + "K": [], + "R": [], + } for scale in scales.keys(): for k, v in scales[scale].items(): d[k].extend(v) - return 'all', d + return "all", d - elif scalename == 'instability': + elif scalename == "instability": d = { - "A": {"A": 1.0, "C": 44.94, "E": 1.0, "D": -7.49, "G": 1.0, "F": 1.0, "I": 1.0, "H": -7.49, "K": 1.0, - "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0, "V": 1.0, - "Y": 1.0}, - "C": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 20.26, "G": 1.0, "F": 1.0, "I": 1.0, "H": 33.6, "K": 1.0, - "M": 33.6, "L": 20.26, "N": 1.0, "Q": -6.54, "P": 20.26, "S": 1.0, "R": 1.0, "T": 33.6, "W": 24.68, - "V": -6.54, "Y": 1.0}, - "E": {"A": 1.0, "C": 44.94, "E": 33.6, "D": 20.26, "G": 1.0, "F": 1.0, "I": 20.26, "H": -6.54, "K": 1.0, - "M": 1.0, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 20.26, "R": 1.0, "T": 1.0, "W": -14.03, - "V": 1.0, "Y": 1.0}, - "D": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, "K": -7.49, - "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 1.0, "S": 20.26, "R": -6.54, "T": -14.03, "W": 1.0, - "V": 1.0, "Y": 1.0}, - "G": {"A": -7.49, "C": 1.0, "E": -6.54, "D": 1.0, "G": 13.34, "F": 1.0, "I": -7.49, "H": 1.0, "K": -7.49, - "M": 1.0, "L": 1.0, "N": -7.49, "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, "T": -7.49, "W": 13.34, - "V": 1.0, "Y": -7.49}, - "F": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 13.34, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": -14.03, - "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0, "V": 1.0, - "Y": 33.601}, - "I": {"A": 1.0, "C": 1.0, "E": 44.94, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 13.34, "K": -7.49, - "M": 1.0, "L": 20.26, "N": 1.0, "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0, - "V": -7.49, "Y": 1.0}, - "H": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -9.37, "F": -9.37, "I": 44.94, 
"H": 1.0, "K": 24.68, - "M": 1.0, "L": 1.0, "N": 24.68, "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, "T": -6.54, "W": -1.88, - "V": 1.0, "Y": 44.94}, - "K": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -7.49, "F": 1.0, "I": -7.49, "H": 1.0, "K": 1.0, - "M": 33.6, "L": -7.49, "N": 1.0, "Q": 24.64, "P": -6.54, "S": 1.0, "R": 33.6, "T": 1.0, "W": 1.0, - "V": -7.49, "Y": 1.0}, - "M": {"A": 13.34, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 58.28, "K": 1.0, - "M": -1.88, "L": 1.0, "N": 1.0, "Q": -6.54, "P": 44.94, "S": 44.94, "R": -6.54, "T": -1.88, "W": 1.0, - "V": 1.0, "Y": 24.68}, - "L": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": -7.49, "M": 1.0, - "L": 1.0, "N": 1.0, "Q": 33.6, "P": 20.26, "S": 1.0, "R": 20.26, "T": 1.0, "W": 24.68, "V": 1.0, - "Y": 1.0}, - "N": {"A": 1.0, "C": -1.88, "E": 1.0, "D": 1.0, "G": -14.03, "F": -14.03, "I": 44.94, "H": 1.0, "K": 24.68, - "M": 1.0, "L": 1.0, "N": 1.0, "Q": -6.54, "P": -1.88, "S": 1.0, "R": 1.0, "T": -7.49, "W": -9.37, - "V": 1.0, "Y": 1.0}, - "Q": {"A": 1.0, "C": -6.54, "E": 20.26, "D": 20.26, "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, "K": 1.0, - "M": 1.0, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 44.94, "R": 1.0, "T": 1.0, "W": 1.0, - "V": -6.54, "Y": -6.54}, - "P": {"A": 20.26, "C": -6.54, "E": 18.38, "D": -6.54, "G": 1.0, "F": 20.26, "I": 1.0, "H": 1.0, "K": 1.0, - "M": -6.54, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 20.26, "R": -6.54, "T": 1.0, "W": -1.88, - "V": 20.26, "Y": 1.0}, - "S": {"A": 1.0, "C": 33.6, "E": 20.26, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": 1.0, "M": 1.0, - "L": 1.0, "N": 1.0, "Q": 20.26, "P": 44.94, "S": 20.26, "R": 20.26, "T": 1.0, "W": 1.0, "V": 1.0, - "Y": 1.0}, - "R": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -7.49, "F": 1.0, "I": 1.0, "H": 20.26, "K": 1.0, - "M": 1.0, "L": 1.0, "N": 13.34, "Q": 20.26, "P": 20.26, "S": 44.94, "R": 58.28, "T": 1.0, "W": 58.28, - "V": 1.0, "Y": -6.54}, - "T": {"A": 1.0, "C": 1.0, "E": 20.26, "D": 1.0, "G": -7.49, "F": 13.34, "I": 1.0, "H": 1.0, "K": 1.0, - "M": 1.0, "L": 1.0, "N": -14.03, "Q": -6.54, "P": 1.0, "S": 1.0, "R": 1.0, "T": 1.0, "W": -14.03, - "V": 1.0, "Y": 1.0}, - "W": {"A": -14.03, "C": 1.0, "E": 1.0, "D": 1.0, "G": -9.37, "F": 1.0, "I": 1.0, "H": 24.68, "K": 1.0, - "M": 24.68, "L": 13.34, "N": 13.34, "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, "T": -14.03, "W": 1.0, - "V": -7.49, "Y": 1.0}, - "V": {"A": 1.0, "C": 1.0, "E": 1.0, "D": -14.03, "G": -7.49, "F": 1.0, "I": 1.0, "H": 1.0, "K": -1.88, - "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": -7.49, "W": 1.0, - "V": 1.0, "Y": -6.54}, - "Y": {"A": 24.68, "C": 1.0, "E": -6.54, "D": 24.68, "G": -7.49, "F": 1.0, "I": 1.0, "H": 13.34, "K": 1.0, - "M": 44.94, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 13.34, "S": 1.0, "R": -15.91, "T": -7.49, "W": -9.37, - "V": 1.0, "Y": 13.34}} - return 'instability', d + "A": { + "A": 1.0, + "C": 44.94, + "E": 1.0, + "D": -7.49, + "G": 1.0, + "F": 1.0, + "I": 1.0, + "H": -7.49, + "K": 1.0, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": 1.0, + "P": 20.26, + "S": 1.0, + "R": 1.0, + "T": 1.0, + "W": 1.0, + "V": 1.0, + "Y": 1.0, + }, + "C": { + "A": 1.0, + "C": 1.0, + "E": 1.0, + "D": 20.26, + "G": 1.0, + "F": 1.0, + "I": 1.0, + "H": 33.6, + "K": 1.0, + "M": 33.6, + "L": 20.26, + "N": 1.0, + "Q": -6.54, + "P": 20.26, + "S": 1.0, + "R": 1.0, + "T": 33.6, + "W": 24.68, + "V": -6.54, + "Y": 1.0, + }, + "E": { + "A": 1.0, + "C": 44.94, + "E": 33.6, + "D": 20.26, + "G": 1.0, + "F": 1.0, + "I": 
20.26, + "H": -6.54, + "K": 1.0, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": 20.26, + "P": 20.26, + "S": 20.26, + "R": 1.0, + "T": 1.0, + "W": -14.03, + "V": 1.0, + "Y": 1.0, + }, + "D": { + "A": 1.0, + "C": 1.0, + "E": 1.0, + "D": 1.0, + "G": 1.0, + "F": -6.54, + "I": 1.0, + "H": 1.0, + "K": -7.49, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": 1.0, + "P": 1.0, + "S": 20.26, + "R": -6.54, + "T": -14.03, + "W": 1.0, + "V": 1.0, + "Y": 1.0, + }, + "G": { + "A": -7.49, + "C": 1.0, + "E": -6.54, + "D": 1.0, + "G": 13.34, + "F": 1.0, + "I": -7.49, + "H": 1.0, + "K": -7.49, + "M": 1.0, + "L": 1.0, + "N": -7.49, + "Q": 1.0, + "P": 1.0, + "S": 1.0, + "R": 1.0, + "T": -7.49, + "W": 13.34, + "V": 1.0, + "Y": -7.49, + }, + "F": { + "A": 1.0, + "C": 1.0, + "E": 1.0, + "D": 13.34, + "G": 1.0, + "F": 1.0, + "I": 1.0, + "H": 1.0, + "K": -14.03, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": 1.0, + "P": 20.26, + "S": 1.0, + "R": 1.0, + "T": 1.0, + "W": 1.0, + "V": 1.0, + "Y": 33.601, + }, + "I": { + "A": 1.0, + "C": 1.0, + "E": 44.94, + "D": 1.0, + "G": 1.0, + "F": 1.0, + "I": 1.0, + "H": 13.34, + "K": -7.49, + "M": 1.0, + "L": 20.26, + "N": 1.0, + "Q": 1.0, + "P": -1.88, + "S": 1.0, + "R": 1.0, + "T": 1.0, + "W": 1.0, + "V": -7.49, + "Y": 1.0, + }, + "H": { + "A": 1.0, + "C": 1.0, + "E": 1.0, + "D": 1.0, + "G": -9.37, + "F": -9.37, + "I": 44.94, + "H": 1.0, + "K": 24.68, + "M": 1.0, + "L": 1.0, + "N": 24.68, + "Q": 1.0, + "P": -1.88, + "S": 1.0, + "R": 1.0, + "T": -6.54, + "W": -1.88, + "V": 1.0, + "Y": 44.94, + }, + "K": { + "A": 1.0, + "C": 1.0, + "E": 1.0, + "D": 1.0, + "G": -7.49, + "F": 1.0, + "I": -7.49, + "H": 1.0, + "K": 1.0, + "M": 33.6, + "L": -7.49, + "N": 1.0, + "Q": 24.64, + "P": -6.54, + "S": 1.0, + "R": 33.6, + "T": 1.0, + "W": 1.0, + "V": -7.49, + "Y": 1.0, + }, + "M": { + "A": 13.34, + "C": 1.0, + "E": 1.0, + "D": 1.0, + "G": 1.0, + "F": 1.0, + "I": 1.0, + "H": 58.28, + "K": 1.0, + "M": -1.88, + "L": 1.0, + "N": 1.0, + "Q": -6.54, + "P": 44.94, + "S": 44.94, + "R": -6.54, + "T": -1.88, + "W": 1.0, + "V": 1.0, + "Y": 24.68, + }, + "L": { + "A": 1.0, + "C": 1.0, + "E": 1.0, + "D": 1.0, + "G": 1.0, + "F": 1.0, + "I": 1.0, + "H": 1.0, + "K": -7.49, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": 33.6, + "P": 20.26, + "S": 1.0, + "R": 20.26, + "T": 1.0, + "W": 24.68, + "V": 1.0, + "Y": 1.0, + }, + "N": { + "A": 1.0, + "C": -1.88, + "E": 1.0, + "D": 1.0, + "G": -14.03, + "F": -14.03, + "I": 44.94, + "H": 1.0, + "K": 24.68, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": -6.54, + "P": -1.88, + "S": 1.0, + "R": 1.0, + "T": -7.49, + "W": -9.37, + "V": 1.0, + "Y": 1.0, + }, + "Q": { + "A": 1.0, + "C": -6.54, + "E": 20.26, + "D": 20.26, + "G": 1.0, + "F": -6.54, + "I": 1.0, + "H": 1.0, + "K": 1.0, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": 20.26, + "P": 20.26, + "S": 44.94, + "R": 1.0, + "T": 1.0, + "W": 1.0, + "V": -6.54, + "Y": -6.54, + }, + "P": { + "A": 20.26, + "C": -6.54, + "E": 18.38, + "D": -6.54, + "G": 1.0, + "F": 20.26, + "I": 1.0, + "H": 1.0, + "K": 1.0, + "M": -6.54, + "L": 1.0, + "N": 1.0, + "Q": 20.26, + "P": 20.26, + "S": 20.26, + "R": -6.54, + "T": 1.0, + "W": -1.88, + "V": 20.26, + "Y": 1.0, + }, + "S": { + "A": 1.0, + "C": 33.6, + "E": 20.26, + "D": 1.0, + "G": 1.0, + "F": 1.0, + "I": 1.0, + "H": 1.0, + "K": 1.0, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": 20.26, + "P": 44.94, + "S": 20.26, + "R": 20.26, + "T": 1.0, + "W": 1.0, + "V": 1.0, + "Y": 1.0, + }, + "R": { + "A": 1.0, + "C": 1.0, + "E": 1.0, + "D": 1.0, + "G": -7.49, + "F": 1.0, + "I": 1.0, + "H": 20.26, + "K": 1.0, + "M": 1.0, + "L": 1.0, 
+ "N": 13.34, + "Q": 20.26, + "P": 20.26, + "S": 44.94, + "R": 58.28, + "T": 1.0, + "W": 58.28, + "V": 1.0, + "Y": -6.54, + }, + "T": { + "A": 1.0, + "C": 1.0, + "E": 20.26, + "D": 1.0, + "G": -7.49, + "F": 13.34, + "I": 1.0, + "H": 1.0, + "K": 1.0, + "M": 1.0, + "L": 1.0, + "N": -14.03, + "Q": -6.54, + "P": 1.0, + "S": 1.0, + "R": 1.0, + "T": 1.0, + "W": -14.03, + "V": 1.0, + "Y": 1.0, + }, + "W": { + "A": -14.03, + "C": 1.0, + "E": 1.0, + "D": 1.0, + "G": -9.37, + "F": 1.0, + "I": 1.0, + "H": 24.68, + "K": 1.0, + "M": 24.68, + "L": 13.34, + "N": 13.34, + "Q": 1.0, + "P": 1.0, + "S": 1.0, + "R": 1.0, + "T": -14.03, + "W": 1.0, + "V": -7.49, + "Y": 1.0, + }, + "V": { + "A": 1.0, + "C": 1.0, + "E": 1.0, + "D": -14.03, + "G": -7.49, + "F": 1.0, + "I": 1.0, + "H": 1.0, + "K": -1.88, + "M": 1.0, + "L": 1.0, + "N": 1.0, + "Q": 1.0, + "P": 20.26, + "S": 1.0, + "R": 1.0, + "T": -7.49, + "W": 1.0, + "V": 1.0, + "Y": -6.54, + }, + "Y": { + "A": 24.68, + "C": 1.0, + "E": -6.54, + "D": 24.68, + "G": -7.49, + "F": 1.0, + "I": 1.0, + "H": 13.34, + "K": 1.0, + "M": 44.94, + "L": 1.0, + "N": 1.0, + "Q": 1.0, + "P": 13.34, + "S": 1.0, + "R": -15.91, + "T": -7.49, + "W": -9.37, + "V": 1.0, + "Y": 13.34, + }, + } + return "instability", d else: return scalename, scales[scalename] @@ -1074,8 +2894,10 @@ def read_fasta(inputfile): all = f.readlines() last = all[-1] for line in all: - if line.startswith('>'): - names.append(line.split(' ')[0][1:].strip()) # add FASTA name without description as molecule name + if line.startswith(">"): + names.append( + line.split(" ")[0][1:].strip() + ) # add FASTA name without description as molecule name sequences.append(seq.strip()) seq = str() elif line == last: @@ -1097,13 +2919,13 @@ def save_fasta(filename, sequences, names=None): if os.path.exists(filename): os.remove(filename) # remove outputfile, it it exists - with open(filename, 'w') as o: + with open(filename, "w") as o: for n, seq in enumerate(sequences): if names: - o.write('>' + str(names[n]) + '\n') + o.write(">" + str(names[n]) + "\n") else: - o.write('>Seq_' + str(n) + '\n') - o.write(seq + '\n') + o.write(">Seq_" + str(n) + "\n") + o.write(seq + "\n") def aa_weights(): @@ -1113,25 +2935,64 @@ def aa_weights(): .. versionadded:: v2.4.1 """ - weights = {'A': 89.093, 'C': 121.158, 'D': 133.103, 'E': 147.129, 'F': 165.189, 'G': 75.067, - 'H': 155.155, 'I': 131.173, 'K': 146.188, 'L': 131.173, 'M': 149.211, 'N': 132.118, - 'P': 115.131, 'Q': 146.145, 'R': 174.20, 'S': 105.093, 'T': 119.119, 'V': 117.146, - 'W': 204.225, 'Y': 181.189} + weights = { + "A": 89.093, + "C": 121.158, + "D": 133.103, + "E": 147.129, + "F": 165.189, + "G": 75.067, + "H": 155.155, + "I": 131.173, + "K": 146.188, + "L": 131.173, + "M": 149.211, + "N": 132.118, + "P": 115.131, + "Q": 146.145, + "R": 174.20, + "S": 105.093, + "T": 119.119, + "V": 117.146, + "W": 204.225, + "Y": 181.189, + } return weights -def count_aas(seq, scale='relative'): +def count_aas(seq, scale="relative"): """Function to count the amino acids occuring in a given sequence. :param seq: {str} amino acid sequence :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA :return: {dict} dictionary with amino acids as keys and their counts in the sequence as values. """ - if seq == '': # error if len(seq) == 0 - seq = ' ' - aas = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] - scl = 1. 
- if scale == 'relative': + if seq == "": # error if len(seq) == 0 + seq = " " + aas = [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + ] + scl = 1.0 + if scale == "relative": scl = len(seq) aa = {a: (float(seq.count(a)) / scl) for a in aas} aa = collections.OrderedDict(sorted(list(aa.items()))) @@ -1145,15 +3006,17 @@ def count_ngrams(seq, n): :param n: {int or list of ints} defines whether counts or frequencies are given for each AA :return: {dict} dictionary with n-grams as keys and their counts in the sequence as values. """ - if seq == '': - seq = ' ' + if seq == "": + seq = " " if isinstance(n, int): n = [n] ngrams = list() for i in n: - ngrams.extend([seq[j:j+i] for j in range(len(seq) - (i-1))]) + ngrams.extend([seq[j : j + i] for j in range(len(seq) - (i - 1))]) counts = {g: (seq.count(g)) for g in set(ngrams)} - counts = collections.OrderedDict(sorted(counts.items(), key=operator.itemgetter(1), reverse=True)) + counts = collections.OrderedDict( + sorted(counts.items(), key=operator.itemgetter(1), reverse=True) + ) return counts @@ -1163,9 +3026,28 @@ def aa_energies(): :return: dictionary with amino acid letters and corresponding energies. """ - energies = {'L': -4.92, 'I': -4.92, 'V': -4.04, 'F': -2.98, 'M': -2.35, 'W': -2.33, 'A': -1.81, 'C': -1.28, - 'G': -0.94, 'Y': 0.14, 'T': 2.57, 'S': 3.40, 'H': 4.66, 'Q': 5.54, 'K': 5.55, 'N': 6.64, 'E': 6.81, - 'D': 8.72, 'R': 14.92, 'P': 0.} + energies = { + "L": -4.92, + "I": -4.92, + "V": -4.04, + "F": -2.98, + "M": -2.35, + "W": -2.33, + "A": -1.81, + "C": -1.28, + "G": -0.94, + "Y": 0.14, + "T": 2.57, + "S": 3.40, + "H": 4.66, + "Q": 5.54, + "K": 5.55, + "N": 6.64, + "E": 6.81, + "D": 8.72, + "R": 14.92, + "P": 0.0, + } return energies @@ -1178,20 +3060,142 @@ def ngrams_apd(): :return: numpy.array containing most frequent ngrams """ - return np.array(['AGK', 'CKI', 'RR', 'YGGG', 'LSGL', 'RG', 'YGGY', 'PRP', 'LGGG', - 'GV', 'GT', 'GS', 'GR', 'IAG', 'GG', 'GF', 'GC', 'GGYG', 'GA', 'GL', - 'GK', 'GI', 'IPC', 'KAA', 'LAK', 'GLGG', 'GGLG', 'CKIT', 'GAGK', - 'LLSG', 'LKK', 'FLP', 'LSG', 'SCK', 'LLS', 'GETC', 'VLG', 'GKLL', - 'LLG', 'C', 'KCKI', 'G', 'VGK', 'CSC', 'TKKC', 'GCS', 'GKA', 'IGK', - 'GESC', 'KVCY', 'KKL', 'KKI', 'KKC', 'LGGL', 'GLL', 'CGE', 'GGYC', - 'GLLS', 'GLF', 'AKK', 'GKAA', 'ESCV', 'GLP', 'CGES', 'PCGE', 'FL', - 'CGET', 'GLW', 'KGAA', 'KAAL', 'GGY', 'GGG', 'IKG', 'LKG', 'GGL', - 'CK', 'GTC', 'CG', 'SKKC', 'CS', 'CR', 'KC', 'AGKA', 'KA', 'KG', - 'LKCK', 'SCKL', 'KK', 'KI', 'KN', 'KL', 'SK', 'KV', 'SL', 'SC', - 'SG', 'AAA', 'VAK', 'AAL', 'AAK', 'GGGG', 'KNVA', 'GGGL', 'GYG', - 'LG', 'LA', 'LL', 'LK', 'LS', 'LP', 'GCSC', 'TC', 'GAA', 'AA', 'VA', - 'VC', 'AG', 'VG', 'AI', 'AK', 'VL', 'AL', 'TPGC', 'IK', 'IA', 'IG', - 'YGG', 'LGK', 'CSCK', 'GYGG', 'LGG', 'KGA']) + return np.array( + [ + "AGK", + "CKI", + "RR", + "YGGG", + "LSGL", + "RG", + "YGGY", + "PRP", + "LGGG", + "GV", + "GT", + "GS", + "GR", + "IAG", + "GG", + "GF", + "GC", + "GGYG", + "GA", + "GL", + "GK", + "GI", + "IPC", + "KAA", + "LAK", + "GLGG", + "GGLG", + "CKIT", + "GAGK", + "LLSG", + "LKK", + "FLP", + "LSG", + "SCK", + "LLS", + "GETC", + "VLG", + "GKLL", + "LLG", + "C", + "KCKI", + "G", + "VGK", + "CSC", + "TKKC", + "GCS", + "GKA", + "IGK", + "GESC", + "KVCY", + "KKL", + "KKI", + "KKC", + "LGGL", + "GLL", + "CGE", + "GGYC", + "GLLS", + "GLF", + "AKK", + "GKAA", + "ESCV", + "GLP", + "CGES", + "PCGE", + "FL", + "CGET", + "GLW", + "KGAA", + "KAAL", + "GGY", + "GGG", + 
"IKG", + "LKG", + "GGL", + "CK", + "GTC", + "CG", + "SKKC", + "CS", + "CR", + "KC", + "AGKA", + "KA", + "KG", + "LKCK", + "SCKL", + "KK", + "KI", + "KN", + "KL", + "SK", + "KV", + "SL", + "SC", + "SG", + "AAA", + "VAK", + "AAL", + "AAK", + "GGGG", + "KNVA", + "GGGL", + "GYG", + "LG", + "LA", + "LL", + "LK", + "LS", + "LP", + "GCSC", + "TC", + "GAA", + "AA", + "VA", + "VC", + "AG", + "VG", + "AI", + "AK", + "VL", + "AL", + "TPGC", + "IK", + "IA", + "IG", + "YGG", + "LGK", + "CSCK", + "GYGG", + "LGG", + "KGA", + ] + ) def aa_formulas(): @@ -1199,25 +3203,26 @@ def aa_formulas(): Function returning the molecular formulas of all amino acids. All amino acids are considered in the neutral form (uncharged). """ - formulas = {'A': {'C': 3, 'H': 7, 'N': 1, 'O': 2, 'S': 0}, - 'C': {'C': 3, 'H': 7, 'N': 1, 'O': 2, 'S': 1}, - 'D': {'C': 4, 'H': 7, 'N': 1, 'O': 4, 'S': 0}, - 'E': {'C': 5, 'H': 9, 'N': 1, 'O': 4, 'S': 0}, - 'F': {'C': 9, 'H': 11, 'N': 1, 'O': 2, 'S': 0}, - 'G': {'C': 2, 'H': 5, 'N': 1, 'O': 2, 'S': 0}, - 'H': {'C': 6, 'H': 9, 'N': 3, 'O': 2, 'S': 0}, - 'I': {'C': 6, 'H': 13, 'N': 1, 'O': 2, 'S': 0}, - 'K': {'C': 6, 'H': 14, 'N': 2, 'O': 2, 'S': 0}, - 'L': {'C': 6, 'H': 13, 'N': 1, 'O': 2, 'S': 0}, - 'M': {'C': 5, 'H': 11, 'N': 1, 'O': 2, 'S': 1}, - 'N': {'C': 4, 'H': 8, 'N': 2, 'O': 3, 'S': 0}, - 'P': {'C': 5, 'H': 9, 'N': 1, 'O': 2, 'S': 0}, - 'Q': {'C': 5, 'H': 10, 'N': 2, 'O': 3, 'S': 0}, - 'R': {'C': 6, 'H': 14, 'N': 4, 'O': 2, 'S': 0}, - 'S': {'C': 3, 'H': 7, 'N': 1, 'O': 3, 'S': 0}, - 'T': {'C': 4, 'H': 9, 'N': 1, 'O': 3, 'S': 0}, - 'V': {'C': 5, 'H': 11, 'N': 1, 'O': 2, 'S': 0}, - 'W': {'C': 11, 'H': 12, 'N': 2, 'O': 2, 'S': 0}, - 'Y': {'C': 9, 'H': 11, 'N': 1, 'O': 3, 'S': 0} - } + formulas = { + "A": {"C": 3, "H": 7, "N": 1, "O": 2, "S": 0}, + "C": {"C": 3, "H": 7, "N": 1, "O": 2, "S": 1}, + "D": {"C": 4, "H": 7, "N": 1, "O": 4, "S": 0}, + "E": {"C": 5, "H": 9, "N": 1, "O": 4, "S": 0}, + "F": {"C": 9, "H": 11, "N": 1, "O": 2, "S": 0}, + "G": {"C": 2, "H": 5, "N": 1, "O": 2, "S": 0}, + "H": {"C": 6, "H": 9, "N": 3, "O": 2, "S": 0}, + "I": {"C": 6, "H": 13, "N": 1, "O": 2, "S": 0}, + "K": {"C": 6, "H": 14, "N": 2, "O": 2, "S": 0}, + "L": {"C": 6, "H": 13, "N": 1, "O": 2, "S": 0}, + "M": {"C": 5, "H": 11, "N": 1, "O": 2, "S": 1}, + "N": {"C": 4, "H": 8, "N": 2, "O": 3, "S": 0}, + "P": {"C": 5, "H": 9, "N": 1, "O": 2, "S": 0}, + "Q": {"C": 5, "H": 10, "N": 2, "O": 3, "S": 0}, + "R": {"C": 6, "H": 14, "N": 4, "O": 2, "S": 0}, + "S": {"C": 3, "H": 7, "N": 1, "O": 3, "S": 0}, + "T": {"C": 4, "H": 9, "N": 1, "O": 3, "S": 0}, + "V": {"C": 5, "H": 11, "N": 1, "O": 2, "S": 0}, + "W": {"C": 11, "H": 12, "N": 2, "O": 2, "S": 0}, + "Y": {"C": 9, "H": 11, "N": 1, "O": 3, "S": 0}, + } return formulas diff --git a/cpt_helical_wheel/plotWheels/descriptors.py b/cpt_helical_wheel/plotWheels/descriptors.py index 6f8c498..3b4e8d5 100644 --- a/cpt_helical_wheel/plotWheels/descriptors.py +++ b/cpt_helical_wheel/plotWheels/descriptors.py @@ -22,7 +22,14 @@ from scipy import stats from sklearn.externals.joblib import Parallel, delayed -from plotWheels.core import BaseDescriptor, load_scale, count_aas, aa_weights, aa_energies, aa_formulas +from plotWheels.core import ( + BaseDescriptor, + load_scale, + count_aas, + aa_weights, + aa_energies, + aa_formulas, +) __author__ = "Alex Müller, Gisela Gabernet" __docformat__ = "restructuredtext en" @@ -44,18 +51,27 @@ def _one_autocorr(seq, window, scale): # auto-correlation in defined sequence window seqdesc = list() for dist in range(window): # for all 
correlation distances - for val in range(len(scale['A'])): # for all features of the descriptor scale + for val in range( + len(scale["A"]) + ): # for all features of the descriptor scale valsum = list() - cntr = 0. + cntr = 0.0 for pos in range(len(seq)): # for every position in the sequence - if (pos + dist) < len(seq): # check if corr distance is possible at that sequence position + if (pos + dist) < len( + seq + ): # check if corr distance is possible at that sequence position cntr += 1 # counter to scale sum valsum.append(m[pos][val] * m[pos + dist][val]) - seqdesc.append(sum(valsum) / cntr) # append scaled correlation distance values + seqdesc.append( + sum(valsum) / cntr + ) # append scaled correlation distance values return seqdesc except ZeroDivisionError: - print("ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" % (window, seq)) - + print( + "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" + % (window, seq) + ) + def _one_crosscorr(seq, window, scale): """Private function used for calculating cross-correlated descriptors for 1 given sequence, window and an AA scale. @@ -72,24 +88,35 @@ def _one_crosscorr(seq, window, scale): m.append(scale[str(seq[l])]) # auto-correlation in defined sequence window seqdesc = list() - for val in range(len(scale['A'])): # for all features of the descriptor scale - for cc in range(len(scale['A'])): # for every feature cross correlation - if (val + cc) < len(scale['A']): # check if corr distance is in range of the num of features + for val in range(len(scale["A"])): # for all features of the descriptor scale + for cc in range(len(scale["A"])): # for every feature cross correlation + if (val + cc) < len( + scale["A"] + ): # check if corr distance is in range of the num of features for dist in range(window): # for all correlation distances cntr = float() valsum = list() - for pos in range(len(seq)): # for every position in the sequence - if (pos + dist) < len(seq): # check if corr distance is possible at that sequence pos + for pos in range( + len(seq) + ): # for every position in the sequence + if (pos + dist) < len( + seq + ): # check if corr distance is possible at that sequence pos cntr += 1 # counter to scale sum valsum.append(m[pos][val] * m[pos + dist][val + cc]) - seqdesc.append(sum(valsum) / cntr) # append scaled correlation distance values + seqdesc.append( + sum(valsum) / cntr + ) # append scaled correlation distance values return seqdesc except ZeroDivisionError: - print("ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" % (window, seq)) + print( + "ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" + % (window, seq) + ) def _one_arc(seq, modality, scale): - """ Privat function used for calculating arc descriptors for one sequence and AA scale. This function is used by + """Privat function used for calculating arc descriptors for one sequence and AA scale. This function is used by :py:func:`calculate_arc` method method of :py:class:`PeptideDescriptor`. 
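# --- Editor's sketch (illustrative annotation, not part of the patch). ---
# A hypothetical helper condensing the auto-correlation loop above for a
# one-dimensional scale: for every distance d below the window, average
# scale[seq[i]] * scale[seq[i + d]] over all positions i where i + d is
# still inside the sequence. A window longer than the sequence leaves the
# last distances with no pairs, raising ZeroDivisionError just like the
# error branch above.
def _autocorr_sketch(seq, window, scale):
    desc = []
    for d in range(window):
        vals = [scale[seq[i]] * scale[seq[i + d]] for i in range(len(seq) - d)]
        desc.append(sum(vals) / len(vals))  # scale by the number of pairs
    return desc

# h = {"K": -1.5, "L": 1.06, "A": 0.62}  # hypothetical 1-D scale excerpt
# _autocorr_sketch("KLAL", 2, h)  # -> [lag-0 average, lag-1 average]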
:param seq: {str} amino acid sequence to calculate descriptor for @@ -118,7 +145,7 @@ def _one_arc(seq, modality, scale): # loop through all windows for j in range(num_windows): # slices descriptor matrix into current window - window_mat = desc_mat[j:j + window, :] + window_mat = desc_mat[j : j + window, :] # defines order of amino acids in helical projection order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7] @@ -137,24 +164,39 @@ def _one_arc(seq, modality, scale): # loop through pharmacophoric features for m in range(desc_dim): - all_arcs = [] # stores all arcs that can be found of a pharmacophoric feature + all_arcs = ( + [] + ) # stores all arcs that can be found of a pharmacophoric feature arc = 0 - for n in range(18): # for all positions in helix, regardless of sequence length - if ordered[n, m] == 0: # if position does not contain pharmacophoric feature + for n in range( + 18 + ): # for all positions in helix, regardless of sequence length + if ( + ordered[n, m] == 0 + ): # if position does not contain pharmacophoric feature all_arcs.append(arc) # append previous arc to all arcs list arc = 0 # arc is initialized - elif ordered[n, m] == 1: # if position contains pharmacophoric feature(PF), elongate arc by 20° + elif ( + ordered[n, m] == 1 + ): # if position contains pharmacophoric feature(PF), elongate arc by 20° arc += 20 elif ordered[n, m] == 2: # if position doesn't contain amino acid: - if ordered[n - 1, m] == 1: # if previous position contained PF add 10° + if ( + ordered[n - 1, m] == 1 + ): # if previous position contained PF add 10° arc += 10 - elif ordered[n - 1, m] == 0: # if previous position didn't contain PF don't add anything + elif ( + ordered[n - 1, m] == 0 + ): # if previous position didn't contain PF don't add anything arc += 0 - elif ordered[ - n - 2, m] == 1: # if previous position is empty then check second previous for PF + elif ( + ordered[n - 2, m] == 1 + ): # if previous position is empty then check second previous for PF arc += 10 - if n == 17: # if we are at the last position check for position n=0 instead of next position. + if ( + n == 17 + ): # if we are at the last position check for position n=0 instead of next position. if ordered[0, m] == 1: # if it contains PF add 10° extra arc += 10 else: # if next position contains PF add 10° extra @@ -175,14 +217,18 @@ def _one_arc(seq, modality, scale): arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together all_arcs = [arc0] + all_arcs[1:] - window_arc.append(np.max(all_arcs)) # append to window arcs the maximum arc of this PF + window_arc.append( + np.max(all_arcs) + ) # append to window arcs the maximum arc of this PF allwindows_arc.append(window_arc) # append all PF arcs of this window allwindows_arc = np.asarray(allwindows_arc) - if modality == 'max': - final_arc = np.max(allwindows_arc, axis=0) # calculate maximum / mean arc along all windows - elif modality == 'mean': + if modality == "max": + final_arc = np.max( + allwindows_arc, axis=0 + ) # calculate maximum / mean arc along all windows + elif modality == "mean": final_arc = np.mean(allwindows_arc, axis=0) else: print('modality is unknown, please choose between "max" and "mean"\n.') @@ -203,17 +249,17 @@ def _charge(seq, ph=7.0, amide=False): :param amide: {boolean} whether the sequences have an amidated C-terminus. 
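# --- Editor's sketch (illustrative annotation, not part of the patch). ---
# A hypothetical helper showing the charge model used by _charge() below:
# every ionizable group contributes a Henderson-Hasselbalch fraction,
# 1 / (1 + 10**(pH - pK)) for basic groups and -1 / (1 + 10**(pK - pH))
# for acidic ones; the two termini count as one group each.
def _charge_sketch(seq, ph=7.0):
    pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04}
    neg_pks = {"Cterm": 2.15, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10}
    n = {aa: seq.count(aa) for aa in set(seq)}
    n["Nterm"] = n["Cterm"] = 1
    pos = sum(n.get(g, 0) / (1.0 + 10 ** (ph - pk)) for g, pk in pos_pks.items())
    neg = sum(n.get(g, 0) / (1.0 + 10 ** (pk - ph)) for g, pk in neg_pks.items())
    return pos - neg

# _charge_sketch("KKDE", ph=7.0)  # two Lys vs. Asp + Glu -> net charge near 0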
:return: {array} descriptor values in the attribute :py:attr:`descriptor """ - + if amide: - pos_pks = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} - neg_pks = {'Cterm': 15., 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} + pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04} + neg_pks = {"Cterm": 15.0, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10} else: - pos_pks = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04} - neg_pks = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10} - - aa_content = count_aas(seq, scale='absolute') - aa_content['Nterm'] = 1.0 - aa_content['Cterm'] = 1.0 + pos_pks = {"Nterm": 9.38, "K": 10.67, "R": 12.10, "H": 6.04} + neg_pks = {"Cterm": 2.15, "D": 3.71, "E": 4.15, "C": 8.14, "Y": 10.10} + + aa_content = count_aas(seq, scale="absolute") + aa_content["Nterm"] = 1.0 + aa_content["Cterm"] = 1.0 pos_charge = 0.0 for aa, pK in pos_pks.items(): c_r = 10 ** (pK - ph) @@ -254,7 +300,7 @@ def length(self, append=False): attribute :py:attr:`descriptor`. :return: array of sequence lengths in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor(['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR']) >>> desc.length() >>> desc.descriptor @@ -266,20 +312,20 @@ def length(self, append=False): desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('Length') + self.featurenames.append("Length") else: self.descriptor = np.array(desc) - self.featurenames = ['Length'] - + self.featurenames = ["Length"] + def formula(self, amide=False, append=False): """Method to calculate the molecular formula of every sequence in the attribute :py:attr:`sequences`. - + :param amide: {boolean} whether the sequences are C-terminally amidated. :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the attribute :py:attr:`descriptor`. :return: array of molecular formulas {str} in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor(['KADSFLSADGHSADFSLDKKLKERL', 'ERTILSDFPQWWFASLDFLNC', 'ACDEFGHIKLMNPQRSTVWY']) >>> desc.formula(amide=True) >>> for v in desc.descriptor: @@ -287,41 +333,48 @@ def formula(self, amide=False, append=False): C122 H197 N35 O39 C121 H168 N28 O33 S C106 H157 N29 O30 S2 - + .. seealso:: :py:func:`modlamp.core.aa_formulas()` - + .. 
versionadded:: v2.7.6 """ desc = [] formulas = aa_formulas() for seq in self.sequences: - f = {'C': 0, 'H': 0, 'N': 0, 'O': 0, 'S': 0} + f = {"C": 0, "H": 0, "N": 0, "O": 0, "S": 0} for aa in seq: # loop over all AAs for k in f.keys(): f[k] += formulas[aa][k] - + # substract H2O for every peptide bond - f['H'] -= 2 * (len(seq) - 1) - f['O'] -= (len(seq) - 1) - + f["H"] -= 2 * (len(seq) - 1) + f["O"] -= len(seq) - 1 + if amide: # add C-terminal amide --> replace OH with NH2 - f['O'] -= 1 - f['H'] += 1 - f['N'] += 1 - - if f['S'] != 0: - val = 'C%s H%s N%s O%s %s%s' % (f['C'], f['H'], f['N'], f['O'], 'S', f['S']) + f["O"] -= 1 + f["H"] += 1 + f["N"] += 1 + + if f["S"] != 0: + val = "C%s H%s N%s O%s %s%s" % ( + f["C"], + f["H"], + f["N"], + f["O"], + "S", + f["S"], + ) else: - val = 'C%s H%s N%s O%s' % (f['C'], f['H'], f['N'], f['O']) - + val = "C%s H%s N%s O%s" % (f["C"], f["H"], f["N"], f["O"]) + desc.append([val]) - + if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('Formula') + self.featurenames.append("Formula") else: self.descriptor = np.array(desc) - self.featurenames = ['Formula'] + self.featurenames = ["Formula"] def calculate_MW(self, amide=False, append=False): """Method to calculate the molecular weight [g/mol] of every sequence in the attribute :py:attr:`sequences`. @@ -331,7 +384,7 @@ def calculate_MW(self, amide=False, append=False): attribute :py:attr:`descriptor`. :return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('IAESFKGHIPL') >>> desc.calculate_MW(amide=True) >>> desc.descriptor @@ -347,17 +400,21 @@ def calculate_MW(self, amide=False, append=False): mw = [] for aa in seq: # sum over aa weights mw.append(weights[aa]) - desc.append(round(sum(mw) - 18.015 * (len(seq) - 1), 2)) # sum over AA MW and subtract H20 MW for every + desc.append( + round(sum(mw) - 18.015 * (len(seq) - 1), 2) + ) # sum over AA MW and subtract H20 MW for every # peptide bond desc = np.asarray(desc).reshape(len(desc), 1) - if amide: # if sequences are amidated, subtract 0.98 from calculated MW (OH - NH2) + if ( + amide + ): # if sequences are amidated, subtract 0.98 from calculated MW (OH - NH2) desc = [d - 0.98 for d in desc] if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('MW') + self.featurenames.append("MW") else: self.descriptor = np.array(desc) - self.featurenames = ['MW'] + self.featurenames = ["MW"] def calculate_charge(self, ph=7.0, amide=False, append=False): """Method to overall charge of every sequence in the attribute :py:attr:`sequences`. @@ -376,7 +433,7 @@ def calculate_charge(self, ph=7.0, amide=False, append=False): attribute :py:attr:`descriptor`. 
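# --- Editor's sketch (illustrative annotation, not part of the patch). ---
# A hypothetical helper showing the arithmetic in calculate_MW() below:
# sum the free amino-acid weights, subtract one water (18.015 g/mol) per
# peptide bond, and subtract a further 0.98 g/mol if the C-terminus is
# amidated (the -OH is replaced by -NH2).
def _mw_sketch(seq, weights, amide=False):
    mw = sum(weights[aa] for aa in seq) - 18.015 * (len(seq) - 1)
    return round(mw - 0.98, 2) if amide else round(mw, 2)

# weights = aa_weights()  # the weight table defined earlier in core.py
# _mw_sketch("GG", {"G": 75.067})  # 2 * 75.067 - 18.015 = 132.12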
:return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('KLAKFGKRSELVALSG') >>> desc.calculate_charge(ph=7.4, amide=True) >>> desc.descriptor @@ -385,14 +442,16 @@ def calculate_charge(self, ph=7.0, amide=False, append=False): desc = [] for seq in self.sequences: - desc.append(_charge(seq, ph, amide)) # calculate charge with helper function + desc.append( + _charge(seq, ph, amide) + ) # calculate charge with helper function desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('Charge') + self.featurenames.append("Charge") else: self.descriptor = np.array(desc) - self.featurenames = ['Charge'] + self.featurenames = ["Charge"] def charge_density(self, ph=7.0, amide=False, append=False): """Method to calculate the charge density (charge / MW) of every sequences in the attributes :py:attr:`sequences` @@ -403,7 +462,7 @@ def charge_density(self, ph=7.0, amide=False, append=False): attribute :py:attr:`descriptor`. :return: array of descriptor values in the attribute :py:attr:`descriptor`. :Example: - + >>> desc = GlobalDescriptor('GNSDLLIEQRTLLASDEF') >>> desc.charge_density(ph=6, amide=True) >>> desc.descriptor @@ -417,10 +476,10 @@ def charge_density(self, ph=7.0, amide=False, append=False): desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('ChargeDensity') + self.featurenames.append("ChargeDensity") else: self.descriptor = np.array(desc) - self.featurenames = ['ChargeDensity'] + self.featurenames = ["ChargeDensity"] def isoelectric_point(self, amide=False, append=False): """ @@ -436,7 +495,7 @@ def isoelectric_point(self, amide=False, append=False): attribute :py:attr:`descriptor`. :return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('KLFDIKFGHIPQRST') >>> desc.isoelectric_point() >>> desc.descriptor @@ -485,10 +544,10 @@ def isoelectric_point(self, amide=False, append=False): desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('pI') + self.featurenames.append("pI") else: self.descriptor = np.array(desc) - self.featurenames = ['pI'] + self.featurenames = ["pI"] def instability_index(self, append=False): """ @@ -500,27 +559,27 @@ def instability_index(self, append=False): attribute :py:attr:`descriptor`. :return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('LLASMNDLLAKRST') >>> desc.instability_index() >>> desc.descriptor array([[ 63.95714286]]) """ - + desc = [] - dimv = load_scale('instability')[1] + dimv = load_scale("instability")[1] for seq in self.sequences: stabindex = float() for i in range(len(seq) - 1): - stabindex += dimv[seq[i]][seq[i+1]] + stabindex += dimv[seq[i]][seq[i + 1]] desc.append((10.0 / len(seq)) * stabindex) desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('InstabilityInd') + self.featurenames.append("InstabilityInd") else: self.descriptor = np.array(desc) - self.featurenames = ['InstabilityInd'] + self.featurenames = ["InstabilityInd"] def aromaticity(self, append=False): """ @@ -531,7 +590,7 @@ def aromaticity(self, append=False): attribute :py:attr:`descriptor`. 
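# --- Editor's sketch (illustrative annotation, not part of the patch). ---
# A hypothetical helper restating instability_index() above (Guruprasad et
# al., 1990): sum the DIWV weight of every adjacent residue pair, then
# scale by 10 / sequence length. Values above ~40 are conventionally
# interpreted as "unstable".
def _instability_sketch(seq, dimv):
    pairs = zip(seq, seq[1:])  # all adjacent residue pairs
    return (10.0 / len(seq)) * sum(dimv[a][b] for a, b in pairs)

# dimv = load_scale("instability")[1]  # the pairwise table defined above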
:return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('GLFYWRFFLQRRFLYWW') >>> desc.aromaticity() >>> desc.descriptor @@ -539,17 +598,17 @@ def aromaticity(self, append=False): """ desc = [] for seq in self.sequences: - f = seq.count('F') - w = seq.count('W') - y = seq.count('Y') + f = seq.count("F") + w = seq.count("W") + y = seq.count("Y") desc.append(float(f + w + y) / len(seq)) desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('Aromaticity') + self.featurenames.append("Aromaticity") else: self.descriptor = np.array(desc) - self.featurenames = ['Aromaticity'] + self.featurenames = ["Aromaticity"] def aliphatic_index(self, append=False): """ @@ -562,7 +621,7 @@ def aliphatic_index(self, append=False): attribute :py:attr:`descriptor`. :return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('KWLKYLKKLAKLVK') >>> desc.aliphatic_index() >>> desc.descriptor @@ -572,15 +631,19 @@ def aliphatic_index(self, append=False): aa_dict = aa_weights() for seq in self.sequences: d = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa - d = {k: (float(d[k]) / len(seq)) * 100 for k in d.keys()} # get mole percent of all AA - desc.append(d['A'] + 2.9 * d['V'] + 3.9 * (d['I'] + d['L'])) # formula for calculating the AI (Ikai, 1980) + d = { + k: (float(d[k]) / len(seq)) * 100 for k in d.keys() + } # get mole percent of all AA + desc.append( + d["A"] + 2.9 * d["V"] + 3.9 * (d["I"] + d["L"]) + ) # formula for calculating the AI (Ikai, 1980) desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('AliphaticInd') + self.featurenames.append("AliphaticInd") else: self.descriptor = np.array(desc) - self.featurenames = ['AliphaticInd'] + self.featurenames = ["AliphaticInd"] def boman_index(self, append=False): """Method to calculate the boman index of every sequence in the attribute :py:attr:`sequences`. @@ -589,14 +652,14 @@ def boman_index(self, append=False): dividing by sequence length. ([1] H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106. [2] A. Radzick, R. Wolfenden, *Biochemistry* **1988**, *27*, 1664–1670.) - + .. seealso:: :py:func:`modlamp.core.aa_energies()` :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the attribute :py:attr:`descriptor`. :return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('GLFDIVKKVVGALGSL') >>> desc.boman_index() >>> desc.descriptor @@ -612,10 +675,10 @@ def boman_index(self, append=False): desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('BomanInd') + self.featurenames.append("BomanInd") else: self.descriptor = np.array(desc) - self.featurenames = ['BomanInd'] + self.featurenames = ["BomanInd"] def hydrophobic_ratio(self, append=False): """ @@ -626,7 +689,7 @@ def hydrophobic_ratio(self, append=False): attribute :py:attr:`descriptor`. 
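# --- Editor's sketch (illustrative annotation, not part of the patch). ---
# A hypothetical helper restating aliphatic_index() above (Ikai, 1980) on
# mole-percent composition: AI = X(A) + 2.9 * X(V) + 3.9 * (X(I) + X(L)).
def _aliphatic_sketch(seq):
    x = {aa: 100.0 * seq.count(aa) / len(seq) for aa in "AVIL"}
    return x["A"] + 2.9 * x["V"] + 3.9 * (x["I"] + x["L"])

# _aliphatic_sketch("AVIL")  # 25 + 2.9 * 25 + 3.9 * (25 + 25) = 292.5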
:return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('VALLYWRTVLLAIII') >>> desc.hydrophobic_ratio() >>> desc.descriptor @@ -637,24 +700,27 @@ def hydrophobic_ratio(self, append=False): for seq in self.sequences: pa = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa # formula for calculating the AI (Ikai, 1980): - desc.append((pa['A'] + pa['C'] + pa['F'] + pa['I'] + pa['L'] + pa['M'] + pa['V']) / float(len(seq))) + desc.append( + (pa["A"] + pa["C"] + pa["F"] + pa["I"] + pa["L"] + pa["M"] + pa["V"]) + / float(len(seq)) + ) desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) - self.featurenames.append('HydrophRatio') + self.featurenames.append("HydrophRatio") else: self.descriptor = np.array(desc) - self.featurenames = ['HydrophRatio'] + self.featurenames = ["HydrophRatio"] def calculate_all(self, ph=7.4, amide=True): """Method combining all global descriptors and appending them into the feature matrix in the attribute :py:attr:`descriptor`. - + :param ph: {float} pH at which to calculate peptide charge :param amide: {boolean} whether the sequences have an amidated C-terminus. :return: array of descriptor values in the attribute :py:attr:`descriptor` :Example: - + >>> desc = GlobalDescriptor('AFGHFKLKKLFIFGHERT') >>> desc.calculate_all(amide=True) >>> desc.featurenames @@ -663,7 +729,7 @@ def calculate_all(self, ph=7.4, amide=True): array([[ 18., 2.17559000e+03, 1.87167619e-03, 1.16757812e+01, ... 1.10555556e+00, 4.44444444e-01]]) >>> desc.save_descriptor('/path/to/outputfile.csv') # save the descriptor data (with feature names header) """ - + # This is a strange way of doing it. However, the append=True option excludes length and charge, no idea why! fn = [] self.length() # sequence length @@ -696,7 +762,7 @@ def calculate_all(self, ph=7.4, amide=True): self.hydrophobic_ratio() # Hydrophobic ratio hr = self.descriptor fn.extend(self.featurenames) - + self.descriptor = np.concatenate((l, mw, c, cd, pi, si, ar, ai, bi, hr), axis=1) self.featurenames = fn @@ -738,7 +804,7 @@ class PeptideDescriptor(BaseDescriptor): """ - def __init__(self, seqs, scalename='Eisenberg'): + def __init__(self, seqs, scalename="Eisenberg"): """ :param seqs: a .fasta file with sequences, a list of sequences or a single sequence as string to calculate the descriptor values for. @@ -788,7 +854,9 @@ def calculate_autocorr(self, window, append=False): .. versionchanged:: v.2.3.0 """ - desc = Parallel(n_jobs=-1)(delayed(_one_autocorr)(seq, window, self.scale) for seq in self.sequences) + desc = Parallel(n_jobs=-1)( + delayed(_one_autocorr)(seq, window, self.scale) for seq in self.sequences + ) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) @@ -811,14 +879,16 @@ def calculate_crosscorr(self, window, append=False): >>> AMP.descriptor.shape (1, 147) """ - desc = Parallel(n_jobs=-1)(delayed(_one_crosscorr)(seq, window, self.scale) for seq in self.sequences) + desc = Parallel(n_jobs=-1)( + delayed(_one_crosscorr)(seq, window, self.scale) for seq in self.sequences + ) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) else: self.descriptor = np.array(desc) - def calculate_moment(self, window=1000, angle=100, modality='max', append=False): + def calculate_moment(self, window=1000, angle=100, modality="max", append=False): """Method for calculating the maximum or mean moment of the amino acid values for a given descriptor scale and window. 
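# --- Editor's sketch (illustrative annotation, not part of the patch). ---
# A hypothetical helper anticipating calculate_moment() in the next hunk:
# for a one-dimensional scale and a window covering the whole sequence,
# the Eisenberg hydrophobic moment is
#   uH = sqrt((sum_i H_i cos(i*d))**2 + (sum_i H_i sin(i*d))**2) / N
# with d = 100 degrees, the per-residue turn of an ideal alpha-helix.
import numpy as np

def _moment_sketch(seq, scale, angle=100.0):
    h = np.array([scale[aa] for aa in seq])
    rads = np.deg2rad(angle) * np.arange(len(seq))
    return np.sqrt(h.dot(np.cos(rads)) ** 2 + h.dot(np.sin(rads)) ** 2) / len(seq)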
@@ -840,48 +910,56 @@ def calculate_moment(self, window=1000, angle=100, modality='max', append=False) >>> AMP.descriptor array([[ 0.48790226]]) """ - if self.scale['A'] == list: - print('\n Descriptor moment calculation is only possible for one dimensional descriptors.\n') - + if self.scale["A"] == list: + print( + "\n Descriptor moment calculation is only possible for one dimensional descriptors.\n" + ) + else: desc = [] for seq in self.sequences: - wdw = min(window, len(seq)) # if sequence is shorter than window, take the whole sequence instead + wdw = min( + window, len(seq) + ) # if sequence is shorter than window, take the whole sequence instead mtrx = [] mwdw = [] - + for aa in range(len(seq)): mtrx.append(self.scale[str(seq[aa])]) - + for i in range(len(mtrx) - wdw + 1): - mwdw.append(sum(mtrx[i:i + wdw], [])) - + mwdw.append(sum(mtrx[i : i + wdw], [])) + mwdw = np.asarray(mwdw) - rads = angle * (np.pi / 180) * np.asarray(range(wdw)) # calculate actual moment (radial) + rads = ( + angle * (np.pi / 180) * np.asarray(range(wdw)) + ) # calculate actual moment (radial) vcos = (mwdw * np.cos(rads)).sum(axis=1) vsin = (mwdw * np.sin(rads)).sum(axis=1) - moms = np.sqrt(vsin ** 2 + vcos ** 2) / wdw - - if modality == 'max': # take window with maximal value + moms = np.sqrt(vsin**2 + vcos**2) / wdw + + if modality == "max": # take window with maximal value moment = np.max(moms) - elif modality == 'mean': # take average value over all windows + elif modality == "mean": # take average value over all windows moment = np.mean(moms) - elif modality == 'all': + elif modality == "all": moment = moms else: - print('\nERROR!\nModality parameter is wrong, please choose between "all", "max" and "mean".\n') + print( + '\nERROR!\nModality parameter is wrong, please choose between "all", "max" and "mean".\n' + ) return desc.append(moment) self.all_moms.append(moms) - + desc = np.asarray(desc).reshape(len(desc), 1) # final descriptor array - + if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) else: self.descriptor = np.array(desc) - def calculate_global(self, window=1000, modality='max', append=False): + def calculate_global(self, window=1000, modality="max", append=False): """Method for calculating a global / window averaging descriptor value of a given AA scale :param window: {int} amino acid window in which to calculate the moment. 
If the sequence is shorter than the @@ -900,38 +978,48 @@ def calculate_global(self, window=1000, modality='max', append=False): """ desc = list() for n, seq in enumerate(self.sequences): - wdw = min(window, len(seq)) # if sequence is shorter than window, take the whole sequence instead + wdw = min( + window, len(seq) + ) # if sequence is shorter than window, take the whole sequence instead mtrx = [] mwdw = [] - + for l in range(len(seq)): # translate AA sequence into values mtrx.append(self.scale[str(seq[l])]) for i in range(len(mtrx) - wdw + 1): - mwdw.append(sum(mtrx[i:i + wdw], [])) # list of all the values for the different windows - + mwdw.append( + sum(mtrx[i : i + wdw], []) + ) # list of all the values for the different windows + mwdw = np.asarray(mwdw) glob = np.sum(mwdw, axis=1) / float(wdw) outglob = float() - - if modality in ['max', 'mean']: - if modality == 'max': - outglob = np.max(glob) # returned moment will be the maximum of all windows - elif modality == 'mean': - outglob = np.mean(glob) # returned moment will be the mean of all windows + + if modality in ["max", "mean"]: + if modality == "max": + outglob = np.max( + glob + ) # returned moment will be the maximum of all windows + elif modality == "mean": + outglob = np.mean( + glob + ) # returned moment will be the mean of all windows else: - print('Modality parameter is wrong, please choose between "max" and "mean"\n.') + print( + 'Modality parameter is wrong, please choose between "max" and "mean"\n.' + ) return desc.append(outglob) self.all_globs.append(glob) - + desc = np.asarray(desc).reshape(len(desc), 1) if append: self.descriptor = np.hstack((self.descriptor, np.array(desc))) else: self.descriptor = np.array(desc) - def calculate_profile(self, prof_type='uH', window=7, append=False): + def calculate_profile(self, prof_type="uH", window=7, append=False): """Method for calculating hydrophobicity or hydrophobic moment profiles for given sequences and fitting for slope and intercept. The hydrophobicity scale used is "eisenberg" @@ -948,23 +1036,27 @@ def calculate_profile(self, prof_type='uH', window=7, append=False): >>> AMP.descriptor array([[ 0.03731293, 0.19246599]]) """ - if prof_type == 'uH': + if prof_type == "uH": self.calculate_moment(window=window) y_vals = self.all_moms - elif prof_type == 'H': + elif prof_type == "H": self.calculate_global(window=window) y_vals = self.all_globs else: - print('prof_type parameter is unknown, choose "uH" for hydrophobic moment or "H" for hydrophobicity\n.') + print( + 'prof_type parameter is unknown, choose "uH" for hydrophobic moment or "H" for hydrophobicity\n.' + ) sys.exit() desc = list() for n, seq in enumerate(self.sequences): - x_vals = range(len(seq))[int((window - 1) / 2):-int((window - 1) / 2)] + x_vals = range(len(seq))[int((window - 1) / 2) : -int((window - 1) / 2)] if len(seq) <= window: slope, intercept, r_value, p_value, std_err = [0, 0, 0, 0, 0] else: - slope, intercept, r_value, p_value, std_err = stats.linregress(x_vals, y_vals[n]) + slope, intercept, r_value, p_value, std_err = stats.linregress( + x_vals, y_vals[n] + ) desc.append([slope, intercept]) if append: @@ -973,8 +1065,8 @@ def calculate_profile(self, prof_type='uH', window=7, append=False): self.descriptor = np.array(desc) def calculate_arc(self, modality="max", append=False): - """ Method for calculating property arcs as seen in the helical wheel plot. Use for binary amino acid scales only. - + """Method for calculating property arcs as seen in the helical wheel plot. 
Use for binary amino acid scales only. + :param modality: modality of the arc to calculate, to choose between "max" and "mean". :param append: if true, append to current descriptor stored in the descriptor attribute. :return: calculated descriptor as numpy.array in the descriptor attribute. @@ -986,7 +1078,9 @@ def calculate_arc(self, modality="max", append=False): >>> arc.descriptor array([[200, 160, 160, 0, 0]]) """ - desc = Parallel(n_jobs=-1)(delayed(_one_arc)(seq, modality, self.scale) for seq in self.sequences) + desc = Parallel(n_jobs=-1)( + delayed(_one_arc)(seq, modality, self.scale) for seq in self.sequences + ) # Converts each of the amino acids to descriptor vector for seq in self.sequences: @@ -995,13 +1089,13 @@ def calculate_arc(self, modality="max", append=False): # for aa in seq: # desc_mat.append(self.scale[aa]) # desc_mat = np.asarray(desc_mat) - # + # # # Check descriptor dimension # desc_dim = desc_mat.shape[1] - # + # # # list to store descriptor values for all windows # allwindows_arc = [] - # + # # if len(seq) > 18: # window = 18 # # calculates number of windows in sequence @@ -1009,15 +1103,15 @@ def calculate_arc(self, modality="max", append=False): # else: # window = len(seq) # num_windows = 1 - # + # # # loop through all windows # for j in range(num_windows): # # slices descriptor matrix into current window # window_mat = desc_mat[j:j + window, :] - # + # # # defines order of amino acids in helical projection # order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7] - # + # # # orders window descriptor matrix into helical projection order # ordered = [] # for pos in order: @@ -1027,14 +1121,14 @@ def calculate_arc(self, modality="max", append=False): # # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions # ordered.append([2] * desc_dim) # ordered = np.asarray(ordered) - # + # # window_arc = [] - # + # # # loop through pharmacophoric features # for m in range(desc_dim): # all_arcs = [] # stores all arcs that can be found of a pharmacophoric feature # arc = 0 - # + # # for n in range(18): # for all positions in helix, regardless of sequence length # if ordered[n, m] == 0: # if position does not contain pharmacophoric feature # all_arcs.append(arc) # append previous arc to all arcs list @@ -1064,17 +1158,17 @@ def calculate_arc(self, modality="max", append=False): # else: # if ordered[n + 2, m] == 1: # arc += 10 - # + # # all_arcs.append(arc) # if not arc == 360: # arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together # all_arcs = [arc0] + all_arcs[1:] - # + # # window_arc.append(np.max(all_arcs)) # append to window arcs the maximum arc of this PF # allwindows_arc.append(window_arc) # append all PF arcs of this window - # + # # allwindows_arc = np.asarray(allwindows_arc) - # + # # if modality == 'max': # final_arc = np.max(allwindows_arc, axis=0) # calculate maximum / mean arc along all windows # elif modality == 'mean': @@ -1087,11 +1181,3 @@ def calculate_arc(self, modality="max", append=False): self.descriptor = np.hstack((self.descriptor, np.array(desc))) else: self.descriptor = np.array(desc) - - - - - - - - diff --git a/cpt_helical_wheel/plotWheels/helical_wheel.py b/cpt_helical_wheel/plotWheels/helical_wheel.py index b4d6b56..eb9a8bf 100644 --- a/cpt_helical_wheel/plotWheels/helical_wheel.py +++ b/cpt_helical_wheel/plotWheels/helical_wheel.py @@ -1,19 +1,33 @@ import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.lines as lines import matplotlib.patches as 
patches import matplotlib.pyplot as plt -#from mpl_toolkits.mplot3d import Axes3D + +# from mpl_toolkits.mplot3d import Axes3D import numpy as np from scipy.stats.kde import gaussian_kde from plotWheels.core import load_scale from plotWheels.descriptors import PeptideDescriptor -def helical_wheel(sequence, colorcoding='rainbow', text_color=None, - lineweights=True, filename=None, seq=False, moment=False, - seqRange=1, t_size=32, rot=float(90), dpi=150, numbering=True): + +def helical_wheel( + sequence, + colorcoding="rainbow", + text_color=None, + lineweights=True, + filename=None, + seq=False, + moment=False, + seqRange=1, + t_size=32, + rot=float(90), + dpi=150, + numbering=True, +): """A function to project a given peptide sequence onto a helical wheel plot. It can be useful to illustrate the properties of alpha-helices, like positioning of charged and hydrophobic residues along the sequence. @@ -48,59 +62,280 @@ def helical_wheel(sequence, colorcoding='rainbow', text_color=None, .. versionadded:: v2.1.5 """ # color mappings - aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] + aa = [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + ] if colorcoding == type(str): - f_rainbow = ['#3e3e28', '#ffcc33', '#b30047', '#b30047', '#ffcc33', '#3e3e28', '#80d4ff', '#ffcc33', '#0047b3', - '#ffcc33', '#ffcc33', '#b366ff', '#29a329', '#b366ff', '#0047b3', '#ff66cc', '#ff66cc', '#ffcc33', - '#ffcc33', '#ffcc33'] - f_charge = ['#000000', '#000000', '#ff4d94', '#ff4d94', '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff', - '#000000', '#000000', '#000000', '#000000', '#000000', '#80d4ff', '#000000', '#000000', '#000000', - '#000000', '#000000'] - f_polar = ['#000000', '#000000', '#80d4ff', '#80d4ff', '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff', - '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff', '#80d4ff', '#80d4ff', '#80d4ff', '#000000', - '#000000', '#000000'] - f_simple = ['#ffcc33', '#ffcc33', '#0047b3', '#0047b3', '#ffcc33', '#7f7f7f', '#0047b3', '#ffcc33', '#0047b3', - '#ffcc33', '#ffcc33', '#0047b3', '#ffcc33', '#0047b3', '#0047b3', '#0047b3', '#0047b3', '#ffcc33', - '#ffcc33', '#ffcc33'] - f_none = ['#ffffff'] * 20 - f_amphi = ['#ffcc33', '#29a329', '#b30047', '#b30047', '#f79318', '#80d4ff', '#0047b3', '#ffcc33', '#0047b3', - '#ffcc33', '#ffcc33', '#80d4ff', '#29a329', '#80d4ff', '#0047b3', '#80d4ff', '#80d4ff', '#ffcc33', - '#f79318', '#f79318'] - t_rainbow = ['w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'w', 'k', 'k', 'k', 'k', 'k', 'w', 'k', 'k', 'k', 'k', 'k'] - t_charge = ['w', 'w', 'k', 'k', 'w', 'w', 'k', 'w', 'k', 'w', 'w', 'w', 'w', 'w', 'k', 'w', 'w', 'w', 'w', 'w'] - t_polar = ['w', 'w', 'k', 'k', 'w', 'w', 'k', 'w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'k', 'k', 'w', 'w', 'w'] - t_simple = ['k', 'k', 'w', 'w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'k', 'k', 'w', 'w', 'w', 'w', 'k', 'k', 'k'] - t_none = ['k'] * 20 - t_amphi = ['k', 'k', 'w', 'w', 'w', 'k', 'w', 'k', 'w', 'k', 'k', 'k', 'w', 'k', 'w', 'k', 'k', 'k', 'w', 'w'] - d_eisberg = load_scale('eisenberg')[1] # eisenberg hydrophobicity values for HM + f_rainbow = [ + "#3e3e28", + "#ffcc33", + "#b30047", + "#b30047", + "#ffcc33", + "#3e3e28", + "#80d4ff", + "#ffcc33", + "#0047b3", + "#ffcc33", + "#ffcc33", + "#b366ff", + "#29a329", + "#b366ff", + "#0047b3", + "#ff66cc", + "#ff66cc", + "#ffcc33", + "#ffcc33", + "#ffcc33", + ] + f_charge = [ + "#000000", + "#000000", + 
"#ff4d94", + "#ff4d94", + "#000000", + "#000000", + "#80d4ff", + "#000000", + "#80d4ff", + "#000000", + "#000000", + "#000000", + "#000000", + "#000000", + "#80d4ff", + "#000000", + "#000000", + "#000000", + "#000000", + "#000000", + ] + f_polar = [ + "#000000", + "#000000", + "#80d4ff", + "#80d4ff", + "#000000", + "#000000", + "#80d4ff", + "#000000", + "#80d4ff", + "#000000", + "#000000", + "#80d4ff", + "#000000", + "#80d4ff", + "#80d4ff", + "#80d4ff", + "#80d4ff", + "#000000", + "#000000", + "#000000", + ] + f_simple = [ + "#ffcc33", + "#ffcc33", + "#0047b3", + "#0047b3", + "#ffcc33", + "#7f7f7f", + "#0047b3", + "#ffcc33", + "#0047b3", + "#ffcc33", + "#ffcc33", + "#0047b3", + "#ffcc33", + "#0047b3", + "#0047b3", + "#0047b3", + "#0047b3", + "#ffcc33", + "#ffcc33", + "#ffcc33", + ] + f_none = ["#ffffff"] * 20 + f_amphi = [ + "#ffcc33", + "#29a329", + "#b30047", + "#b30047", + "#f79318", + "#80d4ff", + "#0047b3", + "#ffcc33", + "#0047b3", + "#ffcc33", + "#ffcc33", + "#80d4ff", + "#29a329", + "#80d4ff", + "#0047b3", + "#80d4ff", + "#80d4ff", + "#ffcc33", + "#f79318", + "#f79318", + ] + t_rainbow = [ + "w", + "k", + "w", + "w", + "k", + "w", + "k", + "k", + "w", + "k", + "k", + "k", + "k", + "k", + "w", + "k", + "k", + "k", + "k", + "k", + ] + t_charge = [ + "w", + "w", + "k", + "k", + "w", + "w", + "k", + "w", + "k", + "w", + "w", + "w", + "w", + "w", + "k", + "w", + "w", + "w", + "w", + "w", + ] + t_polar = [ + "w", + "w", + "k", + "k", + "w", + "w", + "k", + "w", + "k", + "w", + "w", + "k", + "w", + "k", + "k", + "k", + "k", + "w", + "w", + "w", + ] + t_simple = [ + "k", + "k", + "w", + "w", + "k", + "w", + "w", + "k", + "w", + "k", + "k", + "k", + "k", + "w", + "w", + "w", + "w", + "k", + "k", + "k", + ] + t_none = ["k"] * 20 + t_amphi = [ + "k", + "k", + "w", + "w", + "w", + "k", + "w", + "k", + "w", + "k", + "k", + "k", + "w", + "k", + "w", + "k", + "k", + "k", + "w", + "w", + ] + d_eisberg = load_scale("eisenberg")[1] # eisenberg hydrophobicity values for HM else: f_custom = colorcoding t_custom = text_color - d_eisberg = load_scale('eisenberg')[1] + d_eisberg = load_scale("eisenberg")[1] if lineweights: - lw = np.arange(0.1, 5.5, 5. / (len(sequence) - 1)) # line thickness array + lw = np.arange(0.1, 5.5, 5.0 / (len(sequence) - 1)) # line thickness array lw = lw[::-1] # inverse order else: - lw = [2.] * (len(sequence) - 1) + lw = [2.0] * (len(sequence) - 1) # check which color coding to use if colorcoding == type(str): - if colorcoding == 'rainbow': + if colorcoding == "rainbow": df = dict(zip(aa, f_rainbow)) dt = dict(zip(aa, t_rainbow)) - elif colorcoding == 'charge': + elif colorcoding == "charge": df = dict(zip(aa, f_charge)) dt = dict(zip(aa, t_charge)) - elif colorcoding == 'polar': + elif colorcoding == "polar": df = dict(zip(aa, f_polar)) dt = dict(zip(aa, t_polar)) - elif colorcoding == 'simple': + elif colorcoding == "simple": df = dict(zip(aa, f_simple)) dt = dict(zip(aa, t_simple)) - elif colorcoding == 'none': + elif colorcoding == "none": df = dict(zip(aa, f_none)) dt = dict(zip(aa, t_none)) - elif colorcoding == 'amphipathic': + elif colorcoding == "amphipathic": df = dict(zip(aa, f_amphi)) dt = dict(zip(aa, t_amphi)) else: @@ -110,103 +345,197 @@ def helical_wheel(sequence, colorcoding='rainbow', text_color=None, else: df = dict(zip(aa, f_custom)) dt = dict(zip(aa, t_custom)) - + # degree to radian - deg = np.arange(float(len(sequence))) * -100. 
+ deg = np.arange(float(len(sequence))) * -100.0 deg = [d + rot for d in deg] # start at 270 degree in unit circle (on top) rad = np.radians(deg) - + # dict for coordinates and eisenberg values - d_hydro = dict(zip(rad, [0.] * len(rad))) - + d_hydro = dict(zip(rad, [0.0] * len(rad))) + # create figure fig = plt.figure(frameon=False, figsize=(10, 10)) ax = fig.add_subplot(111) old = None hm = list() - + # iterate over sequence for i, r in enumerate(rad): new = (np.cos(r), np.sin(r)) # new AA coordinates if i < 18: # plot the connecting lines if old is not None: - line = lines.Line2D((old[0], new[0]), (old[1], new[1]), transform=ax.transData, color='k', - linewidth=lw[i - 1]) + line = lines.Line2D( + (old[0], new[0]), + (old[1], new[1]), + transform=ax.transData, + color="k", + linewidth=lw[i - 1], + ) line.set_zorder(1) # 1 = level behind circles ax.add_line(line) elif 17 < i < 36: - line = lines.Line2D((old[0], new[0]), (old[1], new[1]), transform=ax.transData, color='k', - linewidth=lw[i - 1]) + line = lines.Line2D( + (old[0], new[0]), + (old[1], new[1]), + transform=ax.transData, + color="k", + linewidth=lw[i - 1], + ) line.set_zorder(1) # 1 = level behind circles ax.add_line(line) new = (np.cos(r) * 1.2, np.sin(r) * 1.2) elif i == 36: - line = lines.Line2D((old[0], new[0]), (old[1], new[1]), transform=ax.transData, color='k', - linewidth=lw[i - 1]) + line = lines.Line2D( + (old[0], new[0]), + (old[1], new[1]), + transform=ax.transData, + color="k", + linewidth=lw[i - 1], + ) line.set_zorder(1) # 1 = level behind circles ax.add_line(line) new = (np.cos(r) * 1.4, np.sin(r) * 1.4) else: new = (np.cos(r) * 1.4, np.sin(r) * 1.4) - + # plot circles - circ = patches.Circle(new, radius=0.125, transform=ax.transData, edgecolor='k', facecolor=df[sequence[i]]) + circ = patches.Circle( + new, + radius=0.125, + transform=ax.transData, + edgecolor="k", + facecolor=df[sequence[i]], + ) circ.set_zorder(2) # level in front of lines ax.add_patch(circ) - + # check if N- or C-terminus and add subscript, then plot AA letter if numbering: size = t_size if i == 0: - ax.text(new[0], new[1], sequence[i] + '$_N$', va='center', ha='center', transform=ax.transData, - size=size, color=dt[sequence[i]], fontweight='bold') + ax.text( + new[0], + new[1], + sequence[i] + "$_N$", + va="center", + ha="center", + transform=ax.transData, + size=size, + color=dt[sequence[i]], + fontweight="bold", + ) elif i == len(sequence) - 1: - ax.text(new[0], new[1], sequence[i] + '$_C$', va='center', ha='center', transform=ax.transData, - size=size, color=dt[sequence[i]], fontweight='bold') + ax.text( + new[0], + new[1], + sequence[i] + "$_C$", + va="center", + ha="center", + transform=ax.transData, + size=size, + color=dt[sequence[i]], + fontweight="bold", + ) else: seqRange += 1 - ax.text(new[0], new[1], sequence[i] + '$_{'+str(seqRange)+'}$', va='center', ha='center', transform=ax.transData, - size=size, color=dt[sequence[i]], fontweight='bold') + ax.text( + new[0], + new[1], + sequence[i] + "$_{" + str(seqRange) + "}$", + va="center", + ha="center", + transform=ax.transData, + size=size, + color=dt[sequence[i]], + fontweight="bold", + ) eb = d_eisberg[sequence[i]][0] # eisenberg value for this AA - hm.append([eb * new[0], eb * new[1]]) # save eisenberg hydrophobicity vector value to later calculate HM - + hm.append( + [eb * new[0], eb * new[1]] + ) # save eisenberg hydrophobicity vector value to later calculate HM + old = (np.cos(r), np.sin(r)) # save as previous coordinates else: size = t_size if i == 0: - ax.text(new[0], new[1], 
sequence[i] + '$_N$', va='center', ha='center', transform=ax.transData, - size=size, color=dt[sequence[i]], fontweight='bold') + ax.text( + new[0], + new[1], + sequence[i] + "$_N$", + va="center", + ha="center", + transform=ax.transData, + size=size, + color=dt[sequence[i]], + fontweight="bold", + ) elif i == len(sequence) - 1: - ax.text(new[0], new[1], sequence[i] + '$_C$', va='center', ha='center', transform=ax.transData, - size=size, color=dt[sequence[i]], fontweight='bold') + ax.text( + new[0], + new[1], + sequence[i] + "$_C$", + va="center", + ha="center", + transform=ax.transData, + size=size, + color=dt[sequence[i]], + fontweight="bold", + ) else: - ax.text(new[0], new[1], sequence[i], va='center', ha='center', transform=ax.transData, - size=size, color=dt[sequence[i]], fontweight='bold') + ax.text( + new[0], + new[1], + sequence[i], + va="center", + ha="center", + transform=ax.transData, + size=size, + color=dt[sequence[i]], + fontweight="bold", + ) eb = d_eisberg[sequence[i]][0] # eisenberg value for this AA - hm.append([eb * new[0], eb * new[1]]) # save eisenberg hydrophobicity vector value to later calculate HM - + hm.append( + [eb * new[0], eb * new[1]] + ) # save eisenberg hydrophobicity vector value to later calculate HM + old = (np.cos(r), np.sin(r)) # save as previous coordinates - + # draw hydrophobic moment arrow if moment option if moment: v_hm = np.sum(np.array(hm), 0) - x = .0333 * v_hm[0] - y = .0333 * v_hm[1] - ax.arrow(0., 0., x, y, head_width=0.04, head_length=0.03, transform=ax.transData, - color='k', linewidth=6.) + x = 0.0333 * v_hm[0] + y = 0.0333 * v_hm[1] + ax.arrow( + 0.0, + 0.0, + x, + y, + head_width=0.04, + head_length=0.03, + transform=ax.transData, + color="k", + linewidth=6.0, + ) desc = PeptideDescriptor(sequence) # calculate hydrophobic moment desc.calculate_moment() - if abs(x) < 0.2 and y > 0.: # right positioning of HM text so arrow does not cover it + if ( + abs(x) < 0.2 and y > 0.0 + ): # right positioning of HM text so arrow does not cover it z = -0.2 else: z = 0.2 - plt.text(0., z, str(round(desc.descriptor[0][0], 3)), fontdict={'fontsize': 20, 'fontweight': 'bold', - 'ha': 'center'}) - + plt.text( + 0.0, + z, + str(round(desc.descriptor[0][0], 3)), + fontdict={"fontsize": 20, "fontweight": "bold", "ha": "center"}, + ) + # plot shape if len(sequence) < 19: ax.set_xlim(-1.2, 1.2) @@ -214,18 +543,18 @@ def helical_wheel(sequence, colorcoding='rainbow', text_color=None, else: ax.set_xlim(-1.4, 1.4) ax.set_ylim(-1.4, 1.4) - ax.spines['right'].set_visible(False) - ax.spines['top'].set_visible(False) - ax.spines['left'].set_visible(False) - ax.spines['bottom'].set_visible(False) + ax.spines["right"].set_visible(False) + ax.spines["top"].set_visible(False) + ax.spines["left"].set_visible(False) + ax.spines["bottom"].set_visible(False) cur_axes = plt.gca() cur_axes.axes.get_xaxis().set_visible(False) cur_axes.axes.get_yaxis().set_visible(False) plt.tight_layout() - + if seq: - plt.title(sequence, fontweight='bold', fontsize=20) - + plt.title(sequence, fontweight="bold", fontsize=20) + # show or save plot if filename: plt.savefig(filename, dpi=dpi) diff --git a/cpt_intersect_adj/intersect_and_adjacent.py b/cpt_intersect_adj/intersect_and_adjacent.py index 0d38ad1..674c9dc 100755 --- a/cpt_intersect_adj/intersect_and_adjacent.py +++ b/cpt_intersect_adj/intersect_and_adjacent.py @@ -9,12 +9,14 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) + def validFeat(rec): for feat in rec.features: - if feat.type != 'remark' and 
feat.type != 'annotation': - return True + if feat.type != "remark" and feat.type != "annotation": + return True return False + def treeFeatures(features, window): for feat in features: # Interval(begin, end, data) @@ -23,10 +25,12 @@ def treeFeatures(features, window): int(feat.location.end) + int(window), feat.id, ) + + def treeFeatures_noRem(features, window): for feat in features: - if feat.type == 'remark' or feat.type == 'annotation': - continue + if feat.type == "remark" or feat.type == "annotation": + continue # Interval(begin, end, data) yield Interval( int(feat.location.start) - int(window), @@ -42,167 +46,208 @@ def intersect(a, b, window, stranding): rec_b_out = [] maxLen = min(len(rec_a), len(rec_b)) iterate = 0 - if maxLen > 0: while iterate < maxLen: - rec_a_i = rec_a[iterate] - rec_b_i = rec_b[iterate] + rec_a_i = rec_a[iterate] + rec_b_i = rec_b[iterate] - if (not validFeat(rec_a_i)) or (not validFeat(rec_b_i)): - rec_a_out.append(SeqRecord(rec_a[iterate].seq, rec_a[iterate].id, rec_a[iterate].name, rec_a[iterate].description, rec_a[iterate].dbxrefs, [], rec_a[iterate].annotations)) - rec_b_out.append(SeqRecord(rec_b[iterate].seq, rec_b[iterate].id, rec_b[iterate].name, rec_b[iterate].description, rec_b[iterate].dbxrefs, [], rec_b[iterate].annotations)) - iterate += 1 - continue + if (not validFeat(rec_a_i)) or (not validFeat(rec_b_i)): + rec_a_out.append( + SeqRecord( + rec_a[iterate].seq, + rec_a[iterate].id, + rec_a[iterate].name, + rec_a[iterate].description, + rec_a[iterate].dbxrefs, + [], + rec_a[iterate].annotations, + ) + ) + rec_b_out.append( + SeqRecord( + rec_b[iterate].seq, + rec_b[iterate].id, + rec_b[iterate].name, + rec_b[iterate].description, + rec_b[iterate].dbxrefs, + [], + rec_b[iterate].annotations, + ) + ) + iterate += 1 + continue - a_neg = [] - a_pos = [] - b_neg = [] - b_pos = [] - tree_a = [] - tree_b = [] - if stranding == True: - for feat in rec_a_i.features: - if feat.type == 'remark' or feat.type == 'annotation': - continue - if feat.strand > 0: - a_pos.append( - Interval( - int(feat.location.start) - int(window), - int(feat.location.end) + int(window), - feat.id, + a_neg = [] + a_pos = [] + b_neg = [] + b_pos = [] + tree_a = [] + tree_b = [] + if stranding == True: + for feat in rec_a_i.features: + if feat.type == "remark" or feat.type == "annotation": + continue + if feat.strand > 0: + a_pos.append( + Interval( + int(feat.location.start) - int(window), + int(feat.location.end) + int(window), + feat.id, + ) ) - ) - else: - a_neg.append( - Interval( - int(feat.location.start) - int(window), - int(feat.location.end) + int(window), - feat.id, + else: + a_neg.append( + Interval( + int(feat.location.start) - int(window), + int(feat.location.end) + int(window), + feat.id, + ) ) - ) - for feat in rec_b_i.features: - if feat.type == 'remark' or feat.type == 'annotation': - continue - if feat.strand > 0: - b_pos.append( + for feat in rec_b_i.features: + if feat.type == "remark" or feat.type == "annotation": + continue + if feat.strand > 0: + b_pos.append( + Interval( + int(feat.location.start) - int(window), + int(feat.location.end) + int(window), + feat.id, + ) + ) + else: + b_neg.append( + Interval( + int(feat.location.start) - int(window), + int(feat.location.end) + int(window), + feat.id, + ) + ) + + else: + for feat in rec_a_i.features: + if feat.type == "remark" or feat.type == "annotation": + continue + tree_a.append( Interval( int(feat.location.start) - int(window), int(feat.location.end) + int(window), feat.id, ) ) - else: - b_neg.append( + 
for feat in rec_b_i.features: + if feat.type == "remark" or feat.type == "annotation": + continue + tree_b.append( Interval( int(feat.location.start) - int(window), int(feat.location.end) + int(window), feat.id, ) ) + if stranding: + # builds interval tree from Interval objects of form (start, end, id) for each feature + # tree_a = IntervalTree(list(treeFeatures_noRem(rec_a_i.features, window))) + # tree_b = IntervalTree(list(treeFeatures_noRem(rec_b_i.features, window))) + # else: + tree_a_pos = IntervalTree(a_pos) + tree_a_neg = IntervalTree(a_neg) + tree_b_pos = IntervalTree(b_pos) + tree_b_neg = IntervalTree(b_neg) + else: + tree_a = IntervalTree(tree_a) + tree_b = IntervalTree(tree_b) - else: - for feat in rec_a_i.features: - if feat.type == 'remark' or feat.type == 'annotation': - continue - tree_a.append( - Interval( - int(feat.location.start) - int(window), - int(feat.location.end) + int(window), - feat.id, - ) - ) - for feat in rec_b_i.features: - if feat.type == 'remark' or feat.type == 'annotation': - continue - tree_b.append( - Interval( - int(feat.location.start) - int(window), - int(feat.location.end) + int(window), - feat.id, - ) - ) - if stranding: - # builds interval tree from Interval objects of form (start, end, id) for each feature - # tree_a = IntervalTree(list(treeFeatures_noRem(rec_a_i.features, window))) - #tree_b = IntervalTree(list(treeFeatures_noRem(rec_b_i.features, window))) - #else: - tree_a_pos = IntervalTree(a_pos) - tree_a_neg = IntervalTree(a_neg) - tree_b_pos = IntervalTree(b_pos) - tree_b_neg = IntervalTree(b_neg) - else: - tree_a = IntervalTree(tree_a) - tree_b = IntervalTree(tree_b) - - - # Used to map ids back to features later - rec_a_map = {f.id: f for f in rec_a_i.features} - rec_b_map = {f.id: f for f in rec_b_i.features} - - rec_a_hits_in_b = [] - rec_b_hits_in_a = [] - - for feature in rec_a_i.features: - # Save each feature in rec_a that overlaps a feature in rec_b - # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end))) - - if feature.type == "remark" or feature.type == "annotation": - continue - - if stranding == False: - hits = tree_b[int(feature.location.start) : int(feature.location.end)] - - - # feature id is saved in interval result.data, use map to get full feature - for hit in hits: - rec_a_hits_in_b.append(rec_b_map[hit.data]) + # Used to map ids back to features later + rec_a_map = {f.id: f for f in rec_a_i.features} + rec_b_map = {f.id: f for f in rec_b_i.features} - else: - if feature.strand > 0: - hits_pos = tree_b_pos[ - int(feature.location.start) : int(feature.location.end) - ] - for hit in hits_pos: - rec_a_hits_in_b.append(rec_b_map[hit.data]) - else: - hits_neg = tree_b_neg[ + rec_a_hits_in_b = [] + rec_b_hits_in_a = [] + + for feature in rec_a_i.features: + # Save each feature in rec_a that overlaps a feature in rec_b + # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end))) + + if feature.type == "remark" or feature.type == "annotation": + continue + + if stranding == False: + hits = tree_b[ int(feature.location.start) : int(feature.location.end) ] - for hit in hits_neg: - rec_a_hits_in_b.append(rec_b_map[hit.data]) - for feature in rec_b_i.features: - if feature.type == "remark" or feature.type == "annotation": - continue + # feature id is saved in interval result.data, use map to get full feature + for hit in hits: + rec_a_hits_in_b.append(rec_b_map[hit.data]) - if stranding == False: - hits = tree_a[int(feature.location.start) : int(feature.location.end)] + 
else: + if feature.strand > 0: + hits_pos = tree_b_pos[ + int(feature.location.start) : int(feature.location.end) + ] + for hit in hits_pos: + rec_a_hits_in_b.append(rec_b_map[hit.data]) + else: + hits_neg = tree_b_neg[ + int(feature.location.start) : int(feature.location.end) + ] + for hit in hits_neg: + rec_a_hits_in_b.append(rec_b_map[hit.data]) - # feature id is saved in interval result.data, use map to get full feature - for hit in hits: - rec_b_hits_in_a.append(rec_a_map[hit.data]) + for feature in rec_b_i.features: + if feature.type == "remark" or feature.type == "annotation": + continue - else: - if feature.strand > 0: - hits_pos = tree_a_pos[ + if stranding == False: + hits = tree_a[ int(feature.location.start) : int(feature.location.end) ] - for hit in hits_pos: + + # feature id is saved in interval result.data, use map to get full feature + for hit in hits: rec_b_hits_in_a.append(rec_a_map[hit.data]) + else: - hits_neg = tree_a_neg[ - int(feature.location.start) : int(feature.location.end) - ] - for hit in hits_neg: - rec_b_hits_in_a.append(rec_a_map[hit.data]) + if feature.strand > 0: + hits_pos = tree_a_pos[ + int(feature.location.start) : int(feature.location.end) + ] + for hit in hits_pos: + rec_b_hits_in_a.append(rec_a_map[hit.data]) + else: + hits_neg = tree_a_neg[ + int(feature.location.start) : int(feature.location.end) + ] + for hit in hits_neg: + rec_b_hits_in_a.append(rec_a_map[hit.data]) + + # Remove duplicate features using sets + rec_a_out.append( + SeqRecord( + rec_a[iterate].seq, + rec_a[iterate].id, + rec_a[iterate].name, + rec_a[iterate].description, + rec_a[iterate].dbxrefs, + sorted(set(rec_a_hits_in_b), key=lambda feat: feat.location.start), + rec_a[iterate].annotations, + ) + ) + rec_b_out.append( + SeqRecord( + rec_b[iterate].seq, + rec_b[iterate].id, + rec_b[iterate].name, + rec_b[iterate].description, + rec_b[iterate].dbxrefs, + sorted(set(rec_b_hits_in_a), key=lambda feat: feat.location.start), + rec_b[iterate].annotations, + ) + ) + iterate += 1 - # Remove duplicate features using sets - rec_a_out.append(SeqRecord(rec_a[iterate].seq, rec_a[iterate].id, rec_a[iterate].name, rec_a[iterate].description, rec_a[iterate].dbxrefs, sorted(set(rec_a_hits_in_b), key=lambda feat: feat.location.start), rec_a[iterate].annotations)) - rec_b_out.append(SeqRecord(rec_b[iterate].seq, rec_b[iterate].id, rec_b[iterate].name, rec_b[iterate].description, rec_b[iterate].dbxrefs, sorted(set(rec_b_hits_in_a), key=lambda feat: feat.location.start), rec_b[iterate].annotations)) - iterate += 1 - else: # If one input is empty, output two empty result files. 
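
The reworked loop above pads each feature by `window` on both sides and then lets intervaltree's slice lookup do the overlap test, per strand when `stranding` is set (as an aside, `if stranding == True:` could simply be `if stranding:`, matching the later branch). A self-contained sketch of that query pattern, with toy coordinates:

from intervaltree import Interval, IntervalTree

window = 100
# one padded Interval per feature; the data field carries the feature id
tree_b_pos = IntervalTree([Interval(200 - window, 800 + window, "featB1")])

# slice syntax returns every stored Interval overlapping the query span
hits = tree_b_pos[850:1200]
print([hit.data for hit in hits])  # -> ['featB1'], thanks to the window padding
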
rec_a_out = [SeqRecord(Seq(""), "none")] @@ -235,8 +280,8 @@ def intersect(a, b, window, stranding): with open(args.oa, "w") as handle: for rec in a: - gffWrite([rec], handle) + gffWrite([rec], handle) with open(args.ob, "w") as handle: for rec in b: - gffWrite([rec], handle) + gffWrite([rec], handle) diff --git a/cpt_intron_detect/gff3.py b/cpt_intron_detect/gff3.py index d4795d4..48496c3 100755 --- a/cpt_intron_detect/gff3.py +++ b/cpt_intron_detect/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_intron_detect/intron_detection.py b/cpt_intron_detect/intron_detection.py index 698de28..73a42d0 100755 --- a/cpt_intron_detect/intron_detection.py +++ b/cpt_intron_detect/intron_detection.py @@ -17,7 +17,7 @@ def parse_xml(blastxml, thresh): - """ Parses xml file to get desired info (genes, hits, etc) """ + """Parses xml file to get desired info (genes, hits, etc)""" blast = [] discarded_records = 0 totLen = 0 @@ -64,7 +64,7 @@ def parse_xml(blastxml, thresh): def filter_lone_clusters(clusters): - """ Removes all clusters with only one member and those with no hits """ + """Removes all clusters with only one member and those with no hits""" filtered_clusters = {} for key in clusters: if len(clusters[key]) > 1 and len(key) > 0: @@ -78,7 +78,7 @@ def test_true(feature, **kwargs): def parse_gff(gff3): - """ Extracts strand and start location to be used in cluster filtering """ + """Extracts strand and start location to be used in cluster filtering""" log.debug("parse_gff3") gff_info = {} _rec = None @@ -113,12 +113,12 @@ def parse_gff(gff3): def all_same(genes_list): - """ Returns True if all gene names in cluster are identical """ + """Returns True if all gene names in cluster are identical""" return all(gene["name"] == genes_list[0]["name"] for gene in genes_list[1:]) def remove_duplicates(clusters): - """ Removes clusters with multiple members but only one gene name """ + """Removes clusters with multiple members but only one gene name""" filtered_clusters = {} for key in clusters: if all_same(clusters[key]): @@ -130,7 +130,7 @@ def remove_duplicates(clusters): class IntronFinder(object): - """ IntronFinder objects are lists that contain a list of hits for every gene """ + """IntronFinder objects are lists that contain a list of hits for every gene""" def __init__(self, gff3, blastp, thresh): self.blast = [] @@ -142,7 +142,7 @@ def __init__(self, gff3, blastp, thresh): self.blast = parse_xml(blastp, thresh) def create_clusters(self): - """ Finds 2 or more genes with matching hits """ + """Finds 2 or more genes with matching hits""" clusters = {} for gene in self.blast: for hit in gene: @@ -163,7 +163,7 @@ def create_clusters(self): self.clusters = filter_lone_clusters(clusters) def check_strand(self): - """ filters clusters for genes on the same strand """ + """filters clusters for genes on the same strand""" filtered_clusters = {} for key in self.clusters: pos_strand = [] @@ -423,10 +423,14 @@ def output_gff3(self, clusters): # And we attach the things properly. 
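
Stepping back to intron_detection.py: per the docstrings above, create_clusters groups genes that share a BLAST hit and filter_lone_clusters then drops singletons. A toy sketch of that grouping; the field names are hypothetical, since parse_xml's real record layout sits outside this hunk:

from collections import defaultdict

blast = [  # one list of hits per gene, roughly as parse_xml produces
    [{"name": "gp01", "hit": "terminase_hitA"}],
    [{"name": "gp07", "hit": "terminase_hitA"}],
    [{"name": "gp12", "hit": "portal_hitB"}],
]
clusters = defaultdict(list)
for gene in blast:
    for hit in gene:
        clusters[hit["hit"]].append(hit["name"])

# keep only multi-member clusters, as filter_lone_clusters does
clusters = {k: v for k, v in clusters.items() if len(v) > 1}
print(clusters)  # -> {'terminase_hitA': ['gp01', 'gp07']}
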
mRNA.sub_features = cdss - mRNA.location = FeatureLocation(mRNA.location.start, mRNA.location.end, cds.location.strand) + mRNA.location = FeatureLocation( + mRNA.location.start, mRNA.location.end, cds.location.strand + ) gene.sub_features = [mRNA] - gene.location = FeatureLocation(gene.location.start, gene.location.end, cds.location.strand) - + gene.location = FeatureLocation( + gene.location.start, gene.location.end, cds.location.strand + ) + # And append to our record rec.features.append(gene) return rec diff --git a/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BiopythonTranslator.py b/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BiopythonTranslator.py index af6aa92..dbd362b 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BiopythonTranslator.py +++ b/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BiopythonTranslator.py @@ -96,7 +96,7 @@ def compute_feature_label_link_color(self, feature): def compute_filtered_features(self, features): """Return the list of features minus the ignored ones. - + By the method keeps any feature whose type is not in ignored_features_types and for which all filter(f) pass """ diff --git a/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BiopythonTranslatorBase.py b/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BiopythonTranslatorBase.py index 7fbba3c..40176ad 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BiopythonTranslatorBase.py +++ b/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BiopythonTranslatorBase.py @@ -23,6 +23,7 @@ class BiopythonTranslatorBase: A function (feature)=> properties_dict """ + graphic_record_parameters = {} def __init__(self, features_filters=(), features_properties=None): @@ -40,11 +41,11 @@ def translate_feature(self, feature): box_color=self.compute_feature_box_color(feature), linewidth=self.compute_feature_linewidth(feature), label_link_color=self.compute_feature_label_link_color(feature), - legend_text=self.compute_feature_legend_text(feature) + legend_text=self.compute_feature_legend_text(feature), ) if self.features_properties is not None: other_properties = self.features_properties - if hasattr(other_properties, '__call__'): + if hasattr(other_properties, "__call__"): other_properties = other_properties(feature) properties.update(other_properties) @@ -77,7 +78,7 @@ def translate_record(self, record, record_class=None): if record_class in classes: record_class = classes[record_class] - if isinstance(record, str) or hasattr(record, 'read'): + if isinstance(record, str) or hasattr(record, "read"): record = load_record(record) filtered_features = self.compute_filtered_features(record.features) return record_class( @@ -90,7 +91,7 @@ def translate_record(self, record, record_class=None): ], **self.graphic_record_parameters ) - + @classmethod def quick_class_plot(cls, record, figure_width=12, **kwargs): """Allows super quick and dirty plotting of Biopython records. @@ -104,7 +105,7 @@ def quick_class_plot(cls, record, figure_width=12, **kwargs): graphic_record = cls().translate_record(record) ax, _ = graphic_record.plot(figure_width=figure_width, **kwargs) return ax - + def quick_plot(self, record, figure_width=12, **kwargs): """Allows super quick and dirty plotting of Biopython records. 
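
On the FeatureLocation change earlier in this hunk group: Biopython's FeatureLocation takes the strand as its third positional argument, so rebuilding the location is how output_gff3 propagates the CDS strand up to the mRNA and gene. A minimal illustration:

from Bio.SeqFeature import SeqFeature, FeatureLocation

cds = SeqFeature(FeatureLocation(120, 480, -1), type="CDS")
# rebuild the parent location so it inherits the CDS strand
gene = SeqFeature(FeatureLocation(100, 500, cds.location.strand), type="gene")
print(gene.location.strand)  # -> -1
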
diff --git a/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BlackBoxlessLabelTranslator.py b/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BlackBoxlessLabelTranslator.py index 1c308b8..a9a9472 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BlackBoxlessLabelTranslator.py +++ b/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/BlackBoxlessLabelTranslator.py @@ -1,8 +1,9 @@ from .BiopythonTranslator import BiopythonTranslator + class BlackBoxlessLabelTranslator(BiopythonTranslator): """Translates Biopython records into GraphicRecords where annotations - appear black on a white background with no box. Which can be cleaner.""" + appear black on a white background with no box. Which can be cleaner.""" def compute_feature_box_linewidth(self, feature): """Return 0 as this translator doesn't show a box.""" @@ -10,4 +11,4 @@ def compute_feature_box_linewidth(self, feature): def compute_feature_box_color(self, feature): """Return white.""" - return "white" \ No newline at end of file + return "white" diff --git a/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/__init__.py b/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/__init__.py index 1051e78..c0ceb03 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/__init__.py +++ b/cpt_linear_genome_plot/dna_features_viewer/BiopythonTranslator/__init__.py @@ -1,4 +1,4 @@ from .BiopythonTranslator import BiopythonTranslator from .BlackBoxlessLabelTranslator import BlackBoxlessLabelTranslator -__all__ = ["BiopythonTranslator", "BlackBoxlessLabelTranslator"] \ No newline at end of file +__all__ = ["BiopythonTranslator", "BlackBoxlessLabelTranslator"] diff --git a/cpt_linear_genome_plot/dna_features_viewer/CircularGraphicRecord/ArrowWedge.py b/cpt_linear_genome_plot/dna_features_viewer/CircularGraphicRecord/ArrowWedge.py index 993d155..b7b4721 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/CircularGraphicRecord/ArrowWedge.py +++ b/cpt_linear_genome_plot/dna_features_viewer/CircularGraphicRecord/ArrowWedge.py @@ -35,20 +35,16 @@ class ArrowWedge(mpatches.Wedge): indirect sense (-1) or no sense at all (0) """ - def __init__( - self, center, radius, theta1, theta2, width, direction=+1, **kwargs - ): + def __init__(self, center, radius, theta1, theta2, width, direction=+1, **kwargs): self.direction = direction self.radius = radius - mpatches.Wedge.__init__( - self, center, radius, theta1, theta2, width, **kwargs - ) + mpatches.Wedge.__init__(self, center, radius, theta1, theta2, width, **kwargs) self._recompute_path() def _recompute_path(self): """Recompute the full path forming the "tick" arrowed wedge - + This method overwrites "mpatches.Wedge._recompute_path" in the super-class. 
""" @@ -66,27 +62,19 @@ def _recompute_path(self): inner_arc = arc.vertices * (1 - normalized_arrow_width) arrow_vertices = [ outer_arc[-1], - np.array( - [np.cos(np.deg2rad(theta1)), np.sin(np.deg2rad(theta1))] - ), + np.array([np.cos(np.deg2rad(theta1)), np.sin(np.deg2rad(theta1))]), inner_arc[0], ] else: angle_start_arrow = theta2 - arrow_angle arc = mpatches.Path.arc(theta1, angle_start_arrow) - outer_arc = ( - arc.vertices * (self.radius + self.width / 2.0) / self.radius - ) + outer_arc = arc.vertices * (self.radius + self.width / 2.0) / self.radius inner_arc = ( - arc.vertices[::-1] - * (self.radius - self.width / 2.0) - / self.radius + arc.vertices[::-1] * (self.radius - self.width / 2.0) / self.radius ) arrow_vertices = [ outer_arc[-1], - np.array( - [np.cos(np.deg2rad(theta2)), np.sin(np.deg2rad(theta2))] - ), + np.array([np.cos(np.deg2rad(theta2)), np.sin(np.deg2rad(theta2))]), inner_arc[0], ] p = np.vstack([outer_arc, arrow_vertices, inner_arc]) diff --git a/cpt_linear_genome_plot/dna_features_viewer/CircularGraphicRecord/CircularGraphicRecord.py b/cpt_linear_genome_plot/dna_features_viewer/CircularGraphicRecord/CircularGraphicRecord.py index d2fbd3f..d56d9b5 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/CircularGraphicRecord/CircularGraphicRecord.py +++ b/cpt_linear_genome_plot/dna_features_viewer/CircularGraphicRecord/CircularGraphicRecord.py @@ -62,8 +62,7 @@ def __init__( self.labels_spacing = labels_spacing def initialize_ax(self, ax, draw_line, with_ruler): - """Initialize the ax with a circular line, sets limits, aspect etc. - """ + """Initialize the ax with a circular line, sets limits, aspect etc.""" if draw_line: circle = mpatches.Circle( @@ -91,18 +90,13 @@ def finalize_ax( annotations_max_level, auto_figure_height=False, ideal_yspan=None, - annotations_are_elevated=True + annotations_are_elevated=True, ): """Final display range and figure dimension tweakings.""" - annotation_height = self.determine_annotation_height( - annotations_max_level - ) - ymin = -2 * self.radius - self.feature_level_height * ( - features_levels + 1 - ) - ymax = ( - self.feature_level_height * (features_levels + 1) - + annotation_height * (annotations_max_level + 1) + annotation_height = self.determine_annotation_height(annotations_max_level) + ymin = -2 * self.radius - self.feature_level_height * (features_levels + 1) + ymax = self.feature_level_height * (features_levels + 1) + annotation_height * ( + annotations_max_level + 1 ) if ideal_yspan is not None: ymax = max(annotation_height * ideal_yspan + ymin, ymax) @@ -146,14 +140,11 @@ def position_to_angle(self, position): return 90 - a def coordinates_in_plot(self, position, level): - """Convert a sequence position and height level to (x, y) coordinates. - """ + """Convert a sequence position and height level to (x, y) coordinates.""" r = self.radius + level * self.feature_level_height angle = self.position_to_angle(position) rad_angle = np.deg2rad(angle) - return np.array( - [r * np.cos(rad_angle), r * np.sin(rad_angle) - self.radius] - ) + return np.array([r * np.cos(rad_angle), r * np.sin(rad_angle) - self.radius]) def determine_annotation_height(self, max_annotations_level): """Auto-select the annotations height. 
@@ -165,7 +156,7 @@ def determine_annotation_height(self, max_annotations_level): return min(0.25, 3.0 * self.radius / (1.0 + max_annotations_level)) def compute_padding(self, ax): - "" + """""" ax_width = ax.get_window_extent(ax.figure.canvas.get_renderer()).width xmin, xmax = ax.get_xlim() - return 3 * self.labels_spacing * (xmax - xmin) / (1.0 * ax_width) \ No newline at end of file + return 3 * self.labels_spacing * (xmax - xmin) / (1.0 * ax_width) diff --git a/cpt_linear_genome_plot/dna_features_viewer/GraphicFeature.py b/cpt_linear_genome_plot/dna_features_viewer/GraphicFeature.py index f3a4385..97cfa8e 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/GraphicFeature.py +++ b/cpt_linear_genome_plot/dna_features_viewer/GraphicFeature.py @@ -1,5 +1,6 @@ from copy import deepcopy + class GraphicFeature: """Genetic Feature to be plotted. @@ -94,9 +95,7 @@ def __init__( self.box_linewidth = box_linewidth self.box_color = box_color self.label_link_color = label_link_color - self.fontdict = dict( - [("fontsize", 11)] + list((fontdict or {}).items()) - ) + self.fontdict = dict([("fontsize", 11)] + list((fontdict or {}).items())) self.html = html self.open_left = open_left self.open_right = open_right @@ -129,8 +128,7 @@ def crop(self, window): return copy def overlaps_with(self, other): - """Return whether the feature's location overlaps with feature `other` - """ + """Return whether the feature's location overlaps with feature `other`""" loc1, loc2 = (self.start, self.end), (other.start, other.end) loc1, loc2 = sorted(loc1), sorted(loc2) loc1, loc2 = sorted([loc1, loc2], key=lambda loc: loc[0]) diff --git a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/BokehPlottableMixin.py b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/BokehPlottableMixin.py index 793f1a5..ea0108e 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/BokehPlottableMixin.py +++ b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/BokehPlottableMixin.py @@ -78,7 +78,7 @@ def plot_with_bokeh(self, figure_width=5, figure_height="auto", tools="auto"): height = int(0.5 * height) else: height = 100 * figure_height - height = max(height, 185) # Minimal height to see all icons + height = max(height, 185) # Minimal height to see all icons max_y = max( [data["annotation_y"] for f, data in plot_data.items()] diff --git a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/GraphicRecord.py b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/GraphicRecord.py index 3c0d2f8..4ee175e 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/GraphicRecord.py +++ b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/GraphicRecord.py @@ -37,24 +37,24 @@ class GraphicRecord(MatplotlibPlottableMixin, BokehPlottableMixin): Indicates which standard to use to show nucleotide indices in the plots. If 'biopython', the standard python indexing is used (starting at 0). If 'genbank', the indexing follows the Genbank standard (starting at 1). - + labels_spacing Number of pixels that will "pad" every labels to force some horizontal space between two labels or between a label and the borders of a feature. - + ticks_resolution Leave to "auto" for an auto-selected number of ticks on the ruler, or set - to e.g. 50 for a tick every 50 nucleotide. + to e.g. 50 for a tick every 50 nucleotide. Attributes ---------- default_font_family Default font to use for a feature that doesn't declare a font. 
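
GraphicFeature.overlaps_with above packs interval overlap into two sorts and one strict comparison; distilled:

def overlaps(a, b):
    a, b = sorted(a), sorted(b)                       # tolerate (end, start) input
    first, second = sorted([a, b], key=lambda t: t[0])
    return first[1] > second[0]

assert overlaps((10, 50), (40, 90))
assert not overlaps((10, 50), (50, 90))  # strict '>': touching ends don't overlap
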
- + default_ruler_color Default ruler color to use when no color is given at plot() time. - + default_box_color Default box color for non-inline annotations. If set to None, no boxes will be drawn unless the features declare a box_color. @@ -81,7 +81,7 @@ def __init__( first_index=0, plots_indexing="biopython", labels_spacing=8, - ticks_resolution='auto' + ticks_resolution="auto", ): if sequence_length is None: sequence_length = len(sequence) @@ -147,7 +147,7 @@ def crop(self, window): first_index=start, plots_indexing=self.plots_indexing, labels_spacing=self.labels_spacing, - ticks_resolution=self.ticks_resolution + ticks_resolution=self.ticks_resolution, ) def determine_annotation_height(self, levels): @@ -159,8 +159,7 @@ def determine_annotation_height(self, levels): return self.feature_level_height def coordinates_in_plot(self, x, level): - """Convert a sequence position and height level into a (x, y) position. - """ + """Convert a sequence position and height level into a (x, y) position.""" return (x, level * self.feature_level_height) def split_overflowing_features_circularly(self): diff --git a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/MatplotlibPlottableMixin.py b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/MatplotlibPlottableMixin.py index 6163e94..7ebd198 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/MatplotlibPlottableMixin.py +++ b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/MatplotlibPlottableMixin.py @@ -14,9 +14,7 @@ from .SequenceAndTranslationMixin import SequenceAndTranslationMixin -class MatplotlibPlottableMixin( - MultilinePlottableMixin, SequenceAndTranslationMixin -): +class MatplotlibPlottableMixin(MultilinePlottableMixin, SequenceAndTranslationMixin): """Class mixin for matplotlib-related methods.""" default_elevate_outline_annotations = False @@ -71,16 +69,16 @@ def finalize_ax( annotations_are_elevated=True, ): """Prettify the figure with some last changes. - + Changes include redefining y-bounds and figure height. Parameters ========== ax ax on which the record was plotted - + features_levels - + annotations_max_level Number indicating to the method the maximum height for an annotation, so the method can set ymax accordingly @@ -160,9 +158,7 @@ def plot_feature(self, ax, feature, level, linewidth=1.0): head_length = 0.001 else: width_pixel = self._get_ax_width(ax, unit="pixel") - head_length = ( - 0.5 * width_pixel * feature.length / self.sequence_length - ) + head_length = 0.5 * width_pixel * feature.length / self.sequence_length head_length = min(head_length, 0.6 * feature.thickness) arrowstyle = mpatches.ArrowStyle.Simple( @@ -279,20 +275,20 @@ def place_annotation( max_label_length, indicate_strand_in_label=False, ): - """"Place an annotation in the figure. Decide on inline vs. outline. - + """ "Place an annotation in the figure. Decide on inline vs. outline. + Parameters ---------- feature Graphic feature to place in the figure - + ax Matplotlib ax in which to place the feature. - + level level at which the annotation should be placed - + annotate_inline If true, the plotter will attempt to annotate inline, and fall back to outline annotation. @@ -392,11 +388,11 @@ def plot( All features and annotations will be pushed up by "level_offset". Can be useful when plotting several sets of features successively on a same ax. - + elevate_outline_annotations If true, every text annotation will be above every feature. If false, text annotations will be as close as possible to the features. 
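
Two notes on the MatplotlibPlottableMixin hunks above. First, the reformatted opener `""" "Place an annotation...` faithfully preserves a stray quotation mark from the original quadruple-quoted docstring; dropping the extra quote would be a one-character follow-up fix. Second, plot_feature's arrow-head rule scales the head with the feature's rendered width but caps it at 60% of the feature thickness; worked through with hypothetical numbers:

width_pixel = 600.0      # assumed rendered axis width, in pixels
feature_length = 300     # nucleotides covered by the feature
sequence_length = 3000
thickness = 14.0
head_length = 0.5 * width_pixel * feature_length / sequence_length  # 30.0
head_length = min(head_length, 0.6 * thickness)                     # capped at 8.4
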
- + strand_in_label_pixel_threshold Number N such that, when provided, every feature with a graphical width in pixels below N will have its strand indicated in the label @@ -404,7 +400,7 @@ def plot( x_lim Horizontal axis limits to be set at the end. - + sequence_params parameters for plot_sequence """ @@ -421,9 +417,7 @@ def plot( for f in features_levels: features_levels[f] += level_offset max_level = ( - 1 - if (features_levels == {}) - else max(1, max(features_levels.values())) + 1 if (features_levels == {}) else max(1, max(features_levels.values())) ) auto_figure_height = (ax is None) and (figure_height is None) if ax is None: @@ -457,13 +451,7 @@ def strand_in_label(f): self.plot_feature(ax=ax, feature=feature, level=level) if feature.label is None: continue - ( - text, - overflowing, - nlines, - (x1, x2,), - height, - ) = self.place_annotation( + (text, overflowing, nlines, (x1, x2,), height,) = self.place_annotation( feature=feature, ax=ax, level=level, @@ -510,9 +498,7 @@ def strand_in_label(f): is_base=True, ) overflowing_annotations.append(base_feature) - annotations_levels = compute_features_levels( - overflowing_annotations - ) + annotations_levels = compute_features_levels(overflowing_annotations) else: for f in self.features: f.data.update(dict(nlines=1, fixed_level=features_levels[f])) @@ -525,9 +511,7 @@ def strand_in_label(f): max_annotations_level = max([0] + list(annotations_levels.values())) annotation_height = self.determine_annotation_height(max_level) - annotation_height = max( - self.min_y_height_of_text_line, annotation_height - ) + annotation_height = max(self.min_y_height_of_text_line, annotation_height) labels_data = {} for feature, level in annotations_levels.items(): if "is_base" in feature.data: @@ -577,7 +561,9 @@ def plot_legend( if text is None: continue parameters = dict( - label=text, facecolor=feature.color, edgecolor="black", + label=text, + facecolor=feature.color, + edgecolor="black", ) if include_edge: parameters.update( @@ -589,23 +575,16 @@ def plot_legend( if text in features_parameters: previous_parameters = features_parameters[text] if (not allow_ambiguity) and any( - [ - parameters[k] != previous_parameters[k] - for k in parameters - ] + [parameters[k] != previous_parameters[k] for k in parameters] ): - raise ValueError( - "Cannot generate an unambiguous legend as two" - ) + raise ValueError("Cannot generate an unambiguous legend as two") continue features_parameters[text] = parameters handles.append(Patch(**parameters)) ax.legend(handles=handles, **legend_kwargs) -def change_luminosity( - color, luminosity=None, min_luminosity=None, factor=None -): +def change_luminosity(color, luminosity=None, min_luminosity=None, factor=None): """Return a version of the color with different luminosity. Parameters @@ -620,7 +599,7 @@ def change_luminosity( Only used if `luminosity` is not set. Positive factors increase luminosity and negative factors decrease it. More precisely, the luminosity of the new color is L^(-factor), where L is the current - luminosity, between 0 and 1. + luminosity, between 0 and 1. 
""" r, g, b = colorConverter.to_rgb(color) h, l, s = colorsys.rgb_to_hls(r, g, b) diff --git a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/MultilinePlottableMixin.py b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/MultilinePlottableMixin.py index a8f3021..40d07ba 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/MultilinePlottableMixin.py +++ b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/MultilinePlottableMixin.py @@ -20,11 +20,11 @@ def plot_on_multiple_lines( n_lines Number of lines on which the record will be plotted. A number of nucleotides per line can be provided instead (see below). - + nucl_per_line Number of nucleotides to be represented on every line (determines the number of lines ``n_lines``). - + plot_sequence Whether to plot the nucleotide sequence on each line @@ -38,7 +38,7 @@ def plot_on_multiple_lines( of the individual lines. This includes ``draw_line``, ``with_ruler``, ``annotate_inline``, ``plot_sequence``, ``evelate_outline_annotations``, ``strand_in_label_pixel_threshold`` - + Returns ------- @@ -103,7 +103,7 @@ def plot_on_multiple_pages( """Plot the features on different lines on different pages of a PDF. This function returns None - + Parameters ---------- @@ -113,14 +113,14 @@ def plot_on_multiple_pages( n_lines Number of lines on which the record will be plotted. A number of nucleotides per line can be provided instead (see below). - + nucl_per_line Number of nucleotides to be represented on every line (determines the number of lines ``n_lines``). - + lines_per_page Number of lines on each page - + plot_sequence Whether to plot the nucleotide sequence on each line diff --git a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/SequenceAndTranslationMixin.py b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/SequenceAndTranslationMixin.py index a3cb715..024082c 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/SequenceAndTranslationMixin.py +++ b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/SequenceAndTranslationMixin.py @@ -98,7 +98,7 @@ def plot_translation( translation Sequence of amino acids either as a string ``'MAKG...'`` or as a list ``['Met', 'Ala', ...]`` - + """ start, end = location[0], location[1] @@ -140,6 +140,4 @@ def plot_translation( zorder=-10000, ) if guides_intensity: - ax.axvline( - end - 0.5, linewidth=0.1, color=guides_color, zorder=-10000 - ) + ax.axvline(end - 0.5, linewidth=0.1, color=guides_color, zorder=-10000) diff --git a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/__init__.py b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/__init__.py index 8de163b..3eb1c56 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/__init__.py +++ b/cpt_linear_genome_plot/dna_features_viewer/GraphicRecord/__init__.py @@ -1,3 +1,3 @@ from .GraphicRecord import GraphicRecord -__all__ = ['GraphicRecord'] +__all__ = ["GraphicRecord"] diff --git a/cpt_linear_genome_plot/dna_features_viewer/biotools.py b/cpt_linear_genome_plot/dna_features_viewer/biotools.py index c48506b..a59e114 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/biotools.py +++ b/cpt_linear_genome_plot/dna_features_viewer/biotools.py @@ -11,9 +11,7 @@ class GFF: def parse(*a): """Not available. 
Please install bcbio-gff.""" - raise ImportError( - "Please install the bcbio-gff library to parse GFF data" - ) + raise ImportError("Please install the bcbio-gff library to parse GFF data") def complement(dna_sequence): @@ -37,8 +35,7 @@ def reverse_complement(sequence): aa_short_to_long_form_dict = { - _aa1: _aa3[0] + _aa3[1:].lower() - for (_aa1, _aa3) in zip(aa1 + "*", aa3 + ["*"]) + _aa1: _aa3[0] + _aa3[1:].lower() for (_aa1, _aa3) in zip(aa1 + "*", aa3 + ["*"]) } @@ -93,10 +90,10 @@ def extract_graphical_translation(sequence, location, long_form=False): def load_record(path): - """Load a Genbank file """ + """Load a Genbank file""" if isinstance(path, str): # Input is a file path - if path.lower().endswith('.gff'): + if path.lower().endswith(".gff"): return list(GFF.parse(path))[0] else: return SeqIO.read(path, "genbank") @@ -110,11 +107,7 @@ def load_record(path): def annotate_biopython_record( - seqrecord, - location="full", - feature_type="misc_feature", - margin=0, - **qualifiers + seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers ): """Add a feature to a Biopython SeqRecord. diff --git a/cpt_linear_genome_plot/dna_features_viewer/compute_features_levels.py b/cpt_linear_genome_plot/dna_features_viewer/compute_features_levels.py index 6c19b35..77e3fcc 100755 --- a/cpt_linear_genome_plot/dna_features_viewer/compute_features_levels.py +++ b/cpt_linear_genome_plot/dna_features_viewer/compute_features_levels.py @@ -46,16 +46,13 @@ def compute_features_levels(features): if f1.overlaps_with(f2) ] graph = Graph(features, edges) - levels = { - n: n.data.get("fixed_level", None) - for n in graph.nodes - } + levels = {n: n.data.get("fixed_level", None) for n in graph.nodes} def collision(node, level): """Return whether the node placed at base_level collides with its neighbors in the graph.""" line_factor = 0.5 - nlines = node.data.get("nlines", 1) + nlines = node.data.get("nlines", 1) for neighbor in graph.neighbors[node]: neighbor_level = levels[neighbor] if neighbor_level is None: diff --git a/cpt_linear_genome_plot/linear_genome_plot.py b/cpt_linear_genome_plot/linear_genome_plot.py index 3e77682..3249317 100755 --- a/cpt_linear_genome_plot/linear_genome_plot.py +++ b/cpt_linear_genome_plot/linear_genome_plot.py @@ -9,6 +9,7 @@ import sys import argparse + class CPTTranslator(BiopythonTranslator): """ This is a customized translator from the dna_features_viewer module to fit Galaxy @@ -25,16 +26,28 @@ class CPTTranslator(BiopythonTranslator): def compute_feature_color(self, feature): if feature.type == "CDS": if "product" in feature.qualifiers: - color_specific = any(re.search(("(\\b"+str(item)+"\\b)"),feature.qualifiers["product"][0]) for item in custom_name_colors.keys()) or any(re.search((item),feature.qualifiers["product"][0]) for item in custom_name_colors.keys()) + color_specific = any( + re.search( + ("(\\b" + str(item) + "\\b)"), feature.qualifiers["product"][0] + ) + for item in custom_name_colors.keys() + ) or any( + re.search((item), feature.qualifiers["product"][0]) + for item in custom_name_colors.keys() + ) if color_specific: try: return custom_name_colors[feature.qualifiers["product"][0]] except KeyError: for item in custom_name_colors.keys(): if item in feature.qualifiers["product"][0]: - custom_name_colors[feature.qualifiers["product"][0]] = custom_name_colors[item] - return custom_name_colors[feature.qualifiers["product"][0]] - #print(feature.qualifiers["product"][0]) + custom_name_colors[ + feature.qualifiers["product"][0] + ] = 
custom_name_colors[item] + return custom_name_colors[ + feature.qualifiers["product"][0] + ] + # print(feature.qualifiers["product"][0]) else: try: return custom_feature_colors[feature.type] @@ -47,12 +60,21 @@ def compute_feature_color(self, feature): except KeyError: return BiopythonTranslator.compute_feature_color(self, feature) - def compute_feature_label(self, feature): # remove the chop_blocks + def compute_feature_label(self, feature): # remove the chop_blocks self.label_fields = label_fields if feature.type == "CDS": if "product" in feature.qualifiers: - if ignored_gene_labels: # product name drop - verify_chops = any(re.search(("(\\b"+str(item)+"\\b)"),feature.qualifiers["product"][0]) for item in ignored_gene_labels) or any(re.search((item), feature.qualifiers["product"][0]) for item in ignored_gene_labels) + if ignored_gene_labels: # product name drop + verify_chops = any( + re.search( + ("(\\b" + str(item) + "\\b)"), + feature.qualifiers["product"][0], + ) + for item in ignored_gene_labels + ) or any( + re.search((item), feature.qualifiers["product"][0]) + for item in ignored_gene_labels + ) if verify_chops: return None else: @@ -66,7 +88,9 @@ def compute_feature_label(self, feature): # remove the chop_blocks def compute_filtered_features(self, features): return [ - feature for feature in features if feature.type not in ignored_features_types + feature + for feature in features + if feature.type not in ignored_features_types ] def compute_feature_legend_text(self, feature): @@ -85,10 +109,11 @@ def compute_feature_box_linewidth(self, feature): else: return 0 + def parse_gbk(file): - """ simple function to parse out the feature information AND products """ + """simple function to parse out the feature information AND products""" - record = SeqIO.read(file,"genbank") + record = SeqIO.read(file, "genbank") count = 0 feature_types = {} product_names = [] @@ -102,42 +127,115 @@ def parse_gbk(file): return feature_types, product_names, record + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Linear Genome Plot") # Input and Parameters - parser.add_argument("input_file",type=argparse.FileType("r"),help="genbank or gff3 file") - parser.add_argument("--plot_width",type=int,default=20) - #parser.add_argument("--plot_height",type=int,default=4) - parser.add_argument("--title",type=str,default="genome plot") # NEED TO ADD TO XML - parser.add_argument("--common_features_excluded", default="", help="common features to be excluded") - parser.add_argument("--features_excluded",default="",help="features to be excluded from plot, separate by commas") - parser.add_argument("--common_ignore_feature_labels", default="", help="common feature labels to be excluded") - parser.add_argument("--ignored_feature_labels",default="",help="ignore labeling of specific features") - parser.add_argument("--common_ignore_product_labels", default="", help="common product names to not label") - parser.add_argument("--ignore_labeling",default="",help="labeling for specific products to ignore, separate by commas") - parser.add_argument("--feature_label_order",default="locus_tag",help="label order, where the first choice is the first feature listed to pull name labels from") # NEED TO ADD TO XML - parser.add_argument("--label_box",action="store_true",help="Use to have label box around feature labels") - parser.add_argument("--label_algo",action="store_true",help="use dna features spacing algo for label placement (in or above feature)") - 
#parser.add_argument("--level_offset",type=int,default=0,help="All features and annotations will be pushed up by the input amount. Useful for when plotting several sets of features successively on the same axis.") # Will exclude for now - #parser.add_argument("--custom_region",action="store_true",help="cropped region for plot") - parser.add_argument("--sz",type=int,help="beginning location for crop") - parser.add_argument("--ez",type=int,help="end location for crop") - parser.add_argument("--st",type=int,help="start site of translation") - parser.add_argument("--et",type=int,help="end site of translation") - parser.add_argument("--translation_on",action="store_true",help="plot the translation sub-axis") - parser.add_argument("--feature_id",nargs="*",action="append",help="feature label to have custom color") # NEED TO ADD TO XML - parser.add_argument("--feature_id_color",nargs="*",action="append",help="feature's accompanying color") - parser.add_argument("--gene_id",nargs="*",action="append",help="gene/cds label to have custom color") - parser.add_argument("--gene_id_color",nargs="*",action="append",help="gene/cds's accompanying color") - parser.add_argument("--multiline",action="store_true",help="Plot multiline plot") - parser.add_argument("--nucl_per_line",type=int,help="nucleotides per line of multiline") + parser.add_argument( + "input_file", type=argparse.FileType("r"), help="genbank or gff3 file" + ) + parser.add_argument("--plot_width", type=int, default=20) + # parser.add_argument("--plot_height",type=int,default=4) + parser.add_argument( + "--title", type=str, default="genome plot" + ) # NEED TO ADD TO XML + parser.add_argument( + "--common_features_excluded", default="", help="common features to be excluded" + ) + parser.add_argument( + "--features_excluded", + default="", + help="features to be excluded from plot, separate by commas", + ) + parser.add_argument( + "--common_ignore_feature_labels", + default="", + help="common feature labels to be excluded", + ) + parser.add_argument( + "--ignored_feature_labels", + default="", + help="ignore labeling of specific features", + ) + parser.add_argument( + "--common_ignore_product_labels", + default="", + help="common product names to not label", + ) + parser.add_argument( + "--ignore_labeling", + default="", + help="labeling for specific products to ignore, separate by commas", + ) + parser.add_argument( + "--feature_label_order", + default="locus_tag", + help="label order, where the first choice is the first feature listed to pull name labels from", + ) # NEED TO ADD TO XML + parser.add_argument( + "--label_box", + action="store_true", + help="Use to have label box around feature labels", + ) + parser.add_argument( + "--label_algo", + action="store_true", + help="use dna features spacing algo for label placement (in or above feature)", + ) + # parser.add_argument("--level_offset",type=int,default=0,help="All features and annotations will be pushed up by the input amount. 
Useful for when plotting several sets of features successively on the same axis.") # Will exclude for now + # parser.add_argument("--custom_region",action="store_true",help="cropped region for plot") + parser.add_argument("--sz", type=int, help="beginning location for crop") + parser.add_argument("--ez", type=int, help="end location for crop") + parser.add_argument("--st", type=int, help="start site of translation") + parser.add_argument("--et", type=int, help="end site of translation") + parser.add_argument( + "--translation_on", action="store_true", help="plot the translation sub-axis" + ) + parser.add_argument( + "--feature_id", + nargs="*", + action="append", + help="feature label to have custom color", + ) # NEED TO ADD TO XML + parser.add_argument( + "--feature_id_color", + nargs="*", + action="append", + help="feature's accompanying color", + ) + parser.add_argument( + "--gene_id", + nargs="*", + action="append", + help="gene/cds label to have custom color", + ) + parser.add_argument( + "--gene_id_color", + nargs="*", + action="append", + help="gene/cds's accompanying color", + ) + parser.add_argument("--multiline", action="store_true", help="Plot multiline plot") + parser.add_argument( + "--nucl_per_line", type=int, help="nucleotides per line of multiline" + ) # Output - parser.add_argument("--file_stats",type=argparse.FileType("w"),default="out_stats.txt",help="output stat file") - #parser.add_argument("--tmp_img",dest="tmp_img",type=argparse.FileType("wb"),default="out_tmp.svg") - parser.add_argument("--out_img",dest="out_img",type=argparse.FileType("wb"),default="out_img.svg",help="svg genome plot") + parser.add_argument( + "--file_stats", + type=argparse.FileType("w"), + default="out_stats.txt", + help="output stat file", + ) + # parser.add_argument("--tmp_img",dest="tmp_img",type=argparse.FileType("wb"),default="out_tmp.svg") + parser.add_argument( + "--out_img", + dest="out_img", + type=argparse.FileType("wb"), + default="out_img.svg", + help="svg genome plot", + ) args = parser.parse_args() - ## Part I ; Parse and send output of features count and the list of product names feature_counts, products, genome = parse_gbk(args.input_file) with args.file_stats as f: @@ -161,8 +259,10 @@ def parse_gbk(file): if args.feature_id: feature_ids = [f for listed_obj in args.feature_id for f in listed_obj] - feature_ids_colors = [f for listed_obj in args.feature_id_color for f in listed_obj] - custom_feature_colors = dict(zip(feature_ids,feature_ids_colors)) + feature_ids_colors = [ + f for listed_obj in args.feature_id_color for f in listed_obj + ] + custom_feature_colors = dict(zip(feature_ids, feature_ids_colors)) else: custom_feature_colors = {} @@ -170,63 +270,67 @@ def parse_gbk(file): if args.gene_id: gene_ids = [g for listed_obj in args.gene_id for g in listed_obj] gene_ids_colors = [g for listed_obj in args.gene_id_color for g in listed_obj] - custom_name_colors = dict(zip(gene_ids,gene_ids_colors)) + custom_name_colors = dict(zip(gene_ids, gene_ids_colors)) else: custom_name_colors = {} ## Ignored Features - #ignored_features_types = str.split(args.features_excluded,",") + # ignored_features_types = str.split(args.features_excluded,",") if args.common_features_excluded: ignored_features_types = str.split(args.common_features_excluded, ",") if args.features_excluded: - ignored_features_types += str.split(args.features_excluded,",") + ignored_features_types += str.split(args.features_excluded, ",") elif args.features_excluded: - ignored_features_types = 
str.split(args.features_excluded,",") + ignored_features_types = str.split(args.features_excluded, ",") else: ignored_features_types = False print(ignored_features_types) - + ## product labels if args.common_ignore_product_labels: - ignored_gene_labels = str.split(args.common_ignore_product_labels,",") + ignored_gene_labels = str.split(args.common_ignore_product_labels, ",") if args.ignore_labeling: - ignored_gene_labels += str.split(args.ignore_labeling,",") + ignored_gene_labels += str.split(args.ignore_labeling, ",") elif args.ignore_labeling: - ignored_gene_labels = str.split(args.ignore_labeling,",") + ignored_gene_labels = str.split(args.ignore_labeling, ",") else: ignored_gene_labels = False - + print(ignored_gene_labels) - if args.feature_label_order != ['']: - label_fields = str.split(args.feature_label_order,",") + if args.feature_label_order != [""]: + label_fields = str.split(args.feature_label_order, ",") - #if ignored_gene_labels == ['']: + # if ignored_gene_labels == ['']: # ignored_gene_labels = False ## Ignored Labeling if args.common_ignore_feature_labels: - ignored_feature_labels = str.split(args.common_ignore_feature_labels,",") + ignored_feature_labels = str.split(args.common_ignore_feature_labels, ",") if args.ignored_feature_labels: - ignored_feature_labels += str.split(args.ignored_feature_labels,",") + ignored_feature_labels += str.split(args.ignored_feature_labels, ",") elif args.ignored_feature_labels: - ignored_feature_labels = str.split(args.ignored_feature_labels,",") + ignored_feature_labels = str.split(args.ignored_feature_labels, ",") else: ignored_feature_labels = False - + print(ignored_feature_labels) ## Print Statements for Debugging - #print(custom_feature_colors) - #print(custom_name_colors) - #print(ignored_features_types) - #print(ignored_gene_labels) - #print(label_fields) + # print(custom_feature_colors) + # print(custom_name_colors) + # print(ignored_features_types) + # print(ignored_gene_labels) + # print(label_fields) ## Part III ; PLOT # Housekeeping - rc_context({"font.family": ["monospace"],}) # courier-like - matplotlib.use('Agg') # I think this has to be used... + rc_context( + { + "font.family": ["monospace"], + } + ) # courier-like + matplotlib.use("Agg") # I think this has to be used... 
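The housekeeping reflowed above is the standard headless-matplotlib pattern: Agg renders with no display attached, which is what a Galaxy job gets, and matplotlib's documentation wants the backend selected before pyplot draws its first figure, so the hedged "I think this has to be used" comment is justified. A minimal sketch of the crop-and-plot path these hunks touch, with invented coordinates standing in for the parsed record (the GraphicRecord/GraphicFeature names come from dna_features_viewer, whose crop()/plot()/plot_translation() API matches the calls in this file):

import matplotlib

matplotlib.use("Agg")  # pick the non-interactive backend before plotting anything
import matplotlib.pyplot as plt
from dna_features_viewer import GraphicFeature, GraphicRecord

# Toy stand-in for the record built from the GenBank/GFF3 input.
record = GraphicRecord(
    sequence_length=1200,
    features=[GraphicFeature(start=50, end=600, strand=+1, label="gene1")],
)
cropped = record.crop((40, 700))  # what the --sz/--ez branch below does
ax, _ = cropped.plot(figure_width=20, annotate_inline=True)
ax.set_title("genome plot")
plt.savefig("tmp.svg")
plt.close()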
if args.label_algo: lab_algo = True @@ -240,13 +344,22 @@ def parse_gbk(file): img.truncate(0) img.close() - if args.sz and not args.multiline: # if user is wanting to look at a subset region of the genome + if ( + args.sz and not args.multiline + ): # if user is wanting to look at a subset region of the genome zoom_start, zoom_end = args.sz, args.ez - cropped = graphic_record.crop((zoom_start,zoom_end)) - ax, _ = cropped.plot(figure_width=args.plot_width, annotate_inline=lab_algo,figure_height=None) + cropped = graphic_record.crop((zoom_start, zoom_end)) + ax, _ = cropped.plot( + figure_width=args.plot_width, annotate_inline=lab_algo, figure_height=None + ) if args.translation_on: crop_seq = (args.st - 1, args.et) - cropped.plot_translation(ax, location=crop_seq, fontdict={'size':8, 'weight':'bold'},y_offset=1) + cropped.plot_translation( + ax, + location=crop_seq, + fontdict={"size": 8, "weight": "bold"}, + y_offset=1, + ) ax.set_title(args.title) # Galaxy specific shenanigans tmp_fig = "./tmp.svg" @@ -257,20 +370,22 @@ def parse_gbk(file): zoom_start, zoom_end = args.sz, args.ez else: zoom_start, zoom_end = 1, graphic_record.sequence_length - cropped = graphic_record.crop((zoom_start,zoom_end)) + cropped = graphic_record.crop((zoom_start, zoom_end)) ax, _ = cropped.plot_on_multiple_lines( figure_width=args.plot_width, annotate_inline=lab_algo, figure_height=None, nucl_per_line=args.nucl_per_line, - plot_sequence=False + plot_sequence=False, ) - #ax.set_title(args.title) + # ax.set_title(args.title) tmp_fig = "./tmp.svg" plt.savefig(tmp_fig) plt.close() else: - ax, _ = graphic_record.plot(figure_width=args.plot_width, annotate_inline=lab_algo) + ax, _ = graphic_record.plot( + figure_width=args.plot_width, annotate_inline=lab_algo + ) ax.set_title(args.title) tmp_fig = "./tmp.svg" # Galaxy specific shenanigans diff --git a/cpt_lipop_conv/gff3.py b/cpt_lipop_conv/gff3.py index d4795d4..48496c3 100755 --- a/cpt_lipop_conv/gff3.py +++ b/cpt_lipop_conv/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_lipop_conv/lipoP_to_gff3.py b/cpt_lipop_conv/lipoP_to_gff3.py index a12e080..0f7dd34 100755 --- a/cpt_lipop_conv/lipoP_to_gff3.py +++ b/cpt_lipop_conv/lipoP_to_gff3.py @@ -24,18 +24,17 @@ def lipoP_gff(lipoIn, gff3In, jBrowseOut, filterSP2): rowElem = row.split("\t") orgID = rowElem[0] - + if filterSP2: - if rowElem[2] == "CleavII": - if not (orgID in orgIDs.keys()): - orgIDs[orgID] = [] - orgIDs[orgID].append(int(rowElem[3])) # , int(rowElem[4]))) + if rowElem[2] == "CleavII": + if not (orgID in orgIDs.keys()): + orgIDs[orgID] = [] + orgIDs[orgID].append(int(rowElem[3])) # , int(rowElem[4]))) else: - if rowElem[2] in "CleavII": - if not (orgID in orgIDs.keys()): - orgIDs[orgID] = [] - orgIDs[orgID].append(int(rowElem[3])) # , int(rowElem[4]))) - + if rowElem[2] in "CleavII": + if not (orgID in orgIDs.keys()): + orgIDs[orgID] = [] + orgIDs[orgID].append(int(rowElem[3])) # , int(rowElem[4]))) # Rebase for gff in gffParse(gff3In): @@ -61,7 +60,7 @@ def lipoP_gff(lipoIn, gff3In, jBrowseOut, filterSP2): keepSeq.append(xRec) continue - #if jBrowseOut: + # if jBrowseOut: # 
xRec.sub_features = [] i = 0 @@ -103,7 +102,7 @@ def lipoP_gff(lipoIn, gff3In, jBrowseOut, filterSP2): ) parser.add_argument( "--filterSP2", - action='store_true', + action="store_true", help="Filter for only SPII sites", ) args = parser.parse_args() diff --git a/cpt_lipory/gff3.py b/cpt_lipory/gff3.py index d4795d4..48496c3 100755 --- a/cpt_lipory/gff3.py +++ b/cpt_lipory/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_lipory/lipory.py b/cpt_lipory/lipory.py index dbba3e8..98533e6 100755 --- a/cpt_lipory/lipory.py +++ b/cpt_lipory/lipory.py @@ -19,9 +19,7 @@ def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdis re.compile( "^.{%s,%s}[ILMFTV][^REKD][GAS]C" % (lipobox_mindist, lipobox_maxdist) ), - re.compile( - "^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist) - ), + re.compile("^.{%s,%s}AW[AGS]C" % (lipobox_mindist, lipobox_maxdist)), # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file ] diff --git a/cpt_mist3/mist3.py b/cpt_mist3/mist3.py index 5e4a09a..58368a4 100755 --- a/cpt_mist3/mist3.py +++ b/cpt_mist3/mist3.py @@ -41,7 +41,9 @@ class FancyRecord(object): def __init__(self, record, tmpdir): - self.temp = tempfile.NamedTemporaryFile(mode='w', dir=tmpdir, delete=False, suffix=".fa") + self.temp = tempfile.NamedTemporaryFile( + mode="w", dir=tmpdir, delete=False, suffix=".fa" + ) self.temp_path = self.temp.name self.id = self.temp_path.rsplit("/")[-1] self.record = record @@ -110,8 +112,7 @@ def __repr__(self): return "Subplot [%s]" % self.get_description() def run_gepard(self, matrix, window, global_rescale="35%"): - """Run gepard on two sequences, with a specified output file - """ + """Run gepard on two sequences, with a specified output file""" log.info("Running Gepard on %s", self.get_description()) destination_fn = ( @@ -138,8 +139,8 @@ def run_gepard(self, matrix, window, global_rescale="35%"): "--silent", ] log.debug(subprocess.list2cmdline(cmd)) - #log.info(subprocess.check_output("convert -list type")) - #exit(2) + # log.info(subprocess.check_output("convert -list type")) + # exit(2) failure_count = 0 while True: try: @@ -209,8 +210,7 @@ def char_height(font_size): return int(float(font_size) * 30 / 40) def est_pixels(string, font_size): - """guess pixel width of a string at a given font size - """ + """guess pixel width of a string at a given font size""" return char_width(font_size) * len(string) j_ticks = int(Misty.BestTick(self.j.length, 5)) @@ -246,7 +246,10 @@ def est_pixels(string, font_size): primary_header = self.i.header secondary_head = self.i.description cmd += ( - ["-rotate", "-90",] + [ + "-rotate", + "-90", + ] + FONT_30pt + [ # Side label (i/row) @@ -266,7 +269,10 @@ def est_pixels(string, font_size): ] ) - if est_pixels(self.i.description, 10) < original_dims[1] and secondary_head != "": + if ( + est_pixels(self.i.description, 10) < original_dims[1] + and secondary_head != "" + ): cmd += FONT_10pt + [ # Side label (i/row) "-annotate", @@ -340,7 +346,10 @@ def est_pixels(string, font_size): ] ) - if est_pixels(self.j.description, 10) < 
original_dims[0] and secondary_head != "": + if ( + est_pixels(self.j.description, 10) < original_dims[0] + and secondary_head != "" + ): cmd += FONT_10pt + [ "-annotate", "+%s+%s" @@ -379,25 +388,23 @@ def est_pixels(string, font_size): cmd += ["-annotate", "+%s+%s" % (x + 5, y), self.label_formatter(z)] cmd.append(outfile) - #tmpFile = open(outfile, "w") - #tmpFile.close() - log.info(subprocess.check_output( ["cp", infile, outfile] )) + # tmpFile = open(outfile, "w") + # tmpFile.close() + log.info(subprocess.check_output(["cp", infile, outfile])) log.info(subprocess.list2cmdline(cmd)) - log.info(subprocess.check_output( "ls" )) + log.info(subprocess.check_output("ls")) log.info(self.tmpdir) - log.info(subprocess.check_output( ["ls", self.tmpdir])) + log.info(subprocess.check_output(["ls", self.tmpdir])) log.info(outfile[2:]) log.info("Above was ls\n") try: - subprocess.check_output(cmd)# + [" 2>&1"]) + subprocess.check_output(cmd) # + [" 2>&1"]) except: log.info("Excepted") - class Misty(object): - """MIST Class for building MIST Plots - """ + """MIST Class for building MIST Plots""" def __init__(self, window=10, zoom=50, matrix="edna", files_path="mist_images"): self.tmpdir = tempfile.mkdtemp(prefix="cpt.mist3.", dir=".") @@ -563,7 +570,7 @@ def _generate_montage(self): MONTAGE_BORDER_COORD = "%sx%s" % (MONTAGE_BORDER, MONTAGE_BORDER) m0 = os.path.join(self.tmpdir, "m0.png") -# log.info(subprocess.check_output( ["cp", image_list[0], m0] )) + # log.info(subprocess.check_output( ["cp", image_list[0], m0] )) cmd = ["montage"] + image_list cmd += [ "-tile", @@ -581,9 +588,9 @@ def _generate_montage(self): log.debug(" ".join(cmd)) try: - subprocess.check_call(cmd) + subprocess.check_call(cmd) except: - log.debug("Excepted, 2") + log.debug("Excepted, 2") # Add grey borders montage_path = os.path.join(self.tmpdir, "montage.png") cmd = [ @@ -602,9 +609,9 @@ def _generate_montage(self): log.debug(" ".join(cmd)) try: - subprocess.check_call(cmd) + subprocess.check_call(cmd) except: - log.debug("Excepted, 2") + log.debug("Excepted, 2") os.unlink(m0) return montage_path @@ -629,10 +636,7 @@ def _annotate_montage(self, base_path): current_sum_width = MONTAGE_BORDER current_sum_height = MONTAGE_BORDER - convert_arguments_top+= [ - "-rotate", - "-90" - ] + convert_arguments_top += ["-rotate", "-90"] # Top side for j in range(len(self.matrix_data[0])): subplot = self.matrix_data[0][j]["subplot"] @@ -640,27 +644,39 @@ def _annotate_montage(self, base_path): "-fill", LABEL_COLOUR, "-annotate", - "-%s+%s" % (0, str(cumulative_width - current_sum_width -(subplot.get_thumb_dims()[0]/2) + (2 * MONTAGE_BORDER) + IMAGE_BORDER)), + "-%s+%s" + % ( + 0, + str( + cumulative_width + - current_sum_width + - (subplot.get_thumb_dims()[0] / 2) + + (2 * MONTAGE_BORDER) + + IMAGE_BORDER + ), + ), subplot.j.header, ] current_sum_width += subplot.get_thumb_dims()[0] + (2 * IMAGE_BORDER) log.debug("CSW %s", current_sum_width) - convert_arguments_top+= [ - "-rotate", - "90" - ] + convert_arguments_top += ["-rotate", "90"] # Left side - #convert_arguments_left += [ + # convert_arguments_left += [ # "-rotate", # "90" - #] + # ] for i in range(len(self.matrix_data)): subplot = self.matrix_data[i][0]["subplot"] convert_arguments_left += [ "-fill", LABEL_COLOUR, "-annotate", - "+2+%s" % str(current_sum_height + (subplot.get_thumb_dims()[1]/2.0) + IMAGE_BORDER), + "+2+%s" + % str( + current_sum_height + + (subplot.get_thumb_dims()[1] / 2.0) + + IMAGE_BORDER + ), "\n" + subplot.i.header, ] current_sum_height += 
subplot.get_thumb_dims()[1] + (2 * IMAGE_BORDER) @@ -669,15 +685,15 @@ def _annotate_montage(self, base_path): cmd = [ "convert", base_path, - # "-rotate", - # "-90", + # "-rotate", + # "-90", "-pointsize", "20", "-font", TYPEFONT, ] cmd += convert_arguments_left - # cmd += ["-rotate", "90"] + # cmd += ["-rotate", "90"] cmd += convert_arguments_top output_path = os.path.join(self.tmpdir, "large.png") @@ -694,7 +710,7 @@ def _annotate_montage(self, base_path): subprocess.check_call(cmd) except: log.debug("Excepted, 3") - #subprocess.check_output(cmd) + # subprocess.check_output(cmd) return output_path def run(self): diff --git a/cpt_phageqc_annotation/gff3.py b/cpt_phageqc_annotation/gff3.py index d4795d4..48496c3 100755 --- a/cpt_phageqc_annotation/gff3.py +++ b/cpt_phageqc_annotation/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_phageqc_annotation/phage_annotation_validator.py b/cpt_phageqc_annotation/phage_annotation_validator.py index d780272..97f80ba 100755 --- a/cpt_phageqc_annotation/phage_annotation_validator.py +++ b/cpt_phageqc_annotation/phage_annotation_validator.py @@ -56,14 +56,13 @@ def gen_qc_feature(start, end, message, strand=0, id_src=None, type_src="gene"): kwargs = {"qualifiers": {"note": [message]}} kwargs["type"] = type_src kwargs["strand"] = strand - kwargs["phase"]=0 - kwargs["score"]=0.0 - kwargs["source"]="feature" + kwargs["phase"] = 0 + kwargs["score"] = 0.0 + kwargs["source"] = "feature" if id_src is not None: kwargs["id"] = id_src.id kwargs["qualifiers"]["ID"] = [id_src.id] kwargs["qualifiers"]["Name"] = id_src.qualifiers.get("Name", []) - if end >= start: return gffSeqFeature(FeatureLocation(start, end, strand=strand), **kwargs) @@ -142,13 +141,22 @@ def missing_rbs(record, lookahead_min=5, lookahead_max=15): qc_features.append( gen_qc_feature( - start, end, "Missing RBS", strand=gene.strand, id_src=gene, type_src="gene" + start, + end, + "Missing RBS", + strand=gene.strand, + id_src=gene, + type_src="gene", ) ) bad += 1 results.append(gene) - results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) + results[-1].location = FeatureLocation( + results[-1].location.start + 1, + results[-1].location.end, + results[-1].location.strand, + ) else: if len(rbss) > 1: log.warn("%s RBSs found for gene %s", rbss[0].id, get_gff3_id(gene)) @@ -174,13 +182,17 @@ def missing_rbs(record, lookahead_min=5, lookahead_max=15): gene.__message, strand=gene.strand, id_src=gene, - type_src="gene" + type_src="gene", ) ) bad += 1 results.append(gene) - results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) + results[-1].location = FeatureLocation( + results[-1].location.start + 1, + results[-1].location.end, + results[-1].location.strand, + ) else: good += 1 @@ -275,7 +287,7 @@ def excessive_gap( b = contiguous_regions[i] gap_size = abs(b[0] - a[1]) - + if gap_size > min(excess, excess_divergent): a_feat_l = itertools.islice( feature_lambda( @@ -337,7 +349,9 @@ def excessive_gap( for result_obj in results: start = 
result_obj[0] end = result_obj[1] - f = gen_qc_feature(start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene") + f = gen_qc_feature( + start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene" + ) qc_features.append(f) putative_genes = of.putative_genes_in_sequence( str(record[start - slop : end + slop].seq) @@ -360,7 +374,9 @@ def excessive_gap( else: possible_cds = gffSeqFeature( FeatureLocation( - possible_gene_end, possible_gene_start, strand=putative_gene[2], + possible_gene_end, + possible_gene_start, + strand=putative_gene[2], ), type="CDS", ) @@ -382,7 +398,9 @@ def excessive_gap( else: possible_rbs = gffSeqFeature( FeatureLocation( - putative_gene[6], putative_gene[5], strand=putative_gene[2], + putative_gene[6], + putative_gene[5], + strand=putative_gene[2], ), type="Shine_Dalgarno_sequence", ) @@ -398,7 +416,9 @@ def excessive_gap( else: possible_gene = gffSeqFeature( FeatureLocation( - possible_gene_end, possible_gene_start, strand=putative_gene[2], + possible_gene_end, + possible_gene_start, + strand=putative_gene[2], ), type="gene", qualifiers={"note": ["Possible gene"]}, @@ -506,7 +526,13 @@ def excessive_overlap(record, excess=15, excess_divergent=30): ): bad += float(len(ix)) / float(min(excess, excess_divergent)) qc_features.append( - gen_qc_feature(min(ix), max(ix), "Excessive Overlap", id_src=gene_a, type_src="gene") + gen_qc_feature( + min(ix), + max(ix), + "Excessive Overlap", + id_src=gene_a, + type_src="gene", + ) ) results.append((gene_a, gene_b, min(ix), max(ix))) @@ -520,8 +546,7 @@ def excessive_overlap(record, excess=15, excess_divergent=30): def get_encouragement(score): - """Some text telling the user how they did - """ + """Some text telling the user how they did""" for encouragement in ENCOURAGEMENT: if score > encouragement[0]: return encouragement[1] @@ -529,8 +554,7 @@ def get_encouragement(score): def genome_overview(record): - """Genome overview - """ + """Genome overview""" data = { "genes": { "count": 0, @@ -553,11 +577,19 @@ def genome_overview(record): data["genes"]["count"] = len(gene_features) for feat in gene_features: - data["genes"]["comp"]["A"] += feat.extract(record).seq.count("A") + feat.extract(record).seq.count("a") - data["genes"]["comp"]["C"] += feat.extract(record).seq.count("C") + feat.extract(record).seq.count("c") - data["genes"]["comp"]["T"] += feat.extract(record).seq.count("T") + feat.extract(record).seq.count("t") - data["genes"]["comp"]["G"] += feat.extract(record).seq.count("G") + feat.extract(record).seq.count("g") - #data["genes"]["bases"] += len(feat) + data["genes"]["comp"]["A"] += feat.extract(record).seq.count( + "A" + ) + feat.extract(record).seq.count("a") + data["genes"]["comp"]["C"] += feat.extract(record).seq.count( + "C" + ) + feat.extract(record).seq.count("c") + data["genes"]["comp"]["T"] += feat.extract(record).seq.count( + "T" + ) + feat.extract(record).seq.count("t") + data["genes"]["comp"]["G"] += feat.extract(record).seq.count( + "G" + ) + feat.extract(record).seq.count("g") + # data["genes"]["bases"] += len(feat) data["genes"]["avg_len"].append(len(feat)) data["genes"]["avg_len"] = float(sum(data["genes"]["avg_len"])) / len(gene_features) @@ -602,8 +634,7 @@ def find_morons(record): def bad_gene_model(record): - """Find features without product - """ + """Find features without product""" results = [] good = 0 bad = 0 @@ -630,8 +661,8 @@ def bad_gene_model(record): gene.location.end, "Mismatched number of exons and CDSs in gff3 representation", strand=gene.strand, - 
id_src=gene, - type_src="gene" + id_src=gene, + type_src="gene", ) ) bad += 1 @@ -655,8 +686,8 @@ def bad_gene_model(record): exon.location.end, "CDS does not extend to full length of gene", strand=exon.strand, - id_src=gene, - type_src="CDS" + id_src=gene, + type_src="CDS", ) ) bad += 1 @@ -677,8 +708,7 @@ def bad_gene_model(record): def weird_starts(record): - """Find features without product - """ + """Find features without product""" good = 0 bad = 0 qc_features = [] @@ -696,26 +726,28 @@ def weird_starts(record): seq_str = str(seq.extract(record.seq)) start_codon = seq_str[0:3] if len(seq_str) < 3: - sys.stderr.write("Fatal Error: CDS of length less than 3 at " + str(seq.location) + '\n') + sys.stderr.write( + "Fatal Error: CDS of length less than 3 at " + str(seq.location) + "\n" + ) exit(2) -# if len(seq_str) % 3 != 0: -# if len(seq_str) < 3: -# stop_codon = seq_str[-(len(seq_str))] -# else: -# stop_codon = seq_str[-3] -# -# log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str)) -# seq.__error = "Bad CDS Length" -# results.append(seq) -# qc_features.append( -# gen_qc_feature( -# s, e, "Bad Length", strand=seq.strand, id_src=gene -# ) -# ) -# bad += 1 -# seq.__start = start_codon -# seq.__stop = stop_codon -# continue + # if len(seq_str) % 3 != 0: + # if len(seq_str) < 3: + # stop_codon = seq_str[-(len(seq_str))] + # else: + # stop_codon = seq_str[-3] + # + # log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str)) + # seq.__error = "Bad CDS Length" + # results.append(seq) + # qc_features.append( + # gen_qc_feature( + # s, e, "Bad Length", strand=seq.strand, id_src=gene + # ) + # ) + # bad += 1 + # seq.__start = start_codon + # seq.__stop = stop_codon + # continue stop_codon = seq_str[-3] seq.__start = start_codon @@ -739,10 +771,19 @@ def weird_starts(record): e = seq.location.end - 3 results.append(seq) - results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) + results[-1].location = FeatureLocation( + results[-1].location.start + 1, + results[-1].location.end, + results[-1].location.strand, + ) qc_features.append( gen_qc_feature( - s, e, "Weird start codon", strand=seq.strand, id_src=gene, type_src="gene" + s, + e, + "Weird start codon", + strand=seq.strand, + id_src=gene, + type_src="gene", ) ) bad += 1 @@ -753,8 +794,7 @@ def weird_starts(record): def missing_genes(record): - """Find features without product - """ + """Find features without product""" results = [] good = 0 bad = 0 @@ -804,8 +844,8 @@ def gene_model_correction_issues(record): gene.location.start, gene.location.end, "Gene is missing a locus_tag", - strand=gene.strand, - type_src="gene" + strand=gene.strand, + type_src="gene", ) ) @@ -830,8 +870,8 @@ def gene_model_correction_issues(record): cds.location.start, cds.location.end, "CDS is missing a locus_tag", - strand=cds.strand, - type_src="CDS" + strand=cds.strand, + type_src="CDS", ) ) local_qc_features.append( @@ -839,8 +879,8 @@ def gene_model_correction_issues(record): gene.location.start, gene.location.end, "Gene is missing a locus_tag", - strand=gene.strand, - type_src="gene" + strand=gene.strand, + type_src="gene", ) ) elif problem == "Different locus tag from associated gene.": @@ -854,8 +894,8 @@ def gene_model_correction_issues(record): gene.location.start, gene.location.end, "Gene and CDS have differing locus tags", - strand=gene.strand, - type_src="gene" + strand=gene.strand, + 
type_src="gene", ) ) elif problem == "Missing Locus Tag": @@ -869,8 +909,8 @@ def gene_model_correction_issues(record): cds.location.start, cds.location.end, "CDS is missing a locus_tag", - strand=cds.strand, - type_src="CDS" + strand=cds.strand, + type_src="CDS", ) ) else: @@ -887,8 +927,7 @@ def gene_model_correction_issues(record): def missing_tags(record): - """Find features without product - """ + """Find features without product""" results = [] good = 0 bad = 0 @@ -910,7 +949,7 @@ def missing_tags(record): cds.location.end, "Missing product tag", strand=cds.strand, - type_src="CDS" + type_src="CDS", ) ) results.append(cds) @@ -944,9 +983,9 @@ def evaluate_and_report( # TODO: support multiple GFF3 files. mostFeat = 0 for rec in list(gffParse(annotations, base_dict=seq_dict)): - if len(rec.features) > mostFeat: - mostFeat = len(rec.features) - record = rec + if len(rec.features) > mostFeat: + mostFeat = len(rec.features) + record = rec gff3_qc_record = SeqRecord(record.id, id=record.id) gff3_qc_record.features = [] @@ -1132,15 +1171,15 @@ def evaluate_and_report( def nice_strand(direction): # It is somehow possible for whole gffSeqFeature objects to end up in here, apparently at the gene level if "SeqFeature" in str(type(direction)): - direction = direction.location.strand + direction = direction.location.strand if direction > 0: - return "→"#.decode("utf-8") + return "→" # .decode("utf-8") else: - return "←"#.decode("utf-8") + return "←" # .decode("utf-8") def nice_strand_tex(direction): if "SeqFeature" in str(type(direction)): - direction = direction.location.strand + direction = direction.location.strand if direction > 0: return "$\\rightarrow$" else: @@ -1153,13 +1192,13 @@ def length(data): return len(data) def my_encode(data): - return str(data)#.encode("utf-8") + return str(data) # .encode("utf-8") def my_decode(data): # For production - return str(data)#.decode("utf-8") + return str(data) # .decode("utf-8") # For local testing. No, I do not understand. - return str(data)#.encode("utf-8")).decode("utf-8") + return str(data) # .encode("utf-8")).decode("utf-8") env = Environment( loader=FileSystemLoader(SCRIPT_PATH), trim_blocks=True, lstrip_blocks=True @@ -1176,7 +1215,7 @@ def my_decode(data): } ) tpl = env.get_template(reportTemplateName) - return tpl.render(**kwargs)#.encode("utf-8") + return tpl.render(**kwargs) # .encode("utf-8") if __name__ == "__main__": diff --git a/cpt_phageqc_annotation/shinefind.py b/cpt_phageqc_annotation/shinefind.py index c51665e..509cff3 100755 --- a/cpt_phageqc_annotation/shinefind.py +++ b/cpt_phageqc_annotation/shinefind.py @@ -53,10 +53,10 @@ def list_sds(self, sequence, sd_min=3, sd_max=17): for regex in self.sd_reg: for match in regex.finditer(sequence): spacing = len(sequence) - len(match.group()) - match.start() - if sd_max >= spacing+sd_min and spacing+sd_min >= sd_min: - #if the spacing is within gap limits, add - #(search space is [sd_max+7 .. sd_min] so actual gap is spacing+sd_min) - #print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min)) + if sd_max >= spacing + sd_min and spacing + sd_min >= sd_min: + # if the spacing is within gap limits, add + # (search space is [sd_max+7 .. 
sd_min] so actual gap is spacing+sd_min) + # print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min)) hits.append( { "spacing": spacing, @@ -66,7 +66,7 @@ def list_sds(self, sequence, sd_min=3, sd_max=17): "len": len(match.group()), } ) - hits = sorted(hits, key= lambda x: (-x['len'],x['spacing'])) + hits = sorted(hits, key=lambda x: (-x["len"], x["spacing"])) return hits @classmethod @@ -80,7 +80,16 @@ def highlight_sd(cls, sequence, start, end): ) @classmethod - def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd_min=3, sd_max=17): + def to_features( + cls, + hits, + strand, + parent_start, + parent_end, + feature_id=None, + sd_min=3, + sd_max=17, + ): results = [] for idx, hit in enumerate(hits): # gene complement(124..486) @@ -90,7 +99,7 @@ def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd # -1 491 501 2 3 5 # -1 491 501 1 3 5 # -1 491 501 0 3 5 - + qualifiers = { "source": "CPT_ShineFind", "ID": "%s.rbs-%s" % (feature_id, idx), @@ -108,7 +117,7 @@ def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd # minimum absolute value of these two will be the proper gap regardless of strand tmp = gffSeqFeature( FeatureLocation(min(start, end), max(start, end), strand=strand), - #FeatureLocation(min(start, end), max(start, end), strand=strand), + # FeatureLocation(min(start, end), max(start, end), strand=strand), type="Shine_Dalgarno_sequence", qualifiers=qualifiers, ) @@ -133,7 +142,10 @@ def testFeatureUpstream(self, feature, record, sd_min=3, sd_max=17): # Create our temp feature used to obtain correct portion of # genome - tmp = gffSeqFeature(FeatureLocation(min(start, end), max(start, end), strand=strand), type="domain") + tmp = gffSeqFeature( + FeatureLocation(min(start, end), max(start, end), strand=strand), + type="domain", + ) seq = str(tmp.extract(record.seq)) return self.list_sds(seq, sd_min, sd_max), start, end, seq @@ -175,6 +187,7 @@ def fix_gene_boundaries(feature): feature.location = FeatureLocation(fmin, fmax, strand=-1) return feature + def shinefind( fasta, gff3, diff --git a/cpt_promote_qualifiers/gff3.py b/cpt_promote_qualifiers/gff3.py index d4795d4..48496c3 100755 --- a/cpt_promote_qualifiers/gff3.py +++ b/cpt_promote_qualifiers/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_promote_qualifiers/promote_qualifier.py b/cpt_promote_qualifiers/promote_qualifier.py index 96dbf12..c4e8cad 100755 --- a/cpt_promote_qualifiers/promote_qualifier.py +++ b/cpt_promote_qualifiers/promote_qualifier.py @@ -31,7 +31,10 @@ def promote_qualifier(qualifier, parent, child, gff3): reverse=False if parent_feature.strand > 0 else True, )[0] except IndexError: - logging.warning("Child type %s not found under parent %s" % (child, parent_feature.qualifiers["ID"])) + logging.warning( + "Child type %s not found under parent %s" + % (child, parent_feature.qualifiers["ID"]) + ) continue try: parent_feature.qualifiers[qualifier] = first_child.qualifiers[qualifier] diff --git a/cpt_psm_recombine/PSM_Recombine.py b/cpt_psm_recombine/PSM_Recombine.py index 
f3da237..bf727cf 100644 --- a/cpt_psm_recombine/PSM_Recombine.py +++ b/cpt_psm_recombine/PSM_Recombine.py @@ -10,49 +10,50 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Identify shine-dalgarno sequences") parser.add_argument("psmTable", type=argparse.FileType("r")) - parser.add_argument('gbkList', type=argparse.FileType("r"), nargs="+") + parser.add_argument("gbkList", type=argparse.FileType("r"), nargs="+") args = parser.parse_args() - + gbkRecs = [] recIDs = [] - recFlatten = [] # Can only seek argparse file once + recFlatten = [] # Can only seek argparse file once for f in args.gbkList: - tempRecs = SeqIO.parse(f, "genbank") - for rec in tempRecs: - recFlatten.append(rec) + tempRecs = SeqIO.parse(f, "genbank") + for rec in tempRecs: + recFlatten.append(rec) for line in args.psmTable: - lineElems = line.split("\t") - numGenes = 0 - accession = "" - lineOut = "" - if recIDs == []: - for i in lineElems: - recIDs.append(i.strip()) - lineOut += i.strip() + "\t" - for rec in recFlatten: - if i.strip() in rec.id or rec.id in i.strip(): - gbkRecs.append(rec) - lineOut += "No. of phages in which gene is present\tBest Database Match" - print(lineOut) - continue - - for i in range(0, len(lineElems)): - checkFeat = lineElems[i].strip() - if checkFeat == "-": - lineOut += "(-)\t" - continue - else: - lineOut += checkFeat + "\t" - numGenes += 1 - if accession == "": - for feat in gbkRecs[i].features: - if "locus_tag" in feat.qualifiers.keys() and feat.qualifiers["locus_tag"][0] == checkFeat: - if "protein_id" in feat.qualifiers.keys(): - accession = feat.qualifiers["protein_id"][0] - break # Comment out if we need to get more info - lineOut += str(numGenes) + "\t" + accession - print(lineOut) - + lineElems = line.split("\t") + numGenes = 0 + accession = "" + lineOut = "" + if recIDs == []: + for i in lineElems: + recIDs.append(i.strip()) + lineOut += i.strip() + "\t" + for rec in recFlatten: + if i.strip() in rec.id or rec.id in i.strip(): + gbkRecs.append(rec) + lineOut += "No. of phages in which gene is present\tBest Database Match" + print(lineOut) + continue + for i in range(0, len(lineElems)): + checkFeat = lineElems[i].strip() + if checkFeat == "-": + lineOut += "(-)\t" + continue + else: + lineOut += checkFeat + "\t" + numGenes += 1 + if accession == "": + for feat in gbkRecs[i].features: + if ( + "locus_tag" in feat.qualifiers.keys() + and feat.qualifiers["locus_tag"][0] == checkFeat + ): + if "protein_id" in feat.qualifiers.keys(): + accession = feat.qualifiers["protein_id"][0] + break # Comment out if we need to get more info + lineOut += str(numGenes) + "\t" + accession + print(lineOut) diff --git a/cpt_putative_isp/generate-putative-isp.py b/cpt_putative_isp/generate-putative-isp.py index 56a1ab1..aa7bce2 100755 --- a/cpt_putative_isp/generate-putative-isp.py +++ b/cpt_putative_isp/generate-putative-isp.py @@ -173,24 +173,20 @@ type=int, ) - parser.add_argument( - "--isp_mode", - action="store_true", - default=True - ) + parser.add_argument("--isp_mode", action="store_true", default=True) parser.add_argument( "--peri_min", type=int, default=18, - help="amount of residues after TMD is found min" + help="amount of residues after TMD is found min", ) parser.add_argument( "--peri_max", type=int, default=206, - help="amount of residues after TMD is found max" + help="amount of residues after TMD is found max", ) # parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated? 
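The reindented PSM_Recombine loop above does two jobs per table row: it counts the phages that actually carry the gene (cells that are not "-") and it reports the first accession it can find by matching locus_tag in the corresponding GenBank record. A condensed sketch of that lookup, assuming records parsed with Bio.SeqIO as in the script (the helper name and example locus tag are mine, and the real loop only searches gbkRecs[i], the record for that table column, rather than every record):

from Bio import SeqIO

def accession_for_locus_tag(records, locus_tag):
    # First feature whose locus_tag matches wins; its protein_id, if any,
    # becomes the "Best Database Match" column.
    for rec in records:
        for feat in rec.features:
            if feat.qualifiers.get("locus_tag", [""])[0] == locus_tag:
                return feat.qualifiers.get("protein_id", [""])[0]
    return ""

# accession_for_locus_tag(SeqIO.parse(handle, "genbank"), "SOME_LOCUS_001")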
args = parser.parse_args() @@ -258,7 +254,7 @@ with args.putative_isp_fa as f: for desc, s in candidate_dict.items(): # description / sequence f.write(">" + str(desc)) - f.write("\n" + lineWrapper(str(s).replace("*","")) + "\n") + f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n") length.append(len(s)) # ORF.append(desc) if not length: @@ -274,31 +270,30 @@ else: i = n // 2 med = (length[i - 1] + length[i]) / 2 - + #### Extra statistics args.out_isp_prot.close() all_orfs = open(args.out_isp_prot.name, "r") all_isps = open(args.putative_isp_fa.name, "r") - #record = SeqIO.read(all_orfs, "fasta") - #print(len(record)) + # record = SeqIO.read(all_orfs, "fasta") + # print(len(record)) n = 0 for line in all_orfs: if line.startswith(">"): n += 1 all_orfs_counts = n - + c = 0 for line in all_isps: if line.startswith(">"): c += 1 all_isps_counts = c - #print(f"{n} -> {c}") - #count = 0 - #for feature in record.features: + # print(f"{n} -> {c}") + # count = 0 + # for feature in record.features: # count += 1 - #print(count) - + # print(count) with args.summary_isp_txt as f: f.write("total potential o-spanins: " + str(total_isp) + "\n") @@ -306,11 +301,13 @@ f.write("median length (AA): " + str(med) + "\n") f.write("maximum orf in size (AA): " + str(top_size) + "\n") f.write("minimum orf in size (AA): " + str(bot_size) + "\n") - f.write("ratio of isps found from naive orfs: " + str(c) + "/" +str(n)) + f.write("ratio of isps found from naive orfs: " + str(c) + "/" + str(n)) # Output the putative list in gff3 format args.putative_isp_fa = open(args.putative_isp_fa.name, "r") - gff_data = prep_a_gff3(fa=args.putative_isp_fa, spanin_type="isp",org=args.fasta_file) + gff_data = prep_a_gff3( + fa=args.putative_isp_fa, spanin_type="isp", org=args.fasta_file + ) write_gff3(data=gff_data, output=args.putative_isp_gff) """https://docs.python.org/3.4/library/subprocess.html""" diff --git a/cpt_putative_isp/spaninFuncs.py b/cpt_putative_isp/spaninFuncs.py index 35d627a..bbf5870 100755 --- a/cpt_putative_isp/spaninFuncs.py +++ b/cpt_putative_isp/spaninFuncs.py @@ -18,9 +18,9 @@ def check_back_end_snorkels(seq, tmsize): """ - Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. - --> seq : should be the sequence fed from the "search_region" portion of the sequence - --> tmsize : size of the potential TMD being investigated + Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. + --> seq : should be the sequence fed from the "search_region" portion of the sequence + --> tmsize : size of the potential TMD being investigated """ found = [] if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]): @@ -42,10 +42,10 @@ def check_back_end_snorkels(seq, tmsize): def prep_a_gff3(fa, spanin_type, org): """ - Function parses an input detailed 'fa' file and outputs a 'gff3' file - ---> fa = input .fa file - ---> output = output a returned list of data, easily portable to a gff3 next - ---> spanin_type = 'isp' or 'osp' + Function parses an input detailed 'fa' file and outputs a 'gff3' file + ---> fa = input .fa file + ---> output = output a returned list of data, easily portable to a gff3 next + ---> spanin_type = 'isp' or 'osp' """ with org as f: header = f.readline() @@ -76,17 +76,21 @@ def prep_a_gff3(fa, spanin_type, org): source = "cpt.py|putative-*.py" # column 2 score = "." # column 6 phase = "." 
# column 8 - attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1] # column 9 - sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]] + attributes = ( + "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1] + ) # column 9 + sequence = [ + [orgacc, source, methodtype, start, end, score, strand, phase, attributes] + ] data += sequence return data def write_gff3(data, output="results.gff3"): """ - Parses results from prep_a_gff3 into a gff3 file - ---> input : list from prep_a_gff3 - ---> output : gff3 file + Parses results from prep_a_gff3 into a gff3 file + ---> input : list from prep_a_gff3 + ---> output : gff3 file """ data = data filename = output @@ -109,14 +113,23 @@ f.close() -def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206): - """ - Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD - ---> pair : Input of tuple with description and AA sequence (str) - ---> minimum : How close from the initial start codon a TMD can be within - ---> maximum : How far from the initial start codon a TMD can be within - ---> TMDmin : The minimum size that a transmembrane can be (default = 10) - ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20) +def find_tmd( + pair, + minimum=10, + maximum=30, + TMDmin=10, + TMDmax=20, + isp_mode=False, + peri_min=18, + peri_max=206, +): + """ + Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD + ---> pair : Input of tuple with description and AA sequence (str) + ---> minimum : How close from the initial start codon a TMD can be within + ---> maximum : How far from the initial start codon a TMD can be within + ---> TMDmin : The minimum size that a transmembrane can be (default = 10) + ---> TMDmax : The maximum size that a transmembrane can be (default = 20) """ # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S'] tmd = [] @@ -125,55 +138,62 @@ def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, if maximum > len(s): maximum = len(s) search_region = s[minimum - 1 : maximum + 1] - #print(f"this is the search region: {search_region}") + # print(f"this is the search region: {search_region}") # print(search_region) # for trouble shooting - for tmsize in range(TMDmin, TMDmax+1, 1): - #print(f"this is the current tmsize we're trying: {tmsize}") + for tmsize in range(TMDmin, TMDmax + 1, 1): + # print(f"this is the current tmsize we're trying: {tmsize}") # print('==============='+str(tmsize)+'================') # print for troubleshooting - pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}" # searches for these hydrophobic residues tmsize total times - #print(pattern) - #print(f"sending to regex: {search_region}") + pattern = ( + "[PFIWLVMYCATGS]{" + str(tmsize) + "}" + ) # searches for these hydrophobic residues tmsize total times + # print(pattern) + # print(f"sending to regex: {search_region}") if re.search( - ("[K]"), search_region[1:8]): # grabbing one below with search region, so I want to grab one ahead here when I query. 
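Stripped of the snorkel and periplasmic bookkeeping, the scan being reformatted here is a widening regex probe: for each candidate TMD length, ask whether the N-terminal window contains an unbroken run of residues from the hydrophobic class. A simplified sketch (function name mine; the real find_tmd also clamps maximum to the sequence length, checks lysine snorkels, and collects every hit rather than returning the first):

import re

HYDROPHOBIC = "[PFIWLVMYCATGS]"  # residue class the tool treats as membrane-compatible

def shortest_tmd_hit(seq, minimum=10, maximum=30, tmd_min=10, tmd_max=20):
    # Same windowing as the source: slice [minimum-1 : maximum+1] off the
    # N-terminus, then try each candidate TMD length in turn.
    window = seq[minimum - 1 : maximum + 1]
    for tmsize in range(tmd_min, tmd_max + 1):
        if re.search(HYDROPHOBIC + "{%d}" % tmsize, window):
            return tmsize
    return None

# Invented positive control: nine polar residues, then a 20-residue hydrophobic run.
print(shortest_tmd_hit("MKKSDQRNE" + "LIVLAVGLAFILVWMLIVLA" + "KQRSDE"))  # -> 10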
+ store_search = re.search( + ("[K]"), search_region[1:8] + ) # storing regex object where_we_are = store_search.start() # finding where we got the hit if re.search( ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] ) and re.search( ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1] ): # hydrophobic neighbor - #try: - g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group() + # try: + g = re.search( + ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] + ).group() backend = check_back_end_snorkels(search_region, tmsize) if backend == "match": if isp_mode: g = re.search((pattern), search_region).group() - end_of_tmd = re.search((g), s).end()+1 + end_of_tmd = re.search((g), s).end() + 1 amt_peri = len(s) - end_of_tmd if peri_min <= amt_peri <= peri_max: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) tmd.append(new_pair) else: tmd.append(pair) else: continue - #else: - #print("I'm continuing out of snorkel loop") - #print(f"{search_region}") - #continue + # else: + # print("I'm continuing out of snorkel loop") + # print(f"{search_region}") + # continue if re.search((pattern), search_region): - #print(f"found match: {}") - #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") - #try: + # print(f"found match: {}") + # print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") + # try: if isp_mode: g = re.search((pattern), search_region).group() - end_of_tmd = re.search((g), s).end()+1 + end_of_tmd = re.search((g), s).end() + 1 amt_peri = len(s) - end_of_tmd if peri_min <= amt_peri <= peri_max: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) tmd.append(new_pair) else: tmd.append(pair) @@ -183,13 +203,15 @@ def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, return tmd -def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False): +def find_lipobox( + pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False +): """ - Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox - ---> minimum - min distance from start codon to first AA of lipobox - ---> maximum - max distance from start codon to first AA of lipobox - ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy - + Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox + ---> minimum - min distance from start codon to first AA of lipobox + ---> maximum - max distance from start codon to first AA of lipobox + ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy + """ if regex == 1: pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl @@ -199,19 +221,23 @@ def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, rege candidates = [] s = str(pair[1]) # print(s) # trouble shooting - search_region = s[minimum-1 : maximum + 5] # properly slice the input... add 4 to catch if it hangs off at max input + search_region = s[ + minimum - 1 : maximum + 5 + ] # properly slice the input... 
add 4 to catch if it hangs off at max input # print(search_region) # trouble shooting - patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"] + patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"] for pattern in patterns: - #print(pattern) # trouble shooting + # print(pattern) # trouble shooting if re.search((pattern), search_region): # lipobox must be WITHIN the range... # searches the sequence with the input RegEx AND omits if - g = re.search((pattern), search_region).group() # find the exact group match + g = re.search( + (pattern), search_region + ).group() # find the exact group match amt_peri = len(s) - re.search((g), s).end() + 1 - if min_after <= amt_peri <= max_after: # find the lipobox end region + if min_after <= amt_peri <= max_after: # find the lipobox end region if osp_mode: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) candidates.append(new_pair) else: candidates.append(pair) @@ -221,9 +247,9 @@ def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, rege def tuple_fasta(fasta_file): """ - #### INPUT: Fasta File - #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence - #### + #### INPUT: Fasta File + #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence + #### """ fasta = SeqIO.parse(fasta_file, "fasta") descriptions = [] @@ -281,10 +307,10 @@ def splitStrands(text, strand="+"): def parse_a_range(pair, start, end): """ - Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range - ---> data : fasta tuple data - ---> start : start range to keep - ---> end : end range to keep (will need to + 1) + Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range + ---> data : fasta tuple data + ---> start : start range to keep + ---> end : end range to keep (will need to + 1) """ matches = [] for each_pair in pair: @@ -310,12 +336,18 @@ def grabLocs(text): Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module from cpt.py """ - start = re.search(("[\d]+\.\."), text).group(0) # Start of the sequence ; looks for [numbers].. - end = re.search(("\.\.[\d]+"), text).group(0) # End of the sequence ; Looks for ..[numbers] - orf = re.search(("(ORF)[\d]+"), text).group(0) # Looks for ORF and the numbers that are after it - if re.search(("(\[1\])"), text): # stores strand + start = re.search(("[\d]+\.\."), text).group( + 0 + ) # Start of the sequence ; looks for [numbers].. + end = re.search(("\.\.[\d]+"), text).group( + 0 + ) # End of the sequence ; Looks for ..[numbers] + orf = re.search(("(ORF)[\d]+"), text).group( + 0 + ) # Looks for ORF and the numbers that are after it + if re.search(("(\[1\])"), text): # stores strand strand = "+" - elif re.search(("(\[-1\])"), text): # stores strand + elif re.search(("(\[-1\])"), text): # stores strand strand = "-" start = int(start.split("..")[0]) end = int(end.split("..")[1]) @@ -329,7 +361,7 @@ def spaninProximity(isp, osp, max_dist=30): _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_ Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary ( * 3) - I modified this on 07.30.2020 to bypass the pick + or - strand. 
To + I modified this on 07.30.2020 to bypass the pick + or - strand. To INPUT: list of OSP and ISP candidates OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list """ @@ -358,13 +390,27 @@ def spaninProximity(isp, osp, max_dist=30): elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]: ### OVERLAP / SEPARATE ### if (iseq[1] - oseq[0]) < 6: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] separate[iseq[2]] += [combo] else: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] overlap[iseq[2]] += [combo] elif iseq[1] <= oseq[0] <= iseq[1] + max_dist: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] separate[iseq[2]] += [combo] else: continue @@ -383,13 +429,27 @@ def spaninProximity(isp, osp, max_dist=30): embedded[iseq[2]] += [combo] elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]: if (oseq[1] - iseq[0]) < 6: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] separate[iseq[2]] += [combo] else: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] overlap[iseq[2]] += [combo] elif iseq[0] - 10 < oseq[1] < iseq[0]: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] separate[iseq[2]] += [combo] else: continue @@ -402,7 +462,8 @@ def spaninProximity(isp, osp, max_dist=30): def check_for_usp(): - " pass " + "pass" + ############################################### TEST RANGE ######################################################################### #################################################################################################################################### @@ -454,7 +515,7 @@ def check_for_usp(): pairs = zip(test_desc, test_seq) lipo = [] for each_pair in pairs: - #print(each_pair) + # print(each_pair) # try: try: lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8) diff --git a/cpt_putative_osp/generate-putative-osp.py b/cpt_putative_osp/generate-putative-osp.py index dde2f58..7beac93 100755 --- a/cpt_putative_osp/generate-putative-osp.py +++ b/cpt_putative_osp/generate-putative-osp.py @@ -173,11 +173,7 @@ default="_putative_osp.gff3", help="gff3 output for putative o-spanins", ) - parser.add_argument( - "--osp_mode", - action="store_true", - default=True - ) + parser.add_argument("--osp_mode", action="store_true", default=True) # parser.add_argument('-v', action='version', version='0.3.0') # Is this manually updated? 
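Buried in the spaninProximity hunks above is a small decision table that the reflowed combo lists make hard to see. In miniature, for the plus-strand case (the names, the condensation, and the embedded test as written here are my reading of the partially visible branches; max_dist is in nucleotides, i.e. the user's amino-acid distance times three per the docstring):

def classify_pair(i_start, i_end, o_start, o_end, max_dist=30):
    # o-spanin sits wholly inside the i-spanin: embedded.
    if i_start <= o_start and o_end <= i_end:
        return "embedded"
    # o-spanin starts inside but runs past the i-spanin end: a sub-6 nt
    # overlap still files as "separate", anything larger as "overlap".
    if i_start < o_start <= i_end < o_end:
        return "separate" if (i_end - o_start) < 6 else "overlap"
    # o-spanin begins downstream within max_dist nt of the i-spanin end.
    if i_end <= o_start <= i_end + max_dist:
        return "separate"
    return None  # not a candidate pair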
args = parser.parse_args() @@ -255,7 +251,7 @@ with args.putative_osp_fa as f: for desc, s in candidate_dict.items(): # description / sequence f.write(">" + str(desc)) - f.write("\n" + lineWrapper(str(s).replace("*","")) + "\n") + f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n") length.append(len(s)) ORF.append(desc) if not length: @@ -275,15 +271,15 @@ args.out_osp_prot.close() all_orfs = open(args.out_osp_prot.name, "r") all_osps = open(args.putative_osp_fa.name, "r") - #record = SeqIO.read(all_orfs, "fasta") - #print(len(record)) + # record = SeqIO.read(all_orfs, "fasta") + # print(len(record)) #### Extra stats n = 0 for line in all_orfs: if line.startswith(">"): n += 1 all_orfs_counts = n - + c = 0 for line in all_osps: if line.startswith(">"): @@ -296,9 +292,11 @@ f.write("median length (AA): " + str(med) + "\n") f.write("maximum orf in size (AA): " + str(top_size) + "\n") f.write("minimum orf in size (AA): " + str(bot_size) + "\n") - #f.write(f"ratio of osps found from naive orfs: {c}/{n}") - f.write("ratio of osps found from naive orfs: "+ str(c) + "/" +str(n)) + # f.write(f"ratio of osps found from naive orfs: {c}/{n}") + f.write("ratio of osps found from naive orfs: " + str(c) + "/" + str(n)) # Output the putative list in gff3 format: args.putative_osp_fa = open(args.putative_osp_fa.name, "r") - gff_data = prep_a_gff3(fa=args.putative_osp_fa, spanin_type="osp",org=args.fasta_file) + gff_data = prep_a_gff3( + fa=args.putative_osp_fa, spanin_type="osp", org=args.fasta_file + ) write_gff3(data=gff_data, output=args.putative_osp_gff) diff --git a/cpt_putative_osp/spaninFuncs.py b/cpt_putative_osp/spaninFuncs.py index 35d627a..bbf5870 100755 --- a/cpt_putative_osp/spaninFuncs.py +++ b/cpt_putative_osp/spaninFuncs.py @@ -18,9 +18,9 @@ def check_back_end_snorkels(seq, tmsize): """ - Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. - --> seq : should be the sequence fed from the "search_region" portion of the sequence - --> tmsize : size of the potential TMD being investigated + Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. + --> seq : should be the sequence fed from the "search_region" portion of the sequence + --> tmsize : size of the potential TMD being investigated """ found = [] if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]): @@ -42,10 +42,10 @@ def check_back_end_snorkels(seq, tmsize): def prep_a_gff3(fa, spanin_type, org): """ - Function parses an input detailed 'fa' file and outputs a 'gff3' file - ---> fa = input .fa file - ---> output = output a returned list of data, easily portable to a gff3 next - ---> spanin_type = 'isp' or 'osp' + Function parses an input detailed 'fa' file and outputs a 'gff3' file + ---> fa = input .fa file + ---> output = output a returned list of data, easily portable to a gff3 next + ---> spanin_type = 'isp' or 'osp' """ with org as f: header = f.readline() @@ -76,17 +76,21 @@ def prep_a_gff3(fa, spanin_type, org): source = "cpt.py|putative-*.py" # column 2 score = "." # column 6 phase = "." 
# column 8 - attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1] # column 9 - sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]] + attributes = ( + "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1] + ) # column 9 + sequence = [ + [orgacc, source, methodtype, start, end, score, strand, phase, attributes] + ] data += sequence return data def write_gff3(data, output="results.gff3"): """ - Parses results from prep_a_gff3 into a gff3 file - ---> input : list from prep_a_gff3 - ---> output : gff3 file + Parses results from prep_a_gff3 into a gff3 file + ---> input : list from prep_a_gff3 + ---> output : gff3 file """ data = data filename = output @@ -109,14 +113,23 @@ f.close() -def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206): - """ - Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD - ---> pair : Input of tuple with description and AA sequence (str) - ---> minimum : How close from the initial start codon a TMD can be within - ---> maximum : How far from the initial start codon a TMD can be within - ---> TMDmin : The minimum size that a transmembrane can be (default = 10) - ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20) +def find_tmd( + pair, + minimum=10, + maximum=30, + TMDmin=10, + TMDmax=20, + isp_mode=False, + peri_min=18, + peri_max=206, +): + """ + Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD + ---> pair : Input of tuple with description and AA sequence (str) + ---> minimum : How close from the initial start codon a TMD can be within + ---> maximum : How far from the initial start codon a TMD can be within + ---> TMDmin : The minimum size that a transmembrane can be (default = 10) + ---> TMDmax : The maximum size that a transmembrane can be (default = 20) """ # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S'] tmd = [] @@ -125,55 +138,62 @@ def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, if maximum > len(s): maximum = len(s) search_region = s[minimum - 1 : maximum + 1] - #print(f"this is the search region: {search_region}") + # print(f"this is the search region: {search_region}") # print(search_region) # for trouble shooting - for tmsize in range(TMDmin, TMDmax+1, 1): - #print(f"this is the current tmsize we're trying: {tmsize}") + for tmsize in range(TMDmin, TMDmax + 1, 1): + # print(f"this is the current tmsize we're trying: {tmsize}") # print('==============='+str(tmsize)+'================') # print for troubleshooting - pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}" # searches for these hydrophobic residues tmsize total times - #print(pattern) - #print(f"sending to regex: {search_region}") + pattern = ( + "[PFIWLVMYCATGS]{" + str(tmsize) + "}" + ) # searches for these hydrophobic residues tmsize total times + # print(pattern) + # print(f"sending to regex: {search_region}") if re.search( - ("[K]"), search_region[1:8]): # grabbing one below with search region, so I want to grab one ahead here when I query. 
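One detail worth pulling out of the isp_mode branches being reflowed here: after the hydrophobic run is matched inside the window, the tool re-locates it in the full sequence and counts everything downstream as the periplasmic stretch, keeping the candidate only when that count lands between --peri_min and --peri_max (18 and 206 by default). A sketch of the find_tmd variant of that bookkeeping (helper name mine; re.escape is a no-op on a plain-letter match but is the safer habit):

import re

def peri_count(seq, tmd_match):
    # Re-find the matched run in the full sequence, as the source does,
    # then count the residues remaining after it.
    end_of_tmd = re.search(re.escape(tmd_match), seq).end() + 1
    return len(seq) - end_of_tmd

# keep = peri_min <= peri_count(seq, g) <= peri_max

Note that the two sibling functions disagree slightly: find_tmd computes len(s) - (end + 1) while find_lipobox computes len(s) - end + 1, so their peri counts differ by two for the same match; worth unifying in a follow-up.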
+ store_search = re.search( + ("[K]"), search_region[1:8] + ) # storing regex object where_we_are = store_search.start() # finding where we got the hit if re.search( ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] ) and re.search( ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1] ): # hydrophobic neighbor - #try: - g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group() + # try: + g = re.search( + ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] + ).group() backend = check_back_end_snorkels(search_region, tmsize) if backend == "match": if isp_mode: g = re.search((pattern), search_region).group() - end_of_tmd = re.search((g), s).end()+1 + end_of_tmd = re.search((g), s).end() + 1 amt_peri = len(s) - end_of_tmd if peri_min <= amt_peri <= peri_max: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) tmd.append(new_pair) else: tmd.append(pair) else: continue - #else: - #print("I'm continuing out of snorkel loop") - #print(f"{search_region}") - #continue + # else: + # print("I'm continuing out of snorkel loop") + # print(f"{search_region}") + # continue if re.search((pattern), search_region): - #print(f"found match: {}") - #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") - #try: + # print(f"found match: {}") + # print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") + # try: if isp_mode: g = re.search((pattern), search_region).group() - end_of_tmd = re.search((g), s).end()+1 + end_of_tmd = re.search((g), s).end() + 1 amt_peri = len(s) - end_of_tmd if peri_min <= amt_peri <= peri_max: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) tmd.append(new_pair) else: tmd.append(pair) @@ -183,13 +203,15 @@ def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, return tmd -def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False): +def find_lipobox( + pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False +): """ - Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox - ---> minimum - min distance from start codon to first AA of lipobox - ---> maximum - max distance from start codon to first AA of lipobox - ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy - + Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox + ---> minimum - min distance from start codon to first AA of lipobox + ---> maximum - max distance from start codon to first AA of lipobox + ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy + """ if regex == 1: pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl @@ -199,19 +221,23 @@ def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, rege candidates = [] s = str(pair[1]) # print(s) # trouble shooting - search_region = s[minimum-1 : maximum + 5] # properly slice the input... add 4 to catch if it hangs off at max input + search_region = s[ + minimum - 1 : maximum + 5 + ] # properly slice the input... 
add 4 to catch if it hangs off at max input # print(search_region) # trouble shooting - patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"] + patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"] for pattern in patterns: - #print(pattern) # trouble shooting + # print(pattern) # trouble shooting if re.search((pattern), search_region): # lipobox must be WITHIN the range... # searches the sequence with the input RegEx AND omits if - g = re.search((pattern), search_region).group() # find the exact group match + g = re.search( + (pattern), search_region + ).group() # find the exact group match amt_peri = len(s) - re.search((g), s).end() + 1 - if min_after <= amt_peri <= max_after: # find the lipobox end region + if min_after <= amt_peri <= max_after: # find the lipobox end region if osp_mode: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) candidates.append(new_pair) else: candidates.append(pair) @@ -221,9 +247,9 @@ def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, rege def tuple_fasta(fasta_file): """ - #### INPUT: Fasta File - #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence - #### + #### INPUT: Fasta File + #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence + #### """ fasta = SeqIO.parse(fasta_file, "fasta") descriptions = [] @@ -281,10 +307,10 @@ def splitStrands(text, strand="+"): def parse_a_range(pair, start, end): """ - Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range - ---> data : fasta tuple data - ---> start : start range to keep - ---> end : end range to keep (will need to + 1) + Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range + ---> data : fasta tuple data + ---> start : start range to keep + ---> end : end range to keep (will need to + 1) """ matches = [] for each_pair in pair: @@ -310,12 +336,18 @@ def grabLocs(text): Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module from cpt.py """ - start = re.search(("[\d]+\.\."), text).group(0) # Start of the sequence ; looks for [numbers].. - end = re.search(("\.\.[\d]+"), text).group(0) # End of the sequence ; Looks for ..[numbers] - orf = re.search(("(ORF)[\d]+"), text).group(0) # Looks for ORF and the numbers that are after it - if re.search(("(\[1\])"), text): # stores strand + start = re.search(("[\d]+\.\."), text).group( + 0 + ) # Start of the sequence ; looks for [numbers].. + end = re.search(("\.\.[\d]+"), text).group( + 0 + ) # End of the sequence ; Looks for ..[numbers] + orf = re.search(("(ORF)[\d]+"), text).group( + 0 + ) # Looks for ORF and the numbers that are after it + if re.search(("(\[1\])"), text): # stores strand strand = "+" - elif re.search(("(\[-1\])"), text): # stores strand + elif re.search(("(\[-1\])"), text): # stores strand strand = "-" start = int(start.split("..")[0]) end = int(end.split("..")[1]) @@ -329,7 +361,7 @@ def spaninProximity(isp, osp, max_dist=30): _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_ Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary ( * 3) - I modified this on 07.30.2020 to bypass the pick + or - strand. 
To + I modified this on 07.30.2020 to bypass the pick + or - strand. To INPUT: list of OSP and ISP candidates OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list """ @@ -358,13 +390,27 @@ def spaninProximity(isp, osp, max_dist=30): elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]: ### OVERLAP / SEPARATE ### if (iseq[1] - oseq[0]) < 6: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] separate[iseq[2]] += [combo] else: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] overlap[iseq[2]] += [combo] elif iseq[1] <= oseq[0] <= iseq[1] + max_dist: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] separate[iseq[2]] += [combo] else: continue @@ -383,13 +429,27 @@ def spaninProximity(isp, osp, max_dist=30): embedded[iseq[2]] += [combo] elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]: if (oseq[1] - iseq[0]) < 6: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] separate[iseq[2]] += [combo] else: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] overlap[iseq[2]] += [combo] elif iseq[0] - 10 < oseq[1] < iseq[0]: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] separate[iseq[2]] += [combo] else: continue @@ -402,7 +462,8 @@ def spaninProximity(isp, osp, max_dist=30): def check_for_usp(): - " pass " + "pass" + ############################################### TEST RANGE ######################################################################### #################################################################################################################################### @@ -454,7 +515,7 @@ def check_for_usp(): pairs = zip(test_desc, test_seq) lipo = [] for each_pair in pairs: - #print(each_pair) + # print(each_pair) # try: try: lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8) diff --git a/cpt_putative_usp/generate-putative-usp.py b/cpt_putative_usp/generate-putative-usp.py index d138a34..dbc310d 100755 --- a/cpt_putative_usp/generate-putative-usp.py +++ b/cpt_putative_usp/generate-putative-usp.py @@ -18,7 +18,7 @@ parser = argparse.ArgumentParser( description="Get putative protein candidates for u-spanins" ) - + parser.add_argument( "fasta_file", type=argparse.FileType("r"), help="Fasta file" ) # the "input" argument @@ -119,16 +119,42 @@ help="gff3 output for putative o-spanins", ) - parser.add_argument("--min_size", type=int, default=100, help="minimum size of peptide") - parser.add_argument("--max_size", type=int, default=200, help="maximum size of peptide") - parser.add_argument("--lipo_min_start", type=int, default=10, help="minimum start site of lipobox") - parser.add_argument("--lipo_max_start", type=int, default=30, help="maximum end site of lipobox") - parser.add_argument("--min_lipo_after", type=int, default=60, help="minumum amount of residues after lipobox") - parser.add_argument("--max_lipo_after", type=int, default=160, help="maximum amount of residues after lipobox") - parser.add_argument("--tmd_min_start", type=int, default=75, help="minumum start site of TMD") - 
parser.add_argument("--tmd_max_start", type=int, default=200, help="maximum end site of TMD") - parser.add_argument("--tmd_min_size", type=int, default=15, help="minimum size of TMD") - parser.add_argument("--tmd_max_size", type=int, default=25, help="maximum size of TMD") + parser.add_argument( + "--min_size", type=int, default=100, help="minimum size of peptide" + ) + parser.add_argument( + "--max_size", type=int, default=200, help="maximum size of peptide" + ) + parser.add_argument( + "--lipo_min_start", type=int, default=10, help="minimum start site of lipobox" + ) + parser.add_argument( + "--lipo_max_start", type=int, default=30, help="maximum end site of lipobox" + ) + parser.add_argument( + "--min_lipo_after", + type=int, + default=60, + help="minumum amount of residues after lipobox", + ) + parser.add_argument( + "--max_lipo_after", + type=int, + default=160, + help="maximum amount of residues after lipobox", + ) + parser.add_argument( + "--tmd_min_start", type=int, default=75, help="minumum start site of TMD" + ) + parser.add_argument( + "--tmd_max_start", type=int, default=200, help="maximum end site of TMD" + ) + parser.add_argument( + "--tmd_min_size", type=int, default=15, help="minimum size of TMD" + ) + parser.add_argument( + "--tmd_max_size", type=int, default=25, help="maximum size of TMD" + ) args = parser.parse_args() @@ -147,45 +173,47 @@ args.fasta_file.close() args.fasta_file = open(args.fasta_file.name, "r") args.out_usp_prot.close() - args.out_usp_prot = open(args.out_usp_prot.name,"r") + args.out_usp_prot = open(args.out_usp_prot.name, "r") pairs = tuple_fasta(fasta_file=args.out_usp_prot) have_lipo = [] - + for each_pair in pairs: if len(each_pair[1]) <= args.max_size: try: - have_lipo += find_lipobox(pair=each_pair, - minimum=args.lipo_min_start, - maximum=args.lipo_max_start, - min_after=args.min_lipo_after, - max_after=args.max_lipo_after, - ) + have_lipo += find_lipobox( + pair=each_pair, + minimum=args.lipo_min_start, + maximum=args.lipo_max_start, + min_after=args.min_lipo_after, + max_after=args.max_lipo_after, + ) except (IndexError, TypeError): continue - - #print(len(have_lipo)) - #print(have_lipo) + + # print(len(have_lipo)) + # print(have_lipo) have_tmd_and_lipo = [] - #print(args.tmd_min_start) - #print(args.tmd_max_start) - #print(args.tmd_min_size) - #print(args.tmd_max_size) + # print(args.tmd_min_start) + # print(args.tmd_max_start) + # print(args.tmd_min_size) + # print(args.tmd_max_size) for each_pair in have_lipo: try: - have_tmd_and_lipo += find_tmd(pair=each_pair, - minimum=args.tmd_min_start, - maximum=args.tmd_max_start, - TMDmin=args.tmd_min_size, - TMDmax=args.tmd_max_size, - ) + have_tmd_and_lipo += find_tmd( + pair=each_pair, + minimum=args.tmd_min_start, + maximum=args.tmd_max_start, + TMDmin=args.tmd_min_size, + TMDmax=args.tmd_max_size, + ) except (IndexError, TypeError): continue - - #print(len(have_tmd_and_lipo)) - #print(have_tmd_and_lipo) + + # print(len(have_tmd_and_lipo)) + # print(have_tmd_and_lipo) if args.switch == "all": pass @@ -195,30 +223,30 @@ start = int(range_of.split(":")[0]) end = int(range_of.split(":")[1]) have_lipo = parse_a_range(pair=have_tmd_and_lipo, start=start, end=end) - + total_have_tmd_and_lipo = len(have_tmd_and_lipo) ORF = [] length = [] - candidate_dict = {k:v for k, v in have_tmd_and_lipo} + candidate_dict = {k: v for k, v in have_tmd_and_lipo} with args.putative_usp_fa as f: for desc, s in candidate_dict.items(): f.write(">" + str(desc)) - f.write("\n" + lineWrapper(str(s).replace("*",""))+"\n") + 
f.write("\n" + lineWrapper(str(s).replace("*", "")) + "\n") length.append(len(s)) ORF.append(desc) #### Extra statistics args.out_usp_prot.close() all_orfs = open(args.out_usp_prot.name, "r") all_isps = open(args.putative_usp_fa.name, "r") - #record = SeqIO.read(all_orfs, "fasta") - #print(len(record)) + # record = SeqIO.read(all_orfs, "fasta") + # print(len(record)) n = 0 for line in all_orfs: if line.startswith(">"): n += 1 all_orfs_counts = n - + c = 0 for line in all_isps: if line.startswith(">"): @@ -231,22 +259,24 @@ avg = (sum(length)) / total_have_tmd_and_lipo n = len(length) if n == 0: - raise Exception("no median for empty data") + raise Exception("no median for empty data") if n % 2 == 1: - med = length[n // 2] + med = length[n // 2] else: - i = n // 2 - med = (length[i - 1] + length[i]) / 2 + i = n // 2 + med = (length[i - 1] + length[i]) / 2 with args.summary_usp_txt as f: - f.write("total potential u-spanins: " +str(total_have_tmd_and_lipo) + "\n") + f.write("total potential u-spanins: " + str(total_have_tmd_and_lipo) + "\n") f.write("average length (AA): " + str(avg) + "\n") f.write("median length (AA): " + str(med) + "\n") f.write("maximum orf in size (AA): " + str(top_size) + "\n") f.write("minimum orf in size (AA): " + str(bot_size) + "\n") - f.write("ratio of isps found from naive orfs: " + str(c) + "/" +str(n)) + f.write("ratio of isps found from naive orfs: " + str(c) + "/" + str(n)) args.putative_usp_fa = open(args.putative_usp_fa.name, "r") - gff_data = prep_a_gff3(fa=args.putative_usp_fa, spanin_type="usp", org=args.fasta_file) + gff_data = prep_a_gff3( + fa=args.putative_usp_fa, spanin_type="usp", org=args.fasta_file + ) write_gff3(data=gff_data, output=args.putative_usp_gff) else: with args.summary_usp_txt as f: @@ -254,7 +284,7 @@ if have_lipo: f.write("\nLipoboxes were found here:\n") for each_lipo in have_lipo: - f.write('>'+str(each_lipo[0])) - f.write("\n" + lineWrapper(each_lipo[1].replace("*",""))+"\n") + f.write(">" + str(each_lipo[0])) + f.write("\n" + lineWrapper(each_lipo[1].replace("*", "")) + "\n") else: f.write("\nNo Lipobox(es) were found within search restraints") diff --git a/cpt_putative_usp/spaninFuncs.py b/cpt_putative_usp/spaninFuncs.py index 35d627a..bbf5870 100755 --- a/cpt_putative_usp/spaninFuncs.py +++ b/cpt_putative_usp/spaninFuncs.py @@ -18,9 +18,9 @@ def check_back_end_snorkels(seq, tmsize): """ - Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. - --> seq : should be the sequence fed from the "search_region" portion of the sequence - --> tmsize : size of the potential TMD being investigated + Searches through the backend of a potential TMD snorkel. This is the 2nd part of a TMD snorkel lysine match. 
+ --> seq : should be the sequence fed from the "search_region" portion of the sequence + --> tmsize : size of the potential TMD being investigated """ found = [] if seq[tmsize - 4] == Lys and re.search(("[FIWLVMYCATGS]"), seq[tmsize - 5]): @@ -42,10 +42,10 @@ def check_back_end_snorkels(seq, tmsize): def prep_a_gff3(fa, spanin_type, org): """ - Function parses an input detailed 'fa' file and outputs a 'gff3' file - ---> fa = input .fa file - ---> output = output a returned list of data, easily portable to a gff3 next - ---> spanin_type = 'isp' or 'osp' + Function parses an input detailed 'fa' file and outputs a 'gff3' file + ---> fa = input .fa file + ---> output = output a returned list of data, easily portable to a gff3 next + ---> spanin_type = 'isp' or 'osp' """ with org as f: header = f.readline() @@ -76,17 +76,21 @@ def prep_a_gff3(fa, spanin_type, org): source = "cpt.py|putative-*.py" # column 2 score = "." # column 6 phase = "." # column 8 - attributes = "ID=" +orgacc+ "|"+ orfid + ";ALIAS=" + spanin + ";SEQ="+a_pair[1] # column 9 - sequence = [[orgacc, source, methodtype, start, end, score, strand, phase, attributes]] + attributes = ( + "ID=" + orgacc + "|" + orfid + ";ALIAS=" + spanin + ";SEQ=" + a_pair[1] + ) # column 9 + sequence = [ + [orgacc, source, methodtype, start, end, score, strand, phase, attributes] + ] data += sequence return data def write_gff3(data, output="results.gff3"): """ - Parses results from prep_a_gff3 into a gff3 file - ---> input : list from prep_a_gff3 - ---> output : gff3 file + Parses results from prep_a_gff3 into a gff3 file + ---> input : list from prep_a_gff3 + ---> output : gff3 file """ data = data filename = output @@ -109,14 +113,23 @@ def write_gff3(data, output="results.gff3"): f.close() -def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, peri_min=18, peri_max=206): - """ - Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD - ---> pair : Input of tuple with description and AA sequence (str) - ---> minimum : How close from the initial start codon a TMD can be within - ---> maximum : How far from the initial start codon a TMD can be within - ---> TMDmin : The minimum size that a transmembrane can be (default = 10) - ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20) +def find_tmd( + pair, + minimum=10, + maximum=30, + TMDmin=10, + TMDmax=20, + isp_mode=False, + peri_min=18, + peri_max=206, +): + """ + Function that searches for lysine snorkels and then for a spanning hydrophobic region that indicates a potential TMD + ---> pair : Input of tuple with description and AA sequence (str) + ---> minimum : How close from the initial start codon a TMD can be within + ---> maximum : How far from the initial start codon a TMD can be within + ---> TMDmin : The minimum size that a transmembrane can be (default = 10) + ---> TMDmax : The maximum size tha ta transmembrane can be (default = 20) """ # hydrophobicAAs = ['P', 'F', 'I', 'W', 'L', 'V', 'M', 'Y', 'C', 'A', 'T', 'G', 'S'] tmd = [] @@ -125,55 +138,62 @@ def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, if maximum > len(s): maximum = len(s) search_region = s[minimum - 1 : maximum + 1] - #print(f"this is the search region: {search_region}") + # print(f"this is the search region: {search_region}") # print(search_region) # for trouble shooting - for tmsize in range(TMDmin, TMDmax+1, 1): - #print(f"this is the current tmsize we're trying: {tmsize}") + for 
tmsize in range(TMDmin, TMDmax + 1, 1): + # print(f"this is the current tmsize we're trying: {tmsize}") # print('==============='+str(tmsize)+'================') # print for troubleshooting - pattern = "[PFIWLVMYCATGS]{"+str(tmsize)+"}" # searches for these hydrophobic residues tmsize total times - #print(pattern) - #print(f"sending to regex: {search_region}") + pattern = ( + "[PFIWLVMYCATGS]{" + str(tmsize) + "}" + ) # searches for these hydrophobic residues tmsize total times + # print(pattern) + # print(f"sending to regex: {search_region}") if re.search( - ("[K]"), search_region[1:8]): # grabbing one below with search region, so I want to grab one ahead here when I query. - store_search = re.search(("[K]"), search_region[1:8]) # storing regex object + ("[K]"), search_region[1:8] + ): # grabbing one below with search region, so I want to grab one ahead here when I query. + store_search = re.search( + ("[K]"), search_region[1:8] + ) # storing regex object where_we_are = store_search.start() # finding where we got the hit if re.search( ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] ) and re.search( ("[PFIWLVMYCATGS]"), search_region[where_we_are - 1] ): # hydrophobic neighbor - #try: - g = re.search(("[PFIWLVMYCATGS]"), search_region[where_we_are + 1]).group() + # try: + g = re.search( + ("[PFIWLVMYCATGS]"), search_region[where_we_are + 1] + ).group() backend = check_back_end_snorkels(search_region, tmsize) if backend == "match": if isp_mode: g = re.search((pattern), search_region).group() - end_of_tmd = re.search((g), s).end()+1 + end_of_tmd = re.search((g), s).end() + 1 amt_peri = len(s) - end_of_tmd if peri_min <= amt_peri <= peri_max: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) tmd.append(new_pair) else: tmd.append(pair) else: continue - #else: - #print("I'm continuing out of snorkel loop") - #print(f"{search_region}") - #continue + # else: + # print("I'm continuing out of snorkel loop") + # print(f"{search_region}") + # continue if re.search((pattern), search_region): - #print(f"found match: {}") - #print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") - #try: + # print(f"found match: {}") + # print("I AM HEREEEEEEEEEEEEEEEEEEEEEEE") + # try: if isp_mode: g = re.search((pattern), search_region).group() - end_of_tmd = re.search((g), s).end()+1 + end_of_tmd = re.search((g), s).end() + 1 amt_peri = len(s) - end_of_tmd if peri_min <= amt_peri <= peri_max: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) tmd.append(new_pair) else: tmd.append(pair) @@ -183,13 +203,15 @@ def find_tmd(pair, minimum=10, maximum=30, TMDmin=10, TMDmax=20, isp_mode=False, return tmd -def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False): +def find_lipobox( + pair, minimum=10, maximum=50, min_after=30, max_after=185, regex=1, osp_mode=False +): """ - Function that takes an input tuple, and will return pairs of sequences to their description that have a lipoobox - ---> minimum - min distance from start codon to first AA of lipobox - ---> maximum - max distance from start codon to first AA of lipobox - ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy - + Function that takes an input tuple, and will return pairs of sequences to their description 
that have a lipoobox + ---> minimum - min distance from start codon to first AA of lipobox + ---> maximum - max distance from start codon to first AA of lipobox + ---> regex - option 1 (default) => more strict regular expression ; option 2 => looser selection, imported from LipoRy + """ if regex == 1: pattern = "[ILMFTV][^REKD][GAS]C" # regex for Lipobox from findSpanin.pl @@ -199,19 +221,23 @@ def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, rege candidates = [] s = str(pair[1]) # print(s) # trouble shooting - search_region = s[minimum-1 : maximum + 5] # properly slice the input... add 4 to catch if it hangs off at max input + search_region = s[ + minimum - 1 : maximum + 5 + ] # properly slice the input... add 4 to catch if it hangs off at max input # print(search_region) # trouble shooting - patterns = ["[ILMFTV][^REKD][GAS]C","AW[AGS]C"] + patterns = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"] for pattern in patterns: - #print(pattern) # trouble shooting + # print(pattern) # trouble shooting if re.search((pattern), search_region): # lipobox must be WITHIN the range... # searches the sequence with the input RegEx AND omits if - g = re.search((pattern), search_region).group() # find the exact group match + g = re.search( + (pattern), search_region + ).group() # find the exact group match amt_peri = len(s) - re.search((g), s).end() + 1 - if min_after <= amt_peri <= max_after: # find the lipobox end region + if min_after <= amt_peri <= max_after: # find the lipobox end region if osp_mode: - pair_desc = pair[0] + ", peri_count~="+str(amt_peri) - new_pair = (pair_desc,pair[1]) + pair_desc = pair[0] + ", peri_count~=" + str(amt_peri) + new_pair = (pair_desc, pair[1]) candidates.append(new_pair) else: candidates.append(pair) @@ -221,9 +247,9 @@ def find_lipobox(pair, minimum=10, maximum=50, min_after=30, max_after=185, rege def tuple_fasta(fasta_file): """ - #### INPUT: Fasta File - #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence - #### + #### INPUT: Fasta File + #### OUTPUT: zipped (zip) : pairwise relationship of description to sequence + #### """ fasta = SeqIO.parse(fasta_file, "fasta") descriptions = [] @@ -281,10 +307,10 @@ def splitStrands(text, strand="+"): def parse_a_range(pair, start, end): """ - Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range - ---> data : fasta tuple data - ---> start : start range to keep - ---> end : end range to keep (will need to + 1) + Takes an input data tuple from a fasta tuple pair and keeps only those within the input sequence range + ---> data : fasta tuple data + ---> start : start range to keep + ---> end : end range to keep (will need to + 1) """ matches = [] for each_pair in pair: @@ -310,12 +336,18 @@ def grabLocs(text): Grabs the locations of the spanin based on NT location (seen from ORF). Grabs the ORF name, as per named from the ORF class/module from cpt.py """ - start = re.search(("[\d]+\.\."), text).group(0) # Start of the sequence ; looks for [numbers].. - end = re.search(("\.\.[\d]+"), text).group(0) # End of the sequence ; Looks for ..[numbers] - orf = re.search(("(ORF)[\d]+"), text).group(0) # Looks for ORF and the numbers that are after it - if re.search(("(\[1\])"), text): # stores strand + start = re.search(("[\d]+\.\."), text).group( + 0 + ) # Start of the sequence ; looks for [numbers].. 
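Stepping back to the lipobox scan above: its core is the two regexes from the diff (the strict findSpanin.pl pattern plus the looser AW[AGS]C form) and the amt_peri arithmetic; a minimal sketch, with the function name invented:

```python
import re

# Sketch of find_lipobox's core: search a window of the sequence for either
# lipobox regex and report the periplasmic residue count after the match.
PATTERNS = ["[ILMFTV][^REKD][GAS]C", "AW[AGS]C"]

def lipobox_peri_count(seq, minimum=10, maximum=50):
    search_region = seq[minimum - 1 : maximum + 5]
    for pattern in PATTERNS:
        hit = re.search(pattern, search_region)
        if hit:
            g = hit.group()
            # same arithmetic as the hunk above: residues after the lipobox end
            return len(seq) - re.search(g, seq).end() + 1
    return None
```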
+ end = re.search(("\.\.[\d]+"), text).group( + 0 + ) # End of the sequence ; Looks for ..[numbers] + orf = re.search(("(ORF)[\d]+"), text).group( + 0 + ) # Looks for ORF and the numbers that are after it + if re.search(("(\[1\])"), text): # stores strand strand = "+" - elif re.search(("(\[-1\])"), text): # stores strand + elif re.search(("(\[-1\])"), text): # stores strand strand = "-" start = int(start.split("..")[0]) end = int(end.split("..")[1]) @@ -329,7 +361,7 @@ def spaninProximity(isp, osp, max_dist=30): _NOTE THIS FUNCTION COULD BE MODIFIED TO RETURN SEQUENCES_ Compares the locations of i-spanins and o-spanins. max_dist is the distance in NT measurement from i-spanin END site to o-spanin START. The user will be inputting AA distance, so a conversion will be necessary ( * 3) - I modified this on 07.30.2020 to bypass the pick + or - strand. To + I modified this on 07.30.2020 to bypass the pick + or - strand. To INPUT: list of OSP and ISP candidates OUTPUT: Return (improved) candidates for overlapping, embedded, and separate list """ @@ -358,13 +390,27 @@ def spaninProximity(isp, osp, max_dist=30): elif iseq[0] < oseq[0] <= iseq[1] and oseq[1] > iseq[1]: ### OVERLAP / SEPARATE ### if (iseq[1] - oseq[0]) < 6: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] separate[iseq[2]] += [combo] else: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] overlap[iseq[2]] += [combo] elif iseq[1] <= oseq[0] <= iseq[1] + max_dist: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] separate[iseq[2]] += [combo] else: continue @@ -383,13 +429,27 @@ def spaninProximity(isp, osp, max_dist=30): embedded[iseq[2]] += [combo] elif iseq[0] <= oseq[1] <= iseq[1] and oseq[0] < iseq[0]: if (oseq[1] - iseq[0]) < 6: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] separate[iseq[2]] += [combo] else: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [ + iseq[0], + iseq[1], + oseq[2], + oseq[0], + oseq[1], + iseq[3], + ] overlap[iseq[2]] += [combo] elif iseq[0] - 10 < oseq[1] < iseq[0]: - combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1],iseq[3]] + combo = [iseq[0], iseq[1], oseq[2], oseq[0], oseq[1], iseq[3]] separate[iseq[2]] += [combo] else: continue @@ -402,7 +462,8 @@ def spaninProximity(isp, osp, max_dist=30): def check_for_usp(): - " pass " + "pass" + ############################################### TEST RANGE ######################################################################### #################################################################################################################################### @@ -454,7 +515,7 @@ def check_for_usp(): pairs = zip(test_desc, test_seq) lipo = [] for each_pair in pairs: - #print(each_pair) + # print(each_pair) # try: try: lipo += find_lipobox(pair=each_pair, regex=2) # , minimum=8) diff --git a/cpt_related_genome_nuc/relatedness.py b/cpt_related_genome_nuc/relatedness.py index 77d2349..3c8a66b 100755 --- a/cpt_related_genome_nuc/relatedness.py +++ b/cpt_related_genome_nuc/relatedness.py @@ -10,61 +10,63 @@ log = logging.getLogger() -def parse_blast(blast, isXML = False): +def parse_blast(blast, isXML=False): res = [] finalRes = [] if isXML: - for iter_num, blast_record in 
enumerate(NCBIXML.parse(blast), 1): - for alignment in blast_record.alignments: - tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:] - tempID = tempID[:tempID.find("|")] - tempDesc = alignment.title - while tempDesc.find("|") >= 0: - tempDesc = tempDesc[tempDesc.find("|") + 1:] - tempDesc = tempDesc.strip() - tempID = tempID.strip() - for hsp in alignment.hsps: - line = [str(blast_record.query)] - line.append(str(hsp.align_length)) - line.append(str(hsp.identities)) - line.append(str(blast_record.query_length)) - line.append(str(alignment.length)) - line.append(tempDesc) - line.append(tempID) - #line.append("0000000") - #print(line) - res.append(line) - blast.seek(0) - resInd = -1 - taxLine = blast.readline() - while taxLine: - if "" in taxLine: - resInd += 1 - taxSlice = "" - elif "" in taxLine: - taxSlice = taxLine[taxLine.find("") + 7:taxLine.find("")] - finalRes.append(res[resInd]) - finalRes[-1].append(taxSlice) - #print(finalRes[-1]) + for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1): + for alignment in blast_record.alignments: + tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3 :] + tempID = tempID[: tempID.find("|")] + tempDesc = alignment.title + while tempDesc.find("|") >= 0: + tempDesc = tempDesc[tempDesc.find("|") + 1 :] + tempDesc = tempDesc.strip() + tempID = tempID.strip() + for hsp in alignment.hsps: + line = [str(blast_record.query)] + line.append(str(hsp.align_length)) + line.append(str(hsp.identities)) + line.append(str(blast_record.query_length)) + line.append(str(alignment.length)) + line.append(tempDesc) + line.append(tempID) + # line.append("0000000") + # print(line) + res.append(line) + blast.seek(0) + resInd = -1 taxLine = blast.readline() - return finalRes + while taxLine: + if "" in taxLine: + resInd += 1 + taxSlice = "" + elif "" in taxLine: + taxSlice = taxLine[ + taxLine.find("") + 7 : taxLine.find("") + ] + finalRes.append(res[resInd]) + finalRes[-1].append(taxSlice) + # print(finalRes[-1]) + taxLine = blast.readline() + return finalRes else: - for line in blast: - taxSplit = [] - preTaxSplit = line.strip("\n").split("\t") - for tax in preTaxSplit[-1].split(";"): - shallowCopy = [] - for x in range(len(preTaxSplit)): - shallowCopy.append(preTaxSplit[x]) - shallowCopy[-1] = tax - res.append(shallowCopy) - for line in res: - for access in line[6].split(";"): - shallowCopy = [] - for x in range(len(line)): - shallowCopy.append(line[x]) - shallowCopy[6] = access - finalRes.append(shallowCopy) + for line in blast: + taxSplit = [] + preTaxSplit = line.strip("\n").split("\t") + for tax in preTaxSplit[-1].split(";"): + shallowCopy = [] + for x in range(len(preTaxSplit)): + shallowCopy.append(preTaxSplit[x]) + shallowCopy[-1] = tax + res.append(shallowCopy) + for line in res: + for access in line[6].split(";"): + shallowCopy = [] + for x in range(len(line)): + shallowCopy.append(line[x]) + shallowCopy[6] = access + finalRes.append(shallowCopy) # for x in finalRes: # print(x) # exit() @@ -303,18 +305,18 @@ def scoreMap(blast): data = [] # Reformatting to list rather than generator data = parse_blast(args.blast, args.xmlMode) - nameRec = data[0][0] + nameRec = data[0][0] data = make_num(data) data = add_dice(data) data = bundle_dice(data) - + # data = filter_dice(data, threshold=0.0) # data = important_only(data, splitId) # data = expand_taxIDs(data) # data = deform_scores(data) if not args.noFilter: - data = filter_phage(data, phageTaxLookup) + data = filter_phage(data, phageTaxLookup) # data = expand_titles(data) if args.protein or 
args.canonical: @@ -343,7 +345,7 @@ def scoreMap(blast): "%s\t%s\t%s\t%s\t%s\t%s\t%.4f\n" % (out[7], out[5], out[6], out[4], out[9], out[2], out[8]) ) - + else: sys.stdout.write( "Top %d matches for BLASTn results of %s\t\t\t\t\t\n" @@ -361,4 +363,3 @@ def scoreMap(blast): "%s\t%s\t%s\t%s\t%s\t%.4f\n" % (out[7], out[5], out[4], out[9], out[1], out[8]) ) - diff --git a/cpt_related_genome_prot/relatedness_prot.py b/cpt_related_genome_prot/relatedness_prot.py index 00506d6..6eee8ef 100755 --- a/cpt_related_genome_prot/relatedness_prot.py +++ b/cpt_related_genome_prot/relatedness_prot.py @@ -8,43 +8,49 @@ logging.basicConfig(level=logging.DEBUG) log = logging.getLogger() -def parse_blast(blast, isXML = False): + +def parse_blast(blast, isXML=False): res = [] finalRes = [] if isXML: - for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1): - for alignment in blast_record.alignments: - tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:] - tempID = tempID[:tempID.find("|")] - tempDesc = alignment.title - while tempDesc.find("|") >= 0: - tempDesc = tempDesc[tempDesc.find("|") + 1:] - tempDesc = tempDesc.strip() - tempID = tempID.strip() - #for hsp in alignment.hsps: - line = [str(blast_record.query)[:str(blast_record.query).find("[")].strip()] - line.append(alignment.hit_id) - line.append(tempDesc) - line.append(alignment.accession) - res.append(line) - blast.seek(0) - resInd = -1 - taxLine = blast.readline() - while taxLine: - if "" in taxLine: - resInd += 1 - taxSlice = "" - elif "" in taxLine: - taxSlice = taxLine[taxLine.find("") + 7:taxLine.find("")] - finalRes.append(res[resInd]) - finalRes[-1].append(taxSlice) + for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1): + for alignment in blast_record.alignments: + tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3 :] + tempID = tempID[: tempID.find("|")] + tempDesc = alignment.title + while tempDesc.find("|") >= 0: + tempDesc = tempDesc[tempDesc.find("|") + 1 :] + tempDesc = tempDesc.strip() + tempID = tempID.strip() + # for hsp in alignment.hsps: + line = [ + str(blast_record.query)[: str(blast_record.query).find("[")].strip() + ] + line.append(alignment.hit_id) + line.append(tempDesc) + line.append(alignment.accession) + res.append(line) + blast.seek(0) + resInd = -1 taxLine = blast.readline() - return finalRes + while taxLine: + if "" in taxLine: + resInd += 1 + taxSlice = "" + elif "" in taxLine: + taxSlice = taxLine[ + taxLine.find("") + 7 : taxLine.find("") + ] + finalRes.append(res[resInd]) + finalRes[-1].append(taxSlice) + taxLine = blast.readline() + return finalRes else: - for line in blast: - finalRes.append(line.strip("\n").split("\t")) + for line in blast: + finalRes.append(line.strip("\n").split("\t")) return finalRes + def with_dice(blast): for data in blast: dice = 2 * int(data[14]) / (float(data[22]) + float(data[23])) @@ -108,18 +114,19 @@ def expand_fields(blast): for x in range(0, len(data[4])): yield [data[0], data[1], data[2][x], data[3], int(data[4][x])] + def expand_taxIDs(blast, taxFilter): for data in blast: # if(len(data[4]) > 0): # print(data[0]) for ID in data[4]: if ID != "N/A": - filterOut = False - for tax in taxFilter: - if str(ID).strip() == tax: - filterOut = True - if not filterOut: - yield [data[0], data[1], data[2], data[3], int(ID)] + filterOut = False + for tax in taxFilter: + if str(ID).strip() == tax: + filterOut = True + if not filterOut: + yield [data[0], data[1], data[2], data[3], int(ID)] def expand_titles(blast): @@ -150,6 +157,7 @@ def remove_dupes(data): # 
Pretty simple yield row + def scoreMap(blast): c = {} m = {} @@ -172,10 +180,10 @@ def scoreMap(blast): parser.add_argument("--protein", action="store_true") parser.add_argument("--canonical", action="store_true") parser.add_argument("--noFilter", action="store_true") - #parser.add_argument("--title", action="store_true") # Add when ready to update XML after semester + # parser.add_argument("--title", action="store_true") # Add when ready to update XML after semester parser.add_argument("--hits", type=int, default=5) - parser.add_argument("--xmlMode", action="store_true") - parser.add_argument("--taxFilter", type=str) + parser.add_argument("--xmlMode", action="store_true") + parser.add_argument("--taxFilter", type=str) args = parser.parse_args() @@ -183,18 +191,18 @@ def scoreMap(blast): phageTaxLookup = [] sciName = [] line = phageDb.readline() - + taxList = [] - if args.taxFilter and args.taxFilter != "" : - args.taxFilter = args.taxFilter.split(" ") - for ind in args.taxFilter: - taxList.append(ind.strip()) + if args.taxFilter and args.taxFilter != "": + args.taxFilter = args.taxFilter.split(" ") + for ind in args.taxFilter: + taxList.append(ind.strip()) while line: line = line.split("\t") phageTaxLookup.append(int(line[0])) line[1] = line[1].strip() - if (line[1] == ""): + if line[1] == "": line[1] = "Novel Genome" sciName.append(line[1]) line = phageDb.readline() @@ -213,7 +221,7 @@ def scoreMap(blast): # data = with_dice(data) # data = filter_dice(data, threshold=0.0) data = important_only(data, splitId) - + data = expand_taxIDs(data, taxList) data = remove_dupes(data) if not args.noFilter: @@ -221,19 +229,16 @@ def scoreMap(blast): listify = [] for x in data: listify.append(x) - #listify = greatest_taxID(listify) - + # listify = greatest_taxID(listify) + count_label = "Similar Unique Proteins" - + counts, accessions = scoreMap(listify) - + nameRec = listify[0][0] - sys.stdout.write( - "Top %d matches for BLASTp results of %s\n" - % (args.hits, nameRec) - ) + sys.stdout.write("Top %d matches for BLASTp results of %s\n" % (args.hits, nameRec)) header = "# TaxID\t" - #if args.title: + # if args.title: header += "Name\t" if args.access: header += "Accessions\t" @@ -241,14 +246,14 @@ def scoreMap(blast): sys.stdout.write(header) for idx, ((name, ID), num) in enumerate( - sorted(counts.items(), key=lambda item: -item[1]) - ): + sorted(counts.items(), key=lambda item: -item[1]) + ): if idx > args.hits - 1: break line = str(ID) + "\t" - #if args.title: + # if args.title: line += str(name) + "\t" if args.access: - line += str(accessions[(name, ID)][0]) + "\t" - line += str(num) + "\n" + line += str(accessions[(name, ID)][0]) + "\t" + line += str(num) + "\n" sys.stdout.write(line) diff --git a/cpt_renumber_gbk/BIO_FIX_TOPO.py b/cpt_renumber_gbk/BIO_FIX_TOPO.py index 846f9cc..4111d5a 100755 --- a/cpt_renumber_gbk/BIO_FIX_TOPO.py +++ b/cpt_renumber_gbk/BIO_FIX_TOPO.py @@ -2,10 +2,9 @@ def record_end(self, content): - """Clean up when we've finished the record. - """ - #from Bio import Alphabet - #from Bio.Alphabet import IUPAC + """Clean up when we've finished the record.""" + # from Bio import Alphabet + # from Bio.Alphabet import IUPAC from Bio.Seq import Seq, UnknownSeq # Try and append the version number to the accession for the full id @@ -24,8 +23,8 @@ def record_end(self, content): # first, determine the alphabet # we default to an generic alphabet if we don't have a # seq type or have strange sequence information. 
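The top-hits report assembled in the relatedness_prot.py hunk above follows a count-then-sort shape; a standalone sketch, with invented names and counts:

```python
# Sketch of the top-N output above: tally hits per (name, taxID) pair, then
# emit the most frequent first, capped at args.hits entries. Data is invented.
counts = {("Escherichia phage T7", 10760): 42, ("Novel Genome", 12345): 7}
hits = 5  # stand-in for args.hits

for idx, ((name, tax_id), num) in enumerate(
    sorted(counts.items(), key=lambda item: -item[1])
):
    if idx > hits - 1:
        break
    print(str(tax_id) + "\t" + str(name) + "\t" + str(num))
```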
- - #seq_alphabet = Alphabet.generic_alphabet + + # seq_alphabet = Alphabet.generic_alphabet # now set the sequence sequence = "".join(self._seq_data) @@ -77,9 +76,9 @@ def record_end(self, content): self.data.annotations["topology"] = "linear" """ if not sequence and self.__expected_size: - self.data.seq = UnknownSeq(self._expected_size)#, seq_alphabet) + self.data.seq = UnknownSeq(self._expected_size) # , seq_alphabet) else: - self.data.seq = Seq(sequence)#, seq_alphabet) + self.data.seq = Seq(sequence) # , seq_alphabet) Bio.GenBank._FeatureConsumer.record_end = record_end diff --git a/cpt_renumber_gbk/renumber.py b/cpt_renumber_gbk/renumber.py index 2fc5271..07a78a4 100755 --- a/cpt_renumber_gbk/renumber.py +++ b/cpt_renumber_gbk/renumber.py @@ -93,9 +93,9 @@ def renumber_genes( ) oldNames = [] for x in f_gene: - if tag_to_update in x.qualifiers.keys(): - oldNames.append(x.qualifiers[tag_to_update]) - + if tag_to_update in x.qualifiers.keys(): + oldNames.append(x.qualifiers[tag_to_update]) + f_rbs = sorted( [f for f in record.features if f.type == "RBS"], key=lambda x: x.location.start, @@ -127,40 +127,49 @@ def renumber_genes( if is_within(rbs, gene) and ( rbs.location.start == geneComp or rbs.location.end == geneComp ): - if (tag_to_update not in rbs.qualifiers.keys()): - tag.append(rbs) - f_processed.append(rbs) - break - elif (tag_to_update not in gene.qualifiers.keys()): # This will gurantee qual is in gene and RBS for next check - tag.append(rbs) - f_processed.append(rbs) - break - elif (not forceTagMatch) or (rbs.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]): - tag.append(rbs) - f_processed.append(rbs) - break - + if tag_to_update not in rbs.qualifiers.keys(): + tag.append(rbs) + f_processed.append(rbs) + break + elif ( + tag_to_update not in gene.qualifiers.keys() + ): # This will guarantee qual is in gene and RBS for next check + tag.append(rbs) + f_processed.append(rbs) + break + elif (not forceTagMatch) or ( + rbs.qualifiers[tag_to_update] + == gene.qualifiers[tag_to_update] + ): + tag.append(rbs) + f_processed.append(rbs) + break + # find all other non-RBS features for feature in [f for f in f_sorted if f not in f_processed]: # If the feature is within the gene boundaries (genes are the first entry in tag list), # add it to the same locus tag group, does not process RBS if is_within(feature, gene): if tag_to_update not in feature.qualifiers.keys(): - # catches genes and CDS feature that are intron-contained.
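The grouping logic here hinges on a containment test; stripped of Biopython location objects, it amounts to the following simplified sketch (plain tuples stand in for FeatureLocation, and the strand handling of the repo's is_within is omitted):

```python
# Simplified containment test behind the locus-tag grouping above: a feature
# joins a gene's group when its span lies inside the gene's span.
def span_within(feature_span, gene_span):
    f_start, f_end = feature_span
    g_start, g_end = gene_span
    return g_start <= f_start and g_end >= f_end

print(span_within((120, 480), (100, 500)))  # True: feature inside gene
print(span_within((90, 480), (100, 500)))   # False: feature starts upstream
```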
+ if feature.type == "CDS": + if ( + feature.location.start == gene.location.start + or feature.location.end == gene.location.end + ): + + tag.append(feature) + f_processed.append(feature) + else: tag.append(feature) f_processed.append(feature) - else: + elif (not forceTagMatch) or ( + tag_to_update in gene.qualifiers.keys() + and feature.qualifiers[tag_to_update] + == gene.qualifiers[tag_to_update] + ): tag.append(feature) f_processed.append(feature) - elif (not forceTagMatch) or (tag_to_update in gene.qualifiers.keys() and feature.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]): - tag.append(feature) - f_processed.append(feature) elif feature.location.start > gene.location.end: # because the features are sorted by coordinates, # no features further down on the list will be in this gene @@ -175,27 +184,30 @@ def renumber_genes( for rbs in [f for f in f_rbs if f not in f_processed]: dupeRBS = False for x in f_processed: - if x.type == "RBS" and (tag_to_update in rbs.qualifiers.keys() and tag_to_update in x.qualifiers.keys() and rbs.qualifiers[tag_to_update] == x.qualifiers[tag_to_update]): - dupeRBS = True + if x.type == "RBS" and ( + tag_to_update in rbs.qualifiers.keys() + and tag_to_update in x.qualifiers.keys() + and rbs.qualifiers[tag_to_update] == x.qualifiers[tag_to_update] + ): + dupeRBS = True if dupeRBS: - change_table.write( - record.id - + "\t" - + rbs.type - + ":" - + (rbs.qualifiers[tag_to_update][0]) - + "\t[Removed: Parent gene already had an RBS]\n" - ) + change_table.write( + record.id + + "\t" + + rbs.type + + ":" + + (rbs.qualifiers[tag_to_update][0]) + + "\t[Removed: Parent gene already had an RBS]\n" + ) else: - change_table.write( - record.id - + "\t" - + rbs.type - + ":" - + (rbs.qualifiers[tag_to_update][0]) - + "\t[Removed: RBS did not both fall within boundary of gene and share a boundary with a gene]\n" - ) - + change_table.write( + record.id + + "\t" + + rbs.type + + ":" + + (rbs.qualifiers[tag_to_update][0]) + + "\t[Removed: RBS did not both fall within boundary of gene and share a boundary with a gene]\n" + ) tag_index = 1 delta = [] @@ -226,98 +238,106 @@ def renumber_genes( # Update all features record.features = sorted(clean_features, key=lambda x: x.location.start) - + for feature in [f for f in f_sorted if f not in f_processed]: if feature.type == "CDS": - if tag_to_update in feature.qualifiers.keys() and forceTagMatch: - failNameCheck = True - for x in oldNames: - for tag in feature.qualifiers[tag_to_update]: - if tag in x: - failNameCheck = False - if not failNameCheck: - break - if failNameCheck: - change_table.write( - record.id - + "\t" - + feature.type - + ":" - + (feature.qualifiers[tag_to_update][0]) - + "\t[Removed: (Tag check enabled) CDS did not both share a start/end with and fall within a gene with the same " + tag_to_update + " value]\n" - ) + if tag_to_update in feature.qualifiers.keys() and forceTagMatch: + failNameCheck = True + for x in oldNames: + for tag in feature.qualifiers[tag_to_update]: + if tag in x: + failNameCheck = False + if not failNameCheck: + break + if failNameCheck: + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: (Tag check enabled) CDS did not both share a start/end with and fall within a gene with the same " + + tag_to_update + + " value]\n" + ) + else: + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: CDS did not both fall within boundary of gene 
and share a boundary with a gene]\n" + ) + elif tag_to_update in feature.qualifiers.keys(): + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n" + ) else: - change_table.write( - record.id - + "\t" - + feature.type - + ":" - + (feature.qualifiers[tag_to_update][0]) - + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n" - ) - elif tag_to_update in feature.qualifiers.keys(): - change_table.write( - record.id - + "\t" - + feature.type - + ":" - + (feature.qualifiers[tag_to_update][0]) - + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n" - ) - else: - change_table.write( - record.id - + "\t" - + feature.type - + ": No " - + tag_to_update - + "\t[Removed: CDS at (" + str(feature.location.start) + "," + str(feature.location.end) + ") did not both fall within boundary of gene and share a boundary with a gene]\n" - ) + change_table.write( + record.id + + "\t" + + feature.type + + ": No " + + tag_to_update + + "\t[Removed: CDS at (" + + str(feature.location.start) + + "," + + str(feature.location.end) + + ") did not both fall within boundary of gene and share a boundary with a gene]\n" + ) else: - if tag_to_update in feature.qualifiers.keys() and forceTagMatch: - failNameCheck = True - for x in oldNames: - for tag in feature.qualifiers[tag_to_update]: - if tag in x: - failNameCheck = False - if not failNameCheck: - break - if failNameCheck: - change_table.write( - record.id - + "\t" - + feature.type - + ":" - + (feature.qualifiers[tag_to_update][0]) - + "\t[Removed: (Tag check enabled) Feature did not fall within a gene it shared a " + tag_to_update + " value with]\n" - ) + if tag_to_update in feature.qualifiers.keys() and forceTagMatch: + failNameCheck = True + for x in oldNames: + for tag in feature.qualifiers[tag_to_update]: + if tag in x: + failNameCheck = False + if not failNameCheck: + break + if failNameCheck: + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: (Tag check enabled) Feature did not fall within a gene it shared a " + + tag_to_update + + " value with]\n" + ) + else: + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: Feature not within boundary of a gene]\n" + ) + elif tag_to_update in feature.qualifiers.keys(): + change_table.write( + record.id + + "\t" + + feature.type + + ":" + + (feature.qualifiers[tag_to_update][0]) + + "\t[Removed: Feature not within boundary of a gene]\n" + ) else: - change_table.write( - record.id - + "\t" - + feature.type - + ":" - + (feature.qualifiers[tag_to_update][0]) - + "\t[Removed: Feature not within boundary of a gene]\n" - ) - elif tag_to_update in feature.qualifiers.keys(): - change_table.write( - record.id - + "\t" - + feature.type - + ":" - + (feature.qualifiers[tag_to_update][0]) - + "\t[Removed: Feature not within boundary of a gene]\n" - ) - else: - change_table.write( - record.id - + "\t" - + feature.type - + ": (has no " - + tag_to_update - + ")\t[Removed: Feature not within boundary of a gene]\n" - ) + change_table.write( + record.id + + "\t" + + feature.type + + ": (has no " + + tag_to_update + + ")\t[Removed: Feature not within boundary of a gene]\n" + ) change_table.write("\n".join(delta) + "\n") # Output @@ -340,15 +360,12 @@ def 
is_within(query, feature): # checks if the query item is within the bounds of the given feature sortedList = sorted(query.location.parts, key=lambda x: x.start) for x in sortedList: - if ( - feature.location.start <= x.start - and feature.location.end >= x.end - ): - if x.strand < 0 and x == sortedList[-1]: - return True - elif x.strand >= 0 and x == sortedList[0]: - return True - #else: + if feature.location.start <= x.start and feature.location.end >= x.end: + if x.strand < 0 and x == sortedList[-1]: + return True + elif x.strand >= 0 and x == sortedList[0]: + return True + # else: return False @@ -382,7 +399,9 @@ def is_within(query, feature): ) parser.add_argument( - "--forceTagMatch", action="store_true", help="Make non-CDS features match tag initially" + "--forceTagMatch", + action="store_true", + help="Make non-CDS features match tag initially", ) parser.add_argument( diff --git a/cpt_req_phage_start/gff3.py b/cpt_req_phage_start/gff3.py index d4795d4..48496c3 100755 --- a/cpt_req_phage_start/gff3.py +++ b/cpt_req_phage_start/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_req_sd/gff3.py b/cpt_req_sd/gff3.py index d4795d4..48496c3 100755 --- a/cpt_req_sd/gff3.py +++ b/cpt_req_sd/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_req_sd/gff3_require_sd.py b/cpt_req_sd/gff3_require_sd.py index cc12501..abb8fed 100755 --- a/cpt_req_sd/gff3_require_sd.py +++ b/cpt_req_sd/gff3_require_sd.py @@ -51,9 +51,17 @@ def require_shinefind(gff3, fasta): ) gene.sub_features.append(sd_features[0]) if gene.location.start > sd_features[0].location.start: - gene.location = FeatureLocation(int(sd_features[0].location.start), int(gene.location.end), gene.location.strand) + gene.location = FeatureLocation( + int(sd_features[0].location.start), + int(gene.location.end), + gene.location.strand, + ) if gene.location.end < sd_features[0].location.end: - gene.location = FeatureLocation(int(gene.location.start), int(sd_features[0].location.end), gene.location.strand) + gene.location = FeatureLocation( + int(gene.location.start), + int(sd_features[0].location.end), + gene.location.strand, + ) good_genes.append(gene) record.features = good_genes diff --git a/cpt_req_sd/shinefind.py b/cpt_req_sd/shinefind.py index c51665e..509cff3 100755 --- a/cpt_req_sd/shinefind.py +++ b/cpt_req_sd/shinefind.py @@ -53,10 +53,10 @@ def list_sds(self, sequence, sd_min=3, sd_max=17): for regex in self.sd_reg: for match in regex.finditer(sequence): spacing = len(sequence) - len(match.group()) - match.start() - if sd_max >= spacing+sd_min and spacing+sd_min >= sd_min: - #if the spacing is within gap limits, add - #(search space is [sd_max+7 .. 
sd_min] so actual gap is spacing+sd_min) - #print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min)) + if sd_max >= spacing + sd_min and spacing + sd_min >= sd_min: + # if the spacing is within gap limits, add + # (search space is [sd_max+7 .. sd_min] so actual gap is spacing+sd_min) + # print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min)) hits.append( { "spacing": spacing, @@ -66,7 +66,7 @@ def list_sds(self, sequence, sd_min=3, sd_max=17): "len": len(match.group()), } ) - hits = sorted(hits, key= lambda x: (-x['len'],x['spacing'])) + hits = sorted(hits, key=lambda x: (-x["len"], x["spacing"])) return hits @classmethod @@ -80,7 +80,16 @@ def highlight_sd(cls, sequence, start, end): ) @classmethod - def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd_min=3, sd_max=17): + def to_features( + cls, + hits, + strand, + parent_start, + parent_end, + feature_id=None, + sd_min=3, + sd_max=17, + ): results = [] for idx, hit in enumerate(hits): # gene complement(124..486) @@ -90,7 +99,7 @@ def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd # -1 491 501 2 3 5 # -1 491 501 1 3 5 # -1 491 501 0 3 5 - + qualifiers = { "source": "CPT_ShineFind", "ID": "%s.rbs-%s" % (feature_id, idx), @@ -108,7 +117,7 @@ def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd # minimum absolute value of these two will be the proper gap regardless of strand tmp = gffSeqFeature( FeatureLocation(min(start, end), max(start, end), strand=strand), - #FeatureLocation(min(start, end), max(start, end), strand=strand), + # FeatureLocation(min(start, end), max(start, end), strand=strand), type="Shine_Dalgarno_sequence", qualifiers=qualifiers, ) @@ -133,7 +142,10 @@ def testFeatureUpstream(self, feature, record, sd_min=3, sd_max=17): # Create our temp feature used to obtain correct portion of # genome - tmp = gffSeqFeature(FeatureLocation(min(start, end), max(start, end), strand=strand), type="domain") + tmp = gffSeqFeature( + FeatureLocation(min(start, end), max(start, end), strand=strand), + type="domain", + ) seq = str(tmp.extract(record.seq)) return self.list_sds(seq, sd_min, sd_max), start, end, seq @@ -175,6 +187,7 @@ def fix_gene_boundaries(feature): feature.location = FeatureLocation(fmin, fmax, strand=-1) return feature + def shinefind( fasta, gff3, diff --git a/cpt_sar_finder/SAR_finder.py b/cpt_sar_finder/SAR_finder.py index c97af9e..da77c51 100755 --- a/cpt_sar_finder/SAR_finder.py +++ b/cpt_sar_finder/SAR_finder.py @@ -9,21 +9,52 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="SAR Finder") - parser.add_argument("fa",type=argparse.FileType("r"),help="organism's multi fasta file") + parser.add_argument( + "fa", type=argparse.FileType("r"), help="organism's multi fasta file" + ) - parser.add_argument("--min",type=int,default=20,help="minimum size of candidate peptide") + parser.add_argument( + "--min", type=int, default=20, help="minimum size of candidate peptide" + ) - parser.add_argument("--max",type=int,default=200,help="maximum size of candidate peptide") + parser.add_argument( + "--max", type=int, default=200, help="maximum size of candidate peptide" + ) - parser.add_argument("--sar_min",type=int,default=15,help="minimum size of candidate peptide TMD domain") + parser.add_argument( + "--sar_min", + type=int, + default=15, + help="minimum size of candidate peptide TMD domain", + ) - parser.add_argument("--sar_max",type=int,default=24,help="maximum size of 
candidate peptide TMD domain") - - parser.add_argument("--out_fa",type=argparse.FileType("w"),help="multifasta output of candidate SAR proteins",default="candidate_SAR.fa") + parser.add_argument( + "--sar_max", + type=int, + default=24, + help="maximum size of candidate peptide TMD domain", + ) - parser.add_argument("--out_stat",type=argparse.FileType("w"),help="summary statistic file for candidate SAR proteins, tab separated",default="candidate_SAR_stats.tsv") + parser.add_argument( + "--out_fa", + type=argparse.FileType("w"), + help="multifasta output of candidate SAR proteins", + default="candidate_SAR.fa", + ) - parser.add_argument("--out_gff3",type=argparse.FileType("w"),help="multigff3 file for candidate SAR proteins",default="candidate_SAR.gff3") + parser.add_argument( + "--out_stat", + type=argparse.FileType("w"), + help="summary statistic file for candidate SAR proteins, tab separated", + default="candidate_SAR_stats.tsv", + ) + + parser.add_argument( + "--out_gff3", + type=argparse.FileType("w"), + help="multigff3 file for candidate SAR proteins", + default="candidate_SAR.gff3", + ) args = parser.parse_args() @@ -33,12 +64,13 @@ for protein_name, protein_data in fa_dict.items(): sar = CheckSequence(protein_name, protein_data) - #sar.check_sizes(min=args.min,max=args.max) + # sar.check_sizes(min=args.min,max=args.max) hydros = sar.shrink_results(sar_min=args.sar_min, sar_max=args.sar_max) sars.update(hydros) - gff3_from_SAR_dict(sars, args.out_gff3) - tab_from_SAR_dict(sars,args.out_stat,"SGAT",sar_min=args.sar_min, sar_max=args.sar_max) - fasta_from_SAR_dict(sars,args.out_fa) - #stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready. \ No newline at end of file + tab_from_SAR_dict( + sars, args.out_stat, "SGAT", sar_min=args.sar_min, sar_max=args.sar_max + ) + fasta_from_SAR_dict(sars, args.out_fa) + # stat_file_from_SAR_dict(sars,args.out_stat,sar_min=args.sar_min,sar_max=args.sar_max) # fix this whenever ready. 
diff --git a/cpt_sar_finder/SAR_functions.py b/cpt_sar_finder/SAR_functions.py index ebeed04..bf3c196 100755 --- a/cpt_sar_finder/SAR_functions.py +++ b/cpt_sar_finder/SAR_functions.py @@ -8,10 +8,9 @@ class CheckSequence: - """ - SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins """ - + SAR endolysin Verification class, which starts with complete FA file, and is shrunk by each function to reveal best candidates of SAR endolysin proteins + """ def __init__(self, protein_name, protein_data): self.name = protein_name @@ -20,9 +19,8 @@ def __init__(self, protein_name, protein_data): self.size = len(self.seq) self.store = {} - - def check_sizes(self,min,max): - """ check the minimum and maximum peptide lengths """ + def check_sizes(self, min, max): + """check the minimum and maximum peptide lengths""" if self.size < min: print("too small") elif self.size > max: @@ -31,10 +29,11 @@ def check_sizes(self,min,max): print(f"{self.name} : {self.seq}") return True - - def check_hydrophobicity_and_charge(self,sar_min=15,sar_max=20,perc_residues="SGAT"): - """ verifies the existence of a hydrophobic region within the sequence """ - hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex + def check_hydrophobicity_and_charge( + self, sar_min=15, sar_max=20, perc_residues="SGAT" + ): + """verifies the existence of a hydrophobic region within the sequence""" + hydrophobic_residues = "['FIWLVMYCATGSP']" # fed through regex hits = self.store pos_res = "RK" neg_res = "DE" @@ -42,117 +41,273 @@ def check_hydrophobicity_and_charge(self,sar_min=15,sar_max=20,perc_residues="SG if self.size > 50: seq = self.seq[0:50] else: - seq = self.seq + seq = self.seq for sar_size in range(sar_min, sar_max, 1): - for i in range(0,len(seq)-sar_size,1): - sar_seq = str(seq[i:i+sar_size]) - if re.search((hydrophobic_residues+"{"+str(sar_size)+"}"),sar_seq): - charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) - storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) - #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) - elif "K" in sar_seq[0] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[1:]): # check frontend snorkels - charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) - storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) - #print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) - elif "K" in sar_seq[-1] and re.search((hydrophobic_residues+"{"+str(sar_size-1)+"}"),sar_seq[:-1]): # check backend snorkels - charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end = rep_funcs(self,seq,i,pos_res,neg_res,sar_seq,perc_residues,sar_size) - storage_dict(self=self,sar_size=sar_size,sar_seq=sar_seq,hits=hits,charge_seq=charge_seq,charge=charge,perc_cont=perc_cont,nterm_coords=nterm_coords,sar_coords=sar_coords,cterm_coords=cterm_coords,sar_start=sar_start,sar_end=sar_end) - #print("TMDSIZE: {}\tINDEX: 
{}".format(sar_size,i+1)) + for i in range(0, len(seq) - sar_size, 1): + sar_seq = str(seq[i : i + sar_size]) + if re.search( + (hydrophobic_residues + "{" + str(sar_size) + "}"), sar_seq + ): + ( + charge_seq, + charge, + perc_cont, + sar_coords, + nterm_coords, + cterm_coords, + sar_start, + sar_end, + ) = rep_funcs( + self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size + ) + storage_dict( + self=self, + sar_size=sar_size, + sar_seq=sar_seq, + hits=hits, + charge_seq=charge_seq, + charge=charge, + perc_cont=perc_cont, + nterm_coords=nterm_coords, + sar_coords=sar_coords, + cterm_coords=cterm_coords, + sar_start=sar_start, + sar_end=sar_end, + ) + # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) + elif "K" in sar_seq[0] and re.search( + (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[1:] + ): # check frontend snorkels + ( + charge_seq, + charge, + perc_cont, + sar_coords, + nterm_coords, + cterm_coords, + sar_start, + sar_end, + ) = rep_funcs( + self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size + ) + storage_dict( + self=self, + sar_size=sar_size, + sar_seq=sar_seq, + hits=hits, + charge_seq=charge_seq, + charge=charge, + perc_cont=perc_cont, + nterm_coords=nterm_coords, + sar_coords=sar_coords, + cterm_coords=cterm_coords, + sar_start=sar_start, + sar_end=sar_end, + ) + # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) + elif "K" in sar_seq[-1] and re.search( + (hydrophobic_residues + "{" + str(sar_size - 1) + "}"), sar_seq[:-1] + ): # check backend snorkels + ( + charge_seq, + charge, + perc_cont, + sar_coords, + nterm_coords, + cterm_coords, + sar_start, + sar_end, + ) = rep_funcs( + self, seq, i, pos_res, neg_res, sar_seq, perc_residues, sar_size + ) + storage_dict( + self=self, + sar_size=sar_size, + sar_seq=sar_seq, + hits=hits, + charge_seq=charge_seq, + charge=charge, + perc_cont=perc_cont, + nterm_coords=nterm_coords, + sar_coords=sar_coords, + cterm_coords=cterm_coords, + sar_start=sar_start, + sar_end=sar_end, + ) + # print("TMDSIZE: {}\tINDEX: {}".format(sar_size,i+1)) continue - + return hits - def shrink_results(self,sar_min=15,sar_max=20,perc_residues="SGAT"): - """ removes repetiive hits, keeps only the shortest and longest of each SAR domain """ + def shrink_results(self, sar_min=15, sar_max=20, perc_residues="SGAT"): + """removes repetiive hits, keeps only the shortest and longest of each SAR domain""" compare_candidates = {} - hits = self.check_hydrophobicity_and_charge(sar_min=sar_min,sar_max=sar_max) + hits = self.check_hydrophobicity_and_charge(sar_min=sar_min, sar_max=sar_max) for sar_name, data in hits.items(): - #print(sar_name) + # print(sar_name) compare_candidates[sar_name] = {} - #print("\nThese are the values: {}".format(v)) - #count_of_times = 0 + # print("\nThese are the values: {}".format(v)) + # count_of_times = 0 tmd_log = [] - for sar_size in range(sar_max,sar_min-1,-1): - if "TMD_"+str(sar_size) in data: + for sar_size in range(sar_max, sar_min - 1, -1): + if "TMD_" + str(sar_size) in data: tmd_log.append(sar_size) - #print(tmd_log) - for idx,the_data in enumerate(data["TMD_"+str(sar_size)]): - #print(the_data[7]) - #print(the_data) - #print(f"This is the index: {idx}") - #print(f"This is the list of data at this index: {the_data}") - if the_data[7] in compare_candidates[sar_name]: # index to start + # print(tmd_log) + for idx, the_data in enumerate(data["TMD_" + str(sar_size)]): + # print(the_data[7]) + # print(the_data) + # print(f"This is the index: {idx}") + # print(f"This is the list of data 
at this index: {the_data}") + if ( + the_data[7] in compare_candidates[sar_name] + ): # index to start compare_candidates[sar_name][the_data[7]]["count"] += 1 - compare_candidates[sar_name][the_data[7]]["size"].append(sar_size) - compare_candidates[sar_name][the_data[7]]["index"].append(idx) + compare_candidates[sar_name][the_data[7]]["size"].append( + sar_size + ) + compare_candidates[sar_name][the_data[7]]["index"].append( + idx + ) else: compare_candidates[sar_name][the_data[7]] = {} compare_candidates[sar_name][the_data[7]]["count"] = 1 - compare_candidates[sar_name][the_data[7]]["size"] = [sar_size] + compare_candidates[sar_name][the_data[7]]["size"] = [ + sar_size + ] compare_candidates[sar_name][the_data[7]]["index"] = [idx] hits[sar_name]["biggest_sar"] = tmd_log[0] for sar_name, compare_data in compare_candidates.items(): for data in compare_data.values(): if len(data["size"]) >= 3: - #print(f"{each_size} --> {data}") - minmax = [min(data["size"]),max(data["size"])] + # print(f"{each_size} --> {data}") + minmax = [min(data["size"]), max(data["size"])] nonminmax = [x for x in data["size"] if x not in minmax] nonminmax_index = [] for each_nonminmax in nonminmax: v = data["size"].index(each_nonminmax) x = data["index"][v] nonminmax_index.append(x) - nons = zip(nonminmax,nonminmax_index) + nons = zip(nonminmax, nonminmax_index) for value in nons: - #hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1]) - hits[sar_name]["TMD_"+str(value[0])][value[1]] = [""] + # hits[sar_name]["TMD_"+str(value[0])] = hits[sar_name]["TMD_"+str(value[0])].pop(value[1]) + hits[sar_name]["TMD_" + str(value[0])][value[1]] = [""] return hits -def rep_funcs(self,seq,loc,pos_res,neg_res,sar_seq,perc_residues,sar_size): - """ run a set of functions together before sending the results to the storage dictionary """ +def rep_funcs(self, seq, loc, pos_res, neg_res, sar_seq, perc_residues, sar_size): + """run a set of functions together before sending the results to the storage dictionary""" charge_seq = str(seq[:loc]) - charge = charge_check(charge_seq,pos_res,neg_res) - perc_cont = percent_calc(sar_seq,perc_residues,int(sar_size)) + charge = charge_check(charge_seq, pos_res, neg_res) + perc_cont = percent_calc(sar_seq, perc_residues, int(sar_size)) sar_start = loc sar_end = loc + sar_size - sar_coords = "{}..{}".format(loc,loc+sar_size) - nterm_coords = "{}..{}".format("0",loc-1) - cterm_coords = "{}..{}".format(loc+sar_size+1,self.size) + sar_coords = "{}..{}".format(loc, loc + sar_size) + nterm_coords = "{}..{}".format("0", loc - 1) + cterm_coords = "{}..{}".format(loc + sar_size + 1, self.size) - return charge_seq, charge, perc_cont, sar_coords, nterm_coords, cterm_coords, sar_start, sar_end + return ( + charge_seq, + charge, + perc_cont, + sar_coords, + nterm_coords, + cterm_coords, + sar_start, + sar_end, + ) ### Extra "helper" functions -def storage_dict(self,sar_size,sar_seq,hits,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end): # probably not good to call "self" a param here...definitley not PEP approved... - """ organize dictionary for hydrophobicity check """ +def storage_dict( + self, + sar_size, + sar_seq, + hits, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, +): # probably not good to call "self" a param here...definitley not PEP approved... 
+ """organize dictionary for hydrophobicity check""" if self.name not in hits: hits[self.name] = {} hits[self.name]["description"] = str(self.description) hits[self.name]["sequence"] = str(self.seq) hits[self.name]["size"] = str(self.size) - #GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100) - #hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont)) - if "TMD_"+str(sar_size) not in hits[self.name]: - hits[self.name]["TMD_"+str(sar_size)] = [] - hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) + # GAcont = str((str(self.seq).count("G")+str(self.seq).count("A"))/int(self.size)*100) + # hits[self.name]["GAcont"] = "{:.2f}%".format(float(GAcont)) + if "TMD_" + str(sar_size) not in hits[self.name]: + hits[self.name]["TMD_" + str(sar_size)] = [] + hits[self.name]["TMD_" + str(sar_size)].append( + [ + sar_seq, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, + ] + ) else: - hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) + hits[self.name]["TMD_" + str(sar_size)].append( + [ + sar_seq, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, + ] + ) else: - if "TMD_"+str(sar_size) not in hits[self.name]: - hits[self.name]["TMD_"+str(sar_size)] = [] - hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) + if "TMD_" + str(sar_size) not in hits[self.name]: + hits[self.name]["TMD_" + str(sar_size)] = [] + hits[self.name]["TMD_" + str(sar_size)].append( + [ + sar_seq, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, + ] + ) else: - hits[self.name]["TMD_"+str(sar_size)].append([sar_seq,charge_seq,charge,perc_cont,nterm_coords,sar_coords,cterm_coords,sar_start,sar_end]) + hits[self.name]["TMD_" + str(sar_size)].append( + [ + sar_seq, + charge_seq, + charge, + perc_cont, + nterm_coords, + sar_coords, + cterm_coords, + sar_start, + sar_end, + ] + ) -def percent_calc(sequence,residues,size): - """ Calculate the percent of a set of residues within an input sequence """ +def percent_calc(sequence, residues, size): + """Calculate the percent of a set of residues within an input sequence""" counted = {} for aa in sequence: - #print(aa) + # print(aa) if aa in counted: counted[aa] += 1 else: @@ -164,15 +319,15 @@ def percent_calc(sequence,residues,size): residue_amt = counted[res_of_interest] except KeyError: residue_amt = 0 - ratio = residue_amt/size - my_ratios.append((round(ratio*100,2))) - - res_rat = list(zip(residues,my_ratios)) + ratio = residue_amt / size + my_ratios.append((round(ratio * 100, 2))) + + res_rat = list(zip(residues, my_ratios)) return res_rat -def charge_check(charge_seq,pos_res,neg_res): +def charge_check(charge_seq, pos_res, neg_res): charge = 0 for aa in charge_seq: if aa in pos_res: @@ -181,13 +336,13 @@ def charge_check(charge_seq,pos_res,neg_res): charge -= 1 return charge + if __name__ == "__main__": sequence = "MAGBYYYTRLCVRKLRKGGGHP" residues = "YL" size = len(sequence) print(size) - v = percent_calc(sequence,residues,size) + v = percent_calc(sequence, residues, size) print(v) for i in v: print(i) - diff --git a/cpt_sar_finder/biopython_parsing.py b/cpt_sar_finder/biopython_parsing.py index 6e7ae39..1c2b1d3 100755 --- 
a/cpt_sar_finder/biopython_parsing.py +++ b/cpt_sar_finder/biopython_parsing.py @@ -3,16 +3,17 @@ from Bio import SeqIO + class FASTA_parser: - """ Parses multi fasta file, and zips together header with sequence """ + """Parses multi fasta file, and zips together header with sequence""" def __init__(self, fa): self.fa = fa - + def multifasta_dict(self): - """ parses the input multi fasta, and puts results into dictionary """ + """parses the input multi fasta, and puts results into dictionary""" - return SeqIO.to_dict(SeqIO.parse(self.fa,"fasta")) + return SeqIO.to_dict(SeqIO.parse(self.fa, "fasta")) if __name__ == "__main__": @@ -21,4 +22,3 @@ def multifasta_dict(self): print(d) for k, v in d.items(): print(v.description) - diff --git a/cpt_sar_finder/file_operations.py b/cpt_sar_finder/file_operations.py index d238e7b..91ec9ed 100755 --- a/cpt_sar_finder/file_operations.py +++ b/cpt_sar_finder/file_operations.py @@ -1,91 +1,155 @@ - -def fasta_from_SAR_dict(sar_dict,fa_file): - """ makes a multi fasta with candidates from SAR dictionary """ +def fasta_from_SAR_dict(sar_dict, fa_file): + """makes a multi fasta with candidates from SAR dictionary""" with fa_file as f: for data in sar_dict.values(): f.writelines(">{}\n".format(data["description"])) f.writelines("{}\n".format(data["sequence"])) -def gff3_from_SAR_dict(sar_dict,gff3_file): - """ make a multi gff3 with candidates from SAR dictionary """ - gff3_cols = ["Seqid","Source","Type","Start","End","Score","Strand","Phase","Attributes"] + +def gff3_from_SAR_dict(sar_dict, gff3_file): + """make a multi gff3 with candidates from SAR dictionary""" + gff3_cols = [ + "Seqid", + "Source", + "Type", + "Start", + "End", + "Score", + "Strand", + "Phase", + "Attributes", + ] with gff3_file as f: - f.writelines(f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n") + f.writelines( + f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n" + ) if sar_dict: - #print(sar_dict) + # print(sar_dict) for name, data in sar_dict.items(): min_idx = 0 f.writelines("##gff-version 3\n") f.writelines(f"##sequence-region {name}\n") - n_start, n_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][4]) - sar_start, sar_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][5]) - c_start, c_end = split_seq_string(data["TMD_"+str(data["biggest_sar"])][min_idx][6]) - f.writelines(f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n') - f.writelines(f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n') - f.writelines(f'{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n') + n_start, n_end = split_seq_string( + data["TMD_" + str(data["biggest_sar"])][min_idx][4] + ) + sar_start, sar_end = split_seq_string( + data["TMD_" + str(data["biggest_sar"])][min_idx][5] + ) + c_start, c_end = split_seq_string( + data["TMD_" + str(data["biggest_sar"])][min_idx][6] + ) + f.writelines( + f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n' 
+ ) + f.writelines( + f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n' + ) + f.writelines( + f"{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n" + ) else: f.writelines("##gff-version 3\n") f.writelines(f"##sequence-region\n") -def tab_from_SAR_dict(sar_dict,stat_file,hydrophillic_res, sar_min, sar_max): - """ convert SAR dict to a dataframe """ - columns = ["Name","Protein Sequence","Protein Length","SAR Length","SAR Start","Putative SAR Sequence","SAR End",[f"{res}%" for res in hydrophillic_res],"% Total","N-term Sequence","N-term net Charge"] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res] +def tab_from_SAR_dict(sar_dict, stat_file, hydrophillic_res, sar_min, sar_max): + """convert SAR dict to a dataframe""" + columns = [ + "Name", + "Protein Sequence", + "Protein Length", + "SAR Length", + "SAR Start", + "Putative SAR Sequence", + "SAR End", + [f"{res}%" for res in hydrophillic_res], + "% Total", + "N-term Sequence", + "N-term net Charge", + ] # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res] with stat_file as f: - f.writelines(f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n") + f.writelines( + f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n" + ) if sar_dict: - #print(sar_dict) + # print(sar_dict) for name, data in sar_dict.items(): - for tmd_size in range(sar_max, sar_min-1, -1): - if "TMD_"+str(tmd_size) in data: - for each_match in data["TMD_"+str(tmd_size)]: + for tmd_size in range(sar_max, sar_min - 1, -1): + if "TMD_" + str(tmd_size) in data: + for each_match in data["TMD_" + str(tmd_size)]: if each_match != [""]: - #print(f"{name} - {data}") - #print(each_match) - #for perc in each_match[3]: + # print(f"{name} - {data}") + # print(each_match) + # for perc in each_match[3]: # print(perc) try: - f.writelines(f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n') + f.writelines( + f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n' + ) except IndexError: - f.writelines(f'ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n') + f.writelines( + f"ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n" + ) else: continue + def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max): - """ summary statistics from SAR finder function """ + """summary statistics from SAR finder function""" with stat_file as f: f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n") if sar_dict: for data in sar_dict.values(): - f.writelines("Protein Description and Name: {}\n".format(data["description"])) + f.writelines( + "Protein Description and Name: {}\n".format(data["description"]) + ) f.writelines("Protein Sequence: 
{}\n".format(data["sequence"])) f.writelines("Protein Length: {}\n".format(data["size"])) f.writelines("SAR Criteria matching region(s)\n") - for tmd_size in range(sar_max, sar_min-1, -1): - if "TMD_"+str(tmd_size) in data: + for tmd_size in range(sar_max, sar_min - 1, -1): + if "TMD_" + str(tmd_size) in data: f.writelines("\nSAR length of {}:\n".format(tmd_size)) - for each_match in data["TMD_"+str(tmd_size)]: - if each_match != ['']: - f.writelines("\nPotential SAR domain sequence: {}\n".format(each_match[0])) - f.writelines("N-term sequence: {}\n".format(each_match[1])) - f.writelines("N-term net charge: {}\n".format(each_match[2])) + for each_match in data["TMD_" + str(tmd_size)]: + if each_match != [""]: + f.writelines( + "\nPotential SAR domain sequence: {}\n".format( + each_match[0] + ) + ) + f.writelines( + "N-term sequence: {}\n".format(each_match[1]) + ) + f.writelines( + "N-term net charge: {}\n".format(each_match[2]) + ) for each_perc_calc in each_match[3]: - f.writelines("Percent {} content: {}%\n".format(each_perc_calc[0],each_perc_calc[1])) - f.writelines("N-term coords: {}\n".format(each_match[4])) + f.writelines( + "Percent {} content: {}%\n".format( + each_perc_calc[0], each_perc_calc[1] + ) + ) + f.writelines( + "N-term coords: {}\n".format(each_match[4]) + ) f.writelines("SAR coords: {}\n".format(each_match[5])) - f.writelines("C-term coords: {}\n".format(each_match[6])) + f.writelines( + "C-term coords: {}\n".format(each_match[6]) + ) f.writelines("SAR start: {}\n".format(each_match[7])) else: continue - f.writelines("========================================================\n\n") + f.writelines( + "========================================================\n\n" + ) else: f.writelines("No candidate SAR Proteins found") + def split_seq_string(input_range, python_indexing=True): - """ splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps """ + """splits a #..# sequence into the two respective starts and ends, if python indexing, adds 1, otherwise keeps""" if python_indexing: values = input_range.split("..") - start =int(values[0]) + 1 + start = int(values[0]) + 1 end = int(values[1]) + 1 else: values = input_range.split("..") @@ -94,5 +158,6 @@ def split_seq_string(input_range, python_indexing=True): return start, end + if __name__ == "__main__": - pass \ No newline at end of file + pass diff --git a/cpt_search_file/editDB.py b/cpt_search_file/editDB.py index 29b0774..938dec7 100755 --- a/cpt_search_file/editDB.py +++ b/cpt_search_file/editDB.py @@ -6,7 +6,7 @@ ### create new key def add_new_key(db, add_key=[]): - """ Set of keys to add to the database """ + """Set of keys to add to the database""" for new_key in add_key: db[new_key] = [] @@ -15,41 +15,49 @@ def add_new_key(db, add_key=[]): ### Add values to dbase: def add_value_to_term(index_val, db, add_value=[]): - """ index value, put in value """ + """index value, put in value""" for val in add_value: db[index_val].append(val) - + return db ### Remove values from dbase: def remove_value_from_term(index_val, db, remove_value=[]): - """ remove values from list """ + """remove values from list""" for val in remove_value: db[index_val].remove(val) - + return db ### Terms to add from a file -def add_from_file(input_file,index_val,db,sep="\n"): - """ input file, new line separated currently, and append files to correct key, return is altered dictionary""" +def add_from_file(input_file, index_val, db, sep="\n"): + """input file, new line separated currently, and append 
files to correct key, return is altered dictionary""" terms = open(input_file).read().splitlines() - db = add_value_to_term(index_val,db,terms) + db = add_value_to_term(index_val, db, terms) return db if __name__ == "__main__": - lysis_json = "data/lysis-family-v1.0.2.json" # insert json of choice + lysis_json = "data/lysis-family-v1.0.2.json" # insert json of choice db = ej.explodeJSON(lysis_json) db = db.readJSON() - #revise_db = add_new_key(db=db,add_key=["spanins"]) - #files = ["data/term_additions/200505_holin_domains.txt","data/term_additions/200505_Spanin_Domains.txt"] - terms = ["DUF2570","PF10828","IPR022538","DUF2514","PF10721","IPR019659","DUF2681","PF10883","IPR020274"] - #revise_db = add_from_file(files[0],"holin_domains",revise_db) - #revise_db = add_from_file(files[1],"spanin_domains",revise_db) - revise_db = add_value_to_term("spanin_domains",db,add_value=terms) - save_dict_to_json(obj=revise_db,filename="data/lysis-family-v1.0.3.json") - - + # revise_db = add_new_key(db=db,add_key=["spanins"]) + # files = ["data/term_additions/200505_holin_domains.txt","data/term_additions/200505_Spanin_Domains.txt"] + terms = [ + "DUF2570", + "PF10828", + "IPR022538", + "DUF2514", + "PF10721", + "IPR019659", + "DUF2681", + "PF10883", + "IPR020274", + ] + # revise_db = add_from_file(files[0],"holin_domains",revise_db) + # revise_db = add_from_file(files[1],"spanin_domains",revise_db) + revise_db = add_value_to_term("spanin_domains", db, add_value=terms) + save_dict_to_json(obj=revise_db, filename="data/lysis-family-v1.0.3.json") diff --git a/cpt_search_file/explodeJSON.py b/cpt_search_file/explodeJSON.py index 64b97fc..5bc4ab6 100755 --- a/cpt_search_file/explodeJSON.py +++ b/cpt_search_file/explodeJSON.py @@ -6,14 +6,14 @@ def __init__(self, file): self.file = file def readJSON(self): - """ returns dictionary object for reading a JSON """ + """returns dictionary object for reading a JSON""" with open(self.file) as j: myObj = json.load(j) return myObj def explode(self): - """ Makes a list of each embedded list from the database JSON """ + """Makes a list of each embedded list from the database JSON""" data = self.readJSON() @@ -24,8 +24,9 @@ def explode(self): return terms + ### Dictionary Functions -def save_dict_to_json(obj,filename="output.json"): +def save_dict_to_json(obj, filename="output.json"): with open(filename, "w") as js: print("saved {} as json".format(filename)) json.dump(obj, js, indent=4) @@ -47,6 +48,6 @@ def save_dict_to_json(obj,filename="output.json"): terms = e.explode() print(terms) - test = {"math":["algebra","calculus"]} + test = {"math": ["algebra", "calculus"]} print(type(test)) - save_dict_to_json(obj=test,filename="test-output.json") \ No newline at end of file + save_dict_to_json(obj=test, filename="test-output.json") diff --git a/cpt_search_file/searchFile.py b/cpt_search_file/searchFile.py index 4bd8f66..5db6114 100755 --- a/cpt_search_file/searchFile.py +++ b/cpt_search_file/searchFile.py @@ -1,8 +1,8 @@ ##### User input File(s), that are BLAST XML, gff3, and/or Genbank and then searched for containing user designated terms -import argparse +import argparse import explodeJSON as ej -import gffutils # THIS IS REQUIREMENT +import gffutils # THIS IS REQUIREMENT from Bio.Blast import NCBIXML from Bio import SeqIO import re @@ -11,8 +11,8 @@ SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) ####### TERM FUNCTIONS -def dbaseTerms(terms,galaxy=True): - """ Index into dictionary object and retrieve all desired terms """ +def dbaseTerms(terms, galaxy=True): + 
"""Index into dictionary object and retrieve all desired terms""" db_path = os.path.join(SCRIPT_DIR, "data/lysis-family-v1.0.3.json") db = ej.explodeJSON(db_path) db = db.readJSON() @@ -30,9 +30,8 @@ def dbaseTerms(terms,galaxy=True): pass - -def userTerms(file,text): - """ Select terms input by user """ +def userTerms(file, text): + """Select terms input by user""" user_terms = [] if file: terms = open(file.name).read().splitlines() @@ -40,14 +39,14 @@ def userTerms(file,text): else: pass if text: - if re.search(("__cn__"),str(text[0])): - #s = text[0].split("__cn__") - #print(s) - #print(text[0]) + if re.search(("__cn__"), str(text[0])): + # s = text[0].split("__cn__") + # print(s) + # print(text[0]) s = text[0] - #print(type(s)) + # print(type(s)) split = s.split("__cn__") - #print(split) + # print(split) user_terms.extend(split) else: user_terms.extend(text) @@ -58,7 +57,7 @@ def userTerms(file,text): def glueTerms(dbase_terms, user_terms): - """ glue dbaseTerms and userTerms together for eventual query item """ + """glue dbaseTerms and userTerms together for eventual query item""" glued = [] if dbase_terms: glued.extend(dbase_terms) @@ -71,9 +70,10 @@ def glueTerms(dbase_terms, user_terms): return glued + ####### FILE FUNCTIONS -def glueFiles(gff,gbk,fa,blast): - """ glue files into one list...I think this is a decent way to go about this...#CHECK LATER#... """ +def glueFiles(gff, gbk, fa, blast): + """glue files into one list...I think this is a decent way to go about this...#CHECK LATER#...""" files = [] gffs = [] gbks = [] @@ -86,7 +86,7 @@ def glueFiles(gff,gbk,fa,blast): if gbk: for gbk_file in gbk: gbks.extend(gbk_file) - #print(gbks) + # print(gbks) else: pass fas = [] @@ -100,47 +100,73 @@ def glueFiles(gff,gbk,fa,blast): blasts.extend(blast_file) else: pass - files = [gffs,gbks,fas,blasts] + files = [gffs, gbks, fas, blasts] return files + ######## PARSE FILE FUNCTIONS -def readGFF3(files,search_list): - " Searches through gff3 file(s) and appends " +def readGFF3(files, search_list): + "Searches through gff3 file(s) and appends" if files: for idx, file in enumerate(files): if idx == 0: - print("Parsing - "+file.name) - db = gffutils.create_db(file.name,dbfn="file.db",force=True,keep_order=False) + print("Parsing - " + file.name) + db = gffutils.create_db( + file.name, dbfn="file.db", force=True, keep_order=False + ) db = gffutils.FeatureDB("file.db") features = db.all_features() gff3_matches = [] for feature in features: - gff3_matches.extend(searchInput(str(feature), search_list=search_list)) - gff3_matches = list(set(gff3_matches)) # make sure we don't fluff the list + gff3_matches.extend( + searchInput(str(feature), search_list=search_list) + ) + gff3_matches = list( + set(gff3_matches) + ) # make sure we don't fluff the list else: - print("Parsing - "+file.name) - db = gffutils.create_db(file.name,dbfn=str(idx)+"_file.db",force=True,keep_order=False) - db = gffutils.FeatureDB(str(idx)+"_file.db") + print("Parsing - " + file.name) + db = gffutils.create_db( + file.name, dbfn=str(idx) + "_file.db", force=True, keep_order=False + ) + db = gffutils.FeatureDB(str(idx) + "_file.db") features = db.all_features() for feature in features: - gff3_matches.extend(searchInput(str(feature), search_list=search_list)) - gff3_matches = list(set(gff3_matches)) # make sure we don't fluff the list + gff3_matches.extend( + searchInput(str(feature), search_list=search_list) + ) + gff3_matches = list( + set(gff3_matches) + ) # make sure we don't fluff the list gff3_matches.sort() return 
gff3_matches else: pass -def readGBK(files,search_list): + +def readGBK(files, search_list): if files: for idx, file in enumerate(files): if idx == 0: - print("Parsing - "+file.name) + print("Parsing - " + file.name) record = SeqIO.read(file.name, "genbank") gbk_matches = [] for feature in record.features: try: - if searchInput(str(feature.qualifiers["product"]),search_list=search_list) or searchInput(str(feature.qualifiers["note"]),search_list=search_list) or searchInput(str(feature.qualifiers["dbxref"]),search_list=search_list): + if ( + searchInput( + str(feature.qualifiers["product"]), + search_list=search_list, + ) + or searchInput( + str(feature.qualifiers["note"]), search_list=search_list + ) + or searchInput( + str(feature.qualifiers["dbxref"]), + search_list=search_list, + ) + ): gbk_matches.extend([str(feature)]) else: continue @@ -148,11 +174,23 @@ def readGBK(files,search_list): continue gbk_matches = list(set(gbk_matches)) else: - print("Parsing - "+file.name) + print("Parsing - " + file.name) record = SeqIO.read(file.name, "genbank") for feature in record.features: try: - if searchInput(str(feature.qualifiers["product"]),search_list=search_list) or searchInput(str(feature.qualifiers["note"]),search_list=search_list) or searchInput(str(feature.qualifiers["dbxref"]),search_list=search_list): + if ( + searchInput( + str(feature.qualifiers["product"]), + search_list=search_list, + ) + or searchInput( + str(feature.qualifiers["note"]), search_list=search_list + ) + or searchInput( + str(feature.qualifiers["dbxref"]), + search_list=search_list, + ) + ): gbk_matches.extend([str(feature)]) else: continue @@ -164,47 +202,67 @@ def readGBK(files,search_list): else: pass -def readFASTA(files,search_list): + +def readFASTA(files, search_list): if files: for idx, file in enumerate(files): if idx == 0: - print("Parsing - "+file.name) + print("Parsing - " + file.name) record = SeqIO.parse(file.name, "fasta") fa_matches = [] for feature in record: - fa_matches.extend(searchInput(feature.description,search_list=search_list)) + fa_matches.extend( + searchInput(feature.description, search_list=search_list) + ) fa_matches = list(set(fa_matches)) else: - print("Parsing - "+file.name) + print("Parsing - " + file.name) record = SeqIO.parse(file.name, "fasta") for feature in record: - fa_matches.extend(searchInput(feature.description,search_list=search_list)) + fa_matches.extend( + searchInput(feature.description, search_list=search_list) + ) fa_matches = list(set(fa_matches)) fa_matches.sort() return fa_matches else: pass -def readBLAST(files,search_list): + +def readBLAST(files, search_list): if files: for idx, file in enumerate(files): if idx == 0: - print("Parsing - "+file.name) + print("Parsing - " + file.name) blast_records = NCBIXML.parse(open(file.name)) blast_matches = [] for blast_record in blast_records: for desc in blast_record.descriptions: pretty = prettifyXML(str(desc)) for each_ret in pretty: - blast_matches.extend(searchInput(each_ret,search_list=search_list,blast=True,q_id=blast_record.query)) + blast_matches.extend( + searchInput( + each_ret, + search_list=search_list, + blast=True, + q_id=blast_record.query, + ) + ) blast_matches = list(set(blast_matches)) else: - print("Parsing - "+file.name) + print("Parsing - " + file.name) blast_records = NCBIXML.parse(open(file.name)) for blast_record in blast_records: for desc in blast_record.descriptions: pretty = prettifyXML(str(desc)) - blast_matches.extend(searchInput(each_ret,search_list=search_list,blast=True,q_id=blast_record.query)) 
+ for each_ret in pretty: + blast_matches.extend( + searchInput( + each_ret, + search_list=search_list, + blast=True, + q_id=blast_record.query, + ) + ) blast_matches = list(set(blast_matches)) blast_matches.sort() return blast_matches @@ -213,118 +271,180 @@ ######## SEARCH FILE FUNCTIONS -def searchInput(input, search_list,blast=False,q_id=None): - """ Takes an input search string, and returns uniques of passing """ +def searchInput(input, search_list, blast=False, q_id=None): + """Takes an input search string, and returns uniques of passing""" output = [] for search_term in search_list: if blast: if re.search(re.escape(search_term), input): - add_query = "QueryID: "+str(q_id)+"\nSearchQuery: "+search_term+"\nMatch: "+input+"\n" + add_query = ( + "QueryID: " + + str(q_id) + + "\nSearchQuery: " + + search_term + + "\nMatch: " + + input + + "\n" + ) output.extend([add_query]) else: continue - #print(search_term) - #st = r"\b"+search_term+r"\b" + # print(search_term) + # st = r"\b"+search_term+r"\b" else: if re.search(re.escape(search_term), input): - #print(search_term+" -> was found") + # print(search_term+" -> was found") output.extend([input]) else: continue return list(set(output)) + ######## prettify-XML function def prettifyXML(input): - """ prettifies a string input from a BLAST-xml """ + """prettifies a string input from a BLAST-xml""" s = input split = s.split(">") return split + ########## Output File Writer def writeResults(gffs, gbks, fas, blasts, outName="termHits.txt"): - """ Takes an input list for each parameter, and writes each result to the output file """ + """Takes an input list for each parameter, and writes each result to the output file""" with open(outName.name, "w+") as out_file: if gffs: - out_file.writelines("\n==================== GFF3 Term Hits ====================\n\n") + out_file.writelines( + "\n==================== GFF3 Term Hits ====================\n\n" + ) for gff_hits in gffs: - out_file.writelines(gff_hits+"\n") + out_file.writelines(gff_hits + "\n") else: gffs = [] if gbks: - out_file.writelines("\n==================== GBK Term Hits ====================\n\n") + out_file.writelines( + "\n==================== GBK Term Hits ====================\n\n" + ) for gbk_hits in gbks: - out_file.writelines(gbk_hits+"\n") + out_file.writelines(gbk_hits + "\n") else: gbks = [] if fas: - out_file.writelines("\n==================== FASTA Term Hits ====================\n\n") + out_file.writelines( + "\n==================== FASTA Term Hits ====================\n\n" + ) for fa_hits in fas: - out_file.writelines(fa_hits+"\n") + out_file.writelines(fa_hits + "\n") else: fas = [] if blasts: - out_file.writelines("\n==================== BLAST Term Hits ====================\n\n") + out_file.writelines( + "\n==================== BLAST Term Hits ====================\n\n" + ) for blast_hits in blasts: - out_file.writelines(blast_hits+"\n") + out_file.writelines(blast_hits + "\n") else: blasts = [] if len(gffs) or len(gbks) or len(fas) or len(blasts): - print("Terms Found") + print("Terms Found") else: out_file.writelines("No query matches, try again with new terms!") print("No query matches, try again with new terms!") -def write_gff3(gffs,outName="proxHits.gff3"): - """ writes output to gff3 file for prox2lysis pipeline """ + +def write_gff3(gffs, outName="proxHits.gff3"): + """writes output to gff3 file for prox2lysis pipeline""" with open(outName.name, "w+") as out_file: out_file.writelines("##gff-version 3\n") if gffs: for gff_hits in gffs: - 
out_file.writelines(gff_hits+"\n") + out_file.writelines(gff_hits + "\n") else: - #raise Exception("No terms were found from query set") + # raise Exception("No terms were found from query set") out_file.writelines("##No terms were found from query set\n") - + if __name__ == "__main__": print(os.getcwd()) - parser = argparse.ArgumentParser(description="Uses a selection of terms to query an input file for matching cases") - parser.add_argument("--dbaseTerms",nargs="*",help="dbase terms to search") # will be a select option, based on KEY within the JSON dbase - parser.add_argument("--custom_txt",nargs="*",help="custom user input terms, if using Galaxy, terms will be __cn__ sep, otherwise by space") - parser.add_argument("--custom_file",type=argparse.FileType("r"),help="custom new line separated search term file") - parser.add_argument("--gff3_files",type=argparse.FileType("r"),nargs="*",action="append",help="GFF3 File(s), if multiple files, use another flag") - parser.add_argument("--gbk_files",type=argparse.FileType("r"),nargs="*",action="append",help="GBK File(s), if multiple files, use another flag") - parser.add_argument("--fa_files",type=argparse.FileType("r"),nargs="*",action="append",help="FASTA File(s), if multiple files, use another flag") - parser.add_argument("--blast_files",type=argparse.FileType("r"),nargs="*",action="append",help="BLAST.xml File(s), if multiple files, use another flag") - parser.add_argument("--output",type=argparse.FileType("w+"),default="termHits.txt") - parser.add_argument("--prox",action="store_true",help="Use when running the prox2lysis pipeline") + parser = argparse.ArgumentParser( + description="Uses a selection of terms to query an input file for matching cases" + ) + parser.add_argument( + "--dbaseTerms", nargs="*", help="dbase terms to search" + ) # will be a select option, based on KEY within the JSON dbase + parser.add_argument( + "--custom_txt", + nargs="*", + help="custom user input terms, if using Galaxy, terms will be __cn__ sep, otherwise by space", + ) + parser.add_argument( + "--custom_file", + type=argparse.FileType("r"), + help="custom new line separated search term file", + ) + parser.add_argument( + "--gff3_files", + type=argparse.FileType("r"), + nargs="*", + action="append", + help="GFF3 File(s), if multiple files, use another flag", + ) + parser.add_argument( + "--gbk_files", + type=argparse.FileType("r"), + nargs="*", + action="append", + help="GBK File(s), if multiple files, use another flag", + ) + parser.add_argument( + "--fa_files", + type=argparse.FileType("r"), + nargs="*", + action="append", + help="FASTA File(s), if multiple files, use another flag", + ) + parser.add_argument( + "--blast_files", + type=argparse.FileType("r"), + nargs="*", + action="append", + help="BLAST.xml File(s), if multiple files, use another flag", + ) + parser.add_argument( + "--output", type=argparse.FileType("w+"), default="termHits.txt" + ) + parser.add_argument( + "--prox", action="store_true", help="Use when running the prox2lysis pipeline" + ) args = parser.parse_args() ############ STEP I ##### Determine user's terms to query - dbase_terms = dbaseTerms(terms=args.dbaseTerms,galaxy=True) - user_terms = userTerms(file=args.custom_file,text=args.custom_txt) + dbase_terms = dbaseTerms(terms=args.dbaseTerms, galaxy=True) + user_terms = userTerms(file=args.custom_file, text=args.custom_txt) glued_terms = glueTerms(dbase_terms=dbase_terms, user_terms=user_terms) ############ STEP II ##### Create list with matches - files = 
glueFiles(gff=args.gff3_files,gbk=args.gbk_files, fa=args.fa_files, blast=args.blast_files) - gffs = readGFF3(files=files[0],search_list=glued_terms) - gbks = readGBK(files=files[1],search_list=glued_terms) - fas = readFASTA(files=files[2],search_list=glued_terms) - blasts = readBLAST(files=files[3],search_list=glued_terms) + files = glueFiles( + gff=args.gff3_files, + gbk=args.gbk_files, + fa=args.fa_files, + blast=args.blast_files, + ) + gffs = readGFF3(files=files[0], search_list=glued_terms) + gbks = readGBK(files=files[1], search_list=glued_terms) + fas = readFASTA(files=files[2], search_list=glued_terms) + blasts = readBLAST(files=files[3], search_list=glued_terms) ############ STEP III ##### Output results to a text file or gff3 if args.prox: - write_gff3(gffs,outName=args.output) + write_gff3(gffs, outName=args.output) else: - writeResults(gffs,gbks,fas,blasts,outName=args.output) - - + writeResults(gffs, gbks, fas, blasts, outName=args.output) diff --git a/cpt_shinefind/gff3.py b/cpt_shinefind/gff3.py index d4795d4..48496c3 100755 --- a/cpt_shinefind/gff3.py +++ b/cpt_shinefind/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_shinefind/shinefind.py b/cpt_shinefind/shinefind.py index c51665e..509cff3 100755 --- a/cpt_shinefind/shinefind.py +++ b/cpt_shinefind/shinefind.py @@ -53,10 +53,10 @@ def list_sds(self, sequence, sd_min=3, sd_max=17): for regex in self.sd_reg: for match in regex.finditer(sequence): spacing = len(sequence) - len(match.group()) - match.start() - if sd_max >= spacing+sd_min and spacing+sd_min >= sd_min: - #if the spacing is within gap limits, add - #(search space is [sd_max+7 .. sd_min] so actual gap is spacing+sd_min) - #print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min)) + if sd_max >= spacing + sd_min and spacing + sd_min >= sd_min: + # if the spacing is within gap limits, add + # (search space is [sd_max+7 .. 
sd_min] so actual gap is spacing+sd_min) + # print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min)) hits.append( { "spacing": spacing, @@ -66,7 +66,7 @@ def list_sds(self, sequence, sd_min=3, sd_max=17): "len": len(match.group()), } ) - hits = sorted(hits, key= lambda x: (-x['len'],x['spacing'])) + hits = sorted(hits, key=lambda x: (-x["len"], x["spacing"])) return hits @classmethod @@ -80,7 +80,16 @@ def highlight_sd(cls, sequence, start, end): ) @classmethod - def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd_min=3, sd_max=17): + def to_features( + cls, + hits, + strand, + parent_start, + parent_end, + feature_id=None, + sd_min=3, + sd_max=17, + ): results = [] for idx, hit in enumerate(hits): # gene complement(124..486) @@ -90,7 +99,7 @@ def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd # -1 491 501 2 3 5 # -1 491 501 1 3 5 # -1 491 501 0 3 5 - + qualifiers = { "source": "CPT_ShineFind", "ID": "%s.rbs-%s" % (feature_id, idx), @@ -108,7 +117,7 @@ def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd # minimum absolute value of these two will be the proper gap regardless of strand tmp = gffSeqFeature( FeatureLocation(min(start, end), max(start, end), strand=strand), - #FeatureLocation(min(start, end), max(start, end), strand=strand), + # FeatureLocation(min(start, end), max(start, end), strand=strand), type="Shine_Dalgarno_sequence", qualifiers=qualifiers, ) @@ -133,7 +142,10 @@ def testFeatureUpstream(self, feature, record, sd_min=3, sd_max=17): # Create our temp feature used to obtain correct portion of # genome - tmp = gffSeqFeature(FeatureLocation(min(start, end), max(start, end), strand=strand), type="domain") + tmp = gffSeqFeature( + FeatureLocation(min(start, end), max(start, end), strand=strand), + type="domain", + ) seq = str(tmp.extract(record.seq)) return self.list_sds(seq, sd_min, sd_max), start, end, seq @@ -175,6 +187,7 @@ def fix_gene_boundaries(feature): feature.location = FeatureLocation(fmin, fmax, strand=-1) return feature + def shinefind( fasta, gff3, diff --git a/cpt_starts/gff3.py b/cpt_starts/gff3.py index d4795d4..48496c3 100755 --- a/cpt_starts/gff3.py +++ b/cpt_starts/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_starts/start_stats.py b/cpt_starts/start_stats.py index 623afe9..bf80d85 100755 --- a/cpt_starts/start_stats.py +++ b/cpt_starts/start_stats.py @@ -21,9 +21,9 @@ def main(fasta, gff3): codon_usage[seq] = 1 # TODO: print all actg combinations? 
Or just ones that are there - print ("# Codon\tCount") + print("# Codon\tCount") for key in sorted(codon_usage): - print ("\t".join((key, str(codon_usage[key])))) + print("\t".join((key, str(codon_usage[key])))) if __name__ == "__main__": diff --git a/cpt_stops/gff3.py b/cpt_stops/gff3.py index d4795d4..48496c3 100755 --- a/cpt_stops/gff3.py +++ b/cpt_stops/gff3.py @@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs): if "type" in kwargs: return str(feature.type).upper() == str(kwargs["type"]).upper() elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False raise Exception("Incorrect feature_test_type call, need type or types") diff --git a/cpt_stops/stop_stats.py b/cpt_stops/stop_stats.py index 3a75340..c1c02a1 100755 --- a/cpt_stops/stop_stats.py +++ b/cpt_stops/stop_stats.py @@ -27,9 +27,9 @@ def main(fasta, gff3): } # TODO: print all actg combinations? Or just ones that are there - print ("# Name\tCodon\tCount") + print("# Name\tCodon\tCount") for key in sorted(codon_usage): - print ("\t".join((names.get(key.upper(), "None"), key, str(codon_usage[key])))) + print("\t".join((names.get(key.upper(), "None"), key, str(codon_usage[key])))) if __name__ == "__main__": diff --git a/cpt_trnascanse_to_gff/tRNAscan_to_gff_SE_format.py b/cpt_trnascanse_to_gff/tRNAscan_to_gff_SE_format.py index 40fefcd..1465587 100755 --- a/cpt_trnascanse_to_gff/tRNAscan_to_gff_SE_format.py +++ b/cpt_trnascanse_to_gff/tRNAscan_to_gff_SE_format.py @@ -1,38 +1,38 @@ #!/usr/bin/env python import fileinput -print ("##gff-version-3") +print("##gff-version 3") # process each trna in tsv file metaLines = 0 for trna in fileinput.input(): if metaLines < 3: - metaLines += 1 - continue + metaLines += 1 + continue cols_tsv = trna.split("\t") if int(cols_tsv[2]) < int(cols_tsv[3]): - cols_gff = [ - cols_tsv[0].strip(), - "tRNAscan", - "tRNA", - cols_tsv[2].strip(), - cols_tsv[3].strip(), - cols_tsv[8], - "+", - ".", - 'ID=trna.%s;Anticodon=%s;Codon=tRNA-%s' - % (cols_tsv[1], cols_tsv[5].lower(), cols_tsv[4]), - ] + cols_gff = [ + cols_tsv[0].strip(), + "tRNAscan", + "tRNA", + cols_tsv[2].strip(), + cols_tsv[3].strip(), + cols_tsv[8], + "+", + ".", + "ID=trna.%s;Anticodon=%s;Codon=tRNA-%s" + % (cols_tsv[1], cols_tsv[5].lower(), cols_tsv[4]), + ] else: - cols_gff = [ - cols_tsv[0].strip(), - "tRNAscan", - "tRNA", - cols_tsv[3].strip(), - cols_tsv[2].strip(), - cols_tsv[8], - "-", - ".", - 'ID=trna.%s;Anticodon=%s;Codon=tRNA-%s' - % (cols_tsv[1], cols_tsv[5].lower(), cols_tsv[4]), - ] - print ("\t".join(cols_gff)) + cols_gff = [ + cols_tsv[0].strip(), + "tRNAscan", + "tRNA", + cols_tsv[3].strip(), + cols_tsv[2].strip(), + cols_tsv[8], + "-", + ".", + "ID=trna.%s;Anticodon=%s;Codon=tRNA-%s" + % (cols_tsv[1], cols_tsv[5].lower(), cols_tsv[4]), + ] + print("\t".join(cols_gff)) diff --git a/cpt_type_filter/filter_type.py b/cpt_type_filter/filter_type.py index 471badb..7ba8537 100755 --- a/cpt_type_filter/filter_type.py +++ b/cpt_type_filter/filter_type.py @@ -21,15 +21,15 @@ ) rec.features = [] for x in tempFeats: - rec.features.append(x) + rec.features.append(x) for x in rec.features: - if "Parent" in x.qualifiers.keys(): - 
found = 0
+        for seek in x.qualifiers["Parent"]:
+            for y in rec.features:
+                if y.id == seek:
+                    found += 1
+                    break
+        if found < len(x.qualifiers["Parent"]):
+            del x.qualifiers["Parent"]
 gffWrite([rec], sys.stdout)
diff --git a/cpt_type_filter/gff3.py b/cpt_type_filter/gff3.py
index d4795d4..48496c3 100755
--- a/cpt_type_filter/gff3.py
+++ b/cpt_type_filter/gff3.py
@@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs):
     if "type" in kwargs:
         return str(feature.type).upper() == str(kwargs["type"]).upper()
     elif "types" in kwargs:
-      for x in kwargs["types"]:
-        if str(feature.type).upper() == str(x).upper():
-          return True
-      return False
+        for x in kwargs["types"]:
+            if str(feature.type).upper() == str(x).upper():
+                return True
+        return False
 
     raise Exception("Incorrect feature_test_type call, need type or types")
 
diff --git a/cpt_wig_rebase/gff3.py b/cpt_wig_rebase/gff3.py
index d4795d4..48496c3 100755
--- a/cpt_wig_rebase/gff3.py
+++ b/cpt_wig_rebase/gff3.py
@@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs):
     if "type" in kwargs:
         return str(feature.type).upper() == str(kwargs["type"]).upper()
     elif "types" in kwargs:
-      for x in kwargs["types"]:
-        if str(feature.type).upper() == str(x).upper():
-          return True
-      return False
+        for x in kwargs["types"]:
+            if str(feature.type).upper() == str(x).upper():
+                return True
+        return False
 
     raise Exception("Incorrect feature_test_type call, need type or types")
 
diff --git a/cpt_wig_rebase/wig_rebase.py b/cpt_wig_rebase/wig_rebase.py
index 624a5e4..5db395c 100755
--- a/cpt_wig_rebase/wig_rebase.py
+++ b/cpt_wig_rebase/wig_rebase.py
@@ -64,9 +64,9 @@ def rebase_wig(parent, wigData, protein2dna=False, map_by="ID"):
         # No passthrough
         current_id = re.findall("chrom=([^ ]+)", line)[0]
         try:
-          current_ft = locations[current_id]
+            current_ft = locations[current_id]
         except:
-          continue
+            continue
         # Update max value
         if current_ft.end > maxFtLoc:
             maxFtLoc = current_ft.end
diff --git a/cpt_xmfa/xmfa_process.py b/cpt_xmfa/xmfa_process.py
index 7f74ca6..c098006 100644
--- a/cpt_xmfa/xmfa_process.py
+++ b/cpt_xmfa/xmfa_process.py
@@ -7,8 +7,7 @@
 
 
 def parse_xmfa(xmfa):
-    """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
-    """
+    """Simple XMFA parser until https://github.com/biopython/biopython/pull/544"""
     current_lcb = []
     current_seq = {}
     for line in xmfa.readlines():
@@ -48,8 +47,7 @@ def parse_xmfa(xmfa):
 
 
 def percent_identity(a, b):
-    """Calculate % identity, ignoring gaps in the host sequence
-    """
+    """Calculate % identity, ignoring gaps in the host sequence"""
     match = 0
     mismatch = 0
     for char_a, char_b in zip(list(a), list(b)):
@@ -66,8 +64,7 @@ def percent_identity(a, b):
 
 
 def get_fasta_ids(sequences):
-    """Returns a list of fasta records in the order they appear
-    """
+    """Returns a list of fasta records in the order they appear"""
     ids = []
     for seq in SeqIO.parse(sequences, "fasta"):
         ids.append(seq.id)
diff --git a/external/fix-aragorn-gff3.py b/external/fix-aragorn-gff3.py
index 132086a..51753a5 100755
--- a/external/fix-aragorn-gff3.py
+++ b/external/fix-aragorn-gff3.py
@@ -13,19 +13,22 @@ def fixed_feature(rec):
 
     for idx, feature in enumerate(
         feature_lambda(
-            rec.features, feature_test_type, {"types": ["tRNA", "tmRNA"]}, subfeatures=True
+            rec.features,
+            feature_test_type,
+            {"types": ["tRNA", "tmRNA"]},
+            subfeatures=True,
         )
     ):
-
+
         fid = "%s-%03d" % (feature.type, 1 + idx)
         try:
             name = [feature.type + "-" + feature.qualifiers["Codon"][0]]
         except KeyError:
-            name = [feature.qualifiers['product'][0]]
+            name = [feature.qualifiers["product"][0]]
         try:
-          origSource = feature.qualifiers["source"][0]
+            origSource = feature.qualifiers["source"][0]
         except:
-          origSource = "."
+            origSource = "."
         gene = gffSeqFeature(
             location=feature.location,
             type="gene",
diff --git a/external/transterm.py b/external/transterm.py
index c547687..9d47650 100755
--- a/external/transterm.py
+++ b/external/transterm.py
@@ -42,7 +42,7 @@ def build_expterm():
 
 def generate_annotation_file(gff3):
     # TODO: cleanup
-    t = tempfile.NamedTemporaryFile(mode="w",delete=False, suffix=".coords")
+    t = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".coords")
     for rec in gffParse(gff3):
         features = feature_lambda(
             rec.features, feature_test_type, {"type": "CDS"}, subfeatures=False
@@ -77,10 +77,11 @@ def run_transterm(expterm, fasta, annotations):
 def pairwise(it):
     it = iter(it)
     while True:
-      try:
-        yield next(it), next(it)
-      except StopIteration:
-        return
+        try:
+            yield next(it), next(it)
+        except StopIteration:
+            return
+
 
 def parse_transterm(data):
     data = data.decode("utf-8")
diff --git a/external/wig_rebase.py b/external/wig_rebase.py
index 7dde230..6c6021b 100755
--- a/external/wig_rebase.py
+++ b/external/wig_rebase.py
@@ -64,9 +64,9 @@ def rebase_wig(parent, wigData, protein2dna=False, map_by="ID"):
         # No passthrough
         current_id = re.findall("chrom=([^ ]+)", line)[0]
         try:
-          current_ft = locations[current_id]
+            current_ft = locations[current_id]
         except:
-          continue
+            continue
         # Update max value
         if current_ft.end > maxFtLoc:
             maxFtLoc = current_ft.end
diff --git a/gff3/filter_type.py b/gff3/filter_type.py
index 471badb..7ba8537 100755
--- a/gff3/filter_type.py
+++ b/gff3/filter_type.py
@@ -21,15 +21,15 @@
 )
 rec.features = []
 for x in tempFeats:
-  rec.features.append(x)
+    rec.features.append(x)
 for x in rec.features:
-  if "Parent" in x.qualifiers.keys():
-    found = 0
-    for seek in x.qualifiers["Parent"]:
-      for y in rec.features:
-        if y.id == seek:
-          found += 1
-          break
-    if found < len(x.qualifiers["Parent"]):
-      del x.qualifiers["Parent"]
+    if "Parent" in x.qualifiers.keys():
+        found = 0
+        for seek in x.qualifiers["Parent"]:
+            for y in rec.features:
+                if y.id == seek:
+                    found += 1
+                    break
+        if found < len(x.qualifiers["Parent"]):
+            del x.qualifiers["Parent"]
 gffWrite([rec], sys.stdout)
diff --git a/gff3/gff3.py b/gff3/gff3.py
index d4795d4..48496c3 100755
--- a/gff3/gff3.py
+++ b/gff3/gff3.py
@@ -97,10 +97,10 @@ def feature_test_type(feature, **kwargs):
     if "type" in kwargs:
         return str(feature.type).upper() == str(kwargs["type"]).upper()
     elif "types" in kwargs:
-      for x in kwargs["types"]:
-        if str(feature.type).upper() == str(x).upper():
-          return True
-      return False
+        for x in kwargs["types"]:
+            if str(feature.type).upper() == str(x).upper():
+                return True
+        return False
 
     raise Exception("Incorrect feature_test_type call, need type or types")
 
diff --git a/gff3/gff3_add_parents_to_cds.py b/gff3/gff3_add_parents_to_cds.py
index c15a254..33388f1 100755
--- a/gff3/gff3_add_parents_to_cds.py
+++ b/gff3/gff3_add_parents_to_cds.py
@@ -3,7 +3,8 @@
 import logging
 import argparse
 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
-#from Bio.SeqFeature import SeqFeature
+
+# from Bio.SeqFeature import SeqFeature
 from gff3 import feature_lambda, feature_test_type
 
 logging.basicConfig(level=logging.INFO)
@@ -26,7 +27,11 @@ def fixed_feature(rec):
         mRNA = gffSeqFeature(
             location=feature.location,
             type="mRNA",
-            qualifiers={"source": ["cpt.fixModel"], "ID": ["%s.mRNA" % fid], "Parent": gene.qualifiers["ID"]},
+            qualifiers={
+                "source": ["cpt.fixModel"],
+                "ID": ["%s.mRNA" % fid],
+                "Parent": gene.qualifiers["ID"],
+            },
         )
         feature.qualifiers["ID"] = [fid + ".CDS"]
         feature.qualifiers["Parent"] = mRNA.qualifiers["ID"]
diff --git a/gff3/gff3_extract_sequence.py b/gff3/gff3_extract_sequence.py
index 22e0ca4..8e66746 100755
--- a/gff3/gff3_extract_sequence.py
+++ b/gff3/gff3_extract_sequence.py
@@ -17,7 +17,6 @@ def main(fasta, gff3, feature_filter=None, nodesc=False):
     if feature_filter == "nice_cds":
         from gff2gb import gff3_to_genbank as cpt_Gff2Gbk
 
-
         for rec in cpt_Gff2Gbk(gff3, fasta, 11):
             seenList = {}
 
@@ -66,8 +65,10 @@ def main(fasta, gff3, feature_filter=None, nodesc=False):
             else:
                 feat.qualifiers["ID"] = [feat._ID]
             product = feat.qualifiers.get("product", "")
-            description = "{1} [Location={0.location};ID={0.qualifiers[ID][0]}]".format(
-                feat, product
-            )
+            description = (
+                "{1} [Location={0.location};ID={0.qualifiers[ID][0]}]".format(
+                    feat, product
+                )
+            )
             yield [
                 SeqRecord(
@@ -116,9 +117,21 @@ def main(fasta, gff3, feature_filter=None, nodesc=False):
                 description = ""
             else:
                 if feat.strand == -1:
-                    important_data = {"Location": FeatureLocation(feat.location.start + 1, feat.location.end - feat.phase, feat.strand)}
+                    important_data = {
+                        "Location": FeatureLocation(
+                            feat.location.start + 1,
+                            feat.location.end - feat.phase,
+                            feat.strand,
+                        )
+                    }
                 else:
-                    important_data = {"Location": FeatureLocation(feat.location.start + 1 + feat.phase, feat.location.end, feat.strand)}
+                    important_data = {
+                        "Location": FeatureLocation(
+                            feat.location.start + 1 + feat.phase,
+                            feat.location.end,
+                            feat.strand,
+                        )
+                    }
 
                 if "Name" in feat.qualifiers:
                     important_data["Name"] = feat.qualifiers.get("Name", [""])[0]
@@ -130,48 +143,65 @@ def main(fasta, gff3, feature_filter=None, nodesc=False):
                     ]
                 )
             )
-            #if feat.id == "CPT_Privateer_006.p01":
-                #print(feat)
-                #exit()
-
+            # if feat.id == "CPT_Privateer_006.p01":
+            # print(feat)
+            # exit()
+
             if isinstance(feat.location, CompoundLocation):
-              finSeq = ""
-              if feat.strand == -1:
-                for x in feat.location.parts:
-                  finSeq += str((rec.seq[feat.location.start: feat.location.end - feat.phase]).reverse_complement())
-              else:
-                for x in feat.location.parts:
-                  finSeq += str(rec.seq[feat.location.start + feat.phase: feat.location.end])
-              yield [
-                  SeqRecord(
-                      finSeq,
-                      id=nid.replace(" ", "-"),
-                      description=description,
-                  )
-              ]
+                finSeq = ""
+                if feat.strand == -1:
+                    for x in feat.location.parts:
+                        finSeq += str(
+                            (
+                                rec.seq[
+                                    feat.location.start : feat.location.end
+                                    - feat.phase
+                                ]
+                            ).reverse_complement()
+                        )
+                else:
+                    for x in feat.location.parts:
+                        finSeq += str(
+                            rec.seq[
+                                feat.location.start
+                                + feat.phase : feat.location.end
+                            ]
+                        )
+                yield [
+                    SeqRecord(
+                        finSeq,
+                        id=nid.replace(" ", "-"),
+                        description=description,
+                    )
+                ]
             elif feat.strand == -1:
-              yield [
-                  SeqRecord(
-                      (rec.seq[feat.location.start: feat.location.end - feat.phase]).reverse_complement(),
-                      id=nid.replace(" ", "-"),
-                      description=description,
-                  )
-              ]
+                yield [
+                    SeqRecord(
+                        (
+                            rec.seq[
+                                feat.location.start : feat.location.end - feat.phase
+                            ]
+                        ).reverse_complement(),
+                        id=nid.replace(" ", "-"),
+                        description=description,
+                    )
+                ]
             else:
-              yield [
-                  SeqRecord(
-                      #feat.extract(rec).seq,
-                      rec.seq[feat.location.start + feat.phase: feat.location.end],
-                      id=nid.replace(" ", "-"),
-                      description=description,
-                  )
-              ]
+                yield [
+                    SeqRecord(
+                        # feat.extract(rec).seq,
+                        rec.seq[
+                            feat.location.start + feat.phase : feat.location.end
+                        ],
+                        id=nid.replace(" ", "-"),
+                        description=description,
+                    )
+                ]
         rec.features = newfeats
         rec.annotations = {}
-        #gffWrite([rec], sys.stdout)
+        # gffWrite([rec], sys.stdout)
     else:
         seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
-
+
         for rec in gffParse(gff3, base_dict=seq_dict):
             noMatch = True
             if "Alias" in rec.features[0].qualifiers.keys():
@@ -201,9 +231,21 @@ def main(fasta, gff3, feature_filter=None, nodesc=False):
             description = ""
         else:
             if feat.strand == -1:
-                important_data = {"Location": FeatureLocation(feat.location.start + 1, feat.location.end - feat.phase, feat.strand)}
+                important_data = {
+                    "Location": FeatureLocation(
+                        feat.location.start + 1,
+                        feat.location.end - feat.phase,
+                        feat.strand,
+                    )
+                }
             else:
-                important_data = {"Location": FeatureLocation(feat.location.start + 1 + feat.phase, feat.location.end, feat.strand)}
+                important_data = {
+                    "Location": FeatureLocation(
+                        feat.location.start + 1 + feat.phase,
+                        feat.location.end,
+                        feat.strand,
+                    )
+                }
 
             if "Name" in feat.qualifiers:
                 important_data["Name"] = feat.qualifiers.get("Name", [""])[0]
@@ -217,40 +259,58 @@ def main(fasta, gff3, feature_filter=None, nodesc=False):
             )
 
             if isinstance(feat.location, CompoundLocation):
-              finSeq = ""
-              if feat.strand == -1:
-                for x in feat.location.parts:
-                  finSeq += str((rec.seq[x.start: x.end - feat.phase]).reverse_complement())
-              else:
-                for x in feat.location.parts:
-                  finSeq += str(rec.seq[x.start + feat.phase: x.end])
-              yield [
-                  SeqRecord(
-                      Seq(finSeq),
-                      id=id.replace(" ", "-"),
-                      description=description,
-                  )
-              ]
+                finSeq = ""
+                if feat.strand == -1:
+                    for x in feat.location.parts:
+                        finSeq += str(
+                            (
+                                rec.seq[x.start : x.end - feat.phase]
+                            ).reverse_complement()
+                        )
+                else:
+                    for x in feat.location.parts:
+                        finSeq += str(rec.seq[x.start + feat.phase : x.end])
+                yield [
+                    SeqRecord(
+                        Seq(finSeq),
+                        id=id.replace(" ", "-"),
+                        description=description,
+                    )
+                ]
             else:
-              if feat.strand == -1:
-                yield [
-                    SeqRecord(
-                        seq=Seq(str(rec.seq[feat.location.start: feat.location.end - feat.phase])).reverse_complement(),
-                        id=id.replace(" ", "-"),
-                        description=description,
-                    )
-                ]
-              else:
-                yield [
-                    SeqRecord(
-                        #feat.extract(rec).seq,
-                        seq=Seq(str(rec.seq[feat.location.start + feat.phase: feat.location.end])),
-                        id=id.replace(" ", "-"),
-                        description=description,
-                    )
-                ]
+                if feat.strand == -1:
+                    yield [
+                        SeqRecord(
+                            seq=Seq(
+                                str(
+                                    rec.seq[
+                                        feat.location.start : feat.location.end
+                                        - feat.phase
+                                    ]
+                                )
+                            ).reverse_complement(),
+                            id=id.replace(" ", "-"),
+                            description=description,
+                        )
+                    ]
+                else:
+                    yield [
+                        SeqRecord(
+                            # feat.extract(rec).seq,
+                            seq=Seq(
+                                str(
+                                    rec.seq[
+                                        feat.location.start
+                                        + feat.phase : feat.location.end
+                                    ]
+                                )
+                            ),
+                            id=id.replace(" ", "-"),
+                            description=description,
+                        )
+                    ]
 
 
 if __name__ == "__main__":
@@ -267,9 +327,9 @@ def main(fasta, gff3, feature_filter=None, nodesc=False):
     )
     args = parser.parse_args()
     for seq in main(**vars(args)):
-        #if isinstance(seq, list):
+        # if isinstance(seq, list):
         #    for x in seq:
         #        print(type(x.seq))
         #        SeqIO.write(x, sys.stdout, "fasta")
-        #else:
-        SeqIO.write(seq, sys.stdout, "fasta")
+        # else:
+        SeqIO.write(seq, sys.stdout, "fasta")
diff --git a/gff3/gff3_require_sd.py b/gff3/gff3_require_sd.py
index cc12501..abb8fed 100755
--- a/gff3/gff3_require_sd.py
+++ b/gff3/gff3_require_sd.py
@@ -51,9 +51,17 @@ def require_shinefind(gff3, fasta):
                 )
                 gene.sub_features.append(sd_features[0])
                 if gene.location.start > sd_features[0].location.start:
-                    gene.location = FeatureLocation(int(sd_features[0].location.start), int(gene.location.end), gene.location.strand)
+                    gene.location = FeatureLocation(
+                        int(sd_features[0].location.start),
+                        int(gene.location.end),
+                        gene.location.strand,
+                    )
                 if gene.location.end < sd_features[0].location.end:
-                    gene.location = FeatureLocation(int(gene.location.start), int(sd_features[0].location.end), gene.location.strand)
+                    gene.location = FeatureLocation(
+                        int(gene.location.start),
+                        int(sd_features[0].location.end),
+                        gene.location.strand,
+                    )
                 good_genes.append(gene)
         record.features = good_genes
 
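
Review note: the two re-wrapped FeatureLocation calls in gff3_require_sd.py encode one rule: widen the gene so it fully covers the Shine-Dalgarno feature that was just attached to it. A condensed, equivalent sketch (the helper name is hypothetical; the tool itself applies the two boundary checks separately, as shown above):

    from Bio.SeqFeature import FeatureLocation

    def extend_to_cover(gene_loc, sd_loc):
        # Take the union of the two spans, keeping the gene's strand.
        start = min(int(gene_loc.start), int(sd_loc.start))
        end = max(int(gene_loc.end), int(sd_loc.end))
        return FeatureLocation(start, end, gene_loc.strand)
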
diff --git a/gff3/intersect_and_adjacent.py b/gff3/intersect_and_adjacent.py
index 0d38ad1..674c9dc 100755
--- a/gff3/intersect_and_adjacent.py
+++ b/gff3/intersect_and_adjacent.py
@@ -9,12 +9,14 @@
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(__name__)
 
+
 def validFeat(rec):
     for feat in rec.features:
-        if feat.type != 'remark' and feat.type != 'annotation':
-          return True
+        if feat.type != "remark" and feat.type != "annotation":
+            return True
     return False
 
+
 def treeFeatures(features, window):
     for feat in features:
         # Interval(begin, end, data)
@@ -23,10 +25,12 @@ def treeFeatures(features, window):
             int(feat.location.end) + int(window),
             feat.id,
         )
+
+
 def treeFeatures_noRem(features, window):
     for feat in features:
-        if feat.type == 'remark' or feat.type == 'annotation':
-          continue
+        if feat.type == "remark" or feat.type == "annotation":
+            continue
         # Interval(begin, end, data)
         yield Interval(
             int(feat.location.start) - int(window),
@@ -42,167 +46,208 @@ def intersect(a, b, window, stranding):
     rec_b_out = []
     maxLen = min(len(rec_a), len(rec_b))
     iterate = 0
-
     if maxLen > 0:
         while iterate < maxLen:
-          rec_a_i = rec_a[iterate]
-          rec_b_i = rec_b[iterate]
+            rec_a_i = rec_a[iterate]
+            rec_b_i = rec_b[iterate]
 
-          if (not validFeat(rec_a_i)) or (not validFeat(rec_b_i)):
-            rec_a_out.append(SeqRecord(rec_a[iterate].seq, rec_a[iterate].id, rec_a[iterate].name, rec_a[iterate].description, rec_a[iterate].dbxrefs, [], rec_a[iterate].annotations))
-            rec_b_out.append(SeqRecord(rec_b[iterate].seq, rec_b[iterate].id, rec_b[iterate].name, rec_b[iterate].description, rec_b[iterate].dbxrefs, [], rec_b[iterate].annotations))
-            iterate += 1
-            continue
+            if (not validFeat(rec_a_i)) or (not validFeat(rec_b_i)):
+                rec_a_out.append(
+                    SeqRecord(
+                        rec_a[iterate].seq,
+                        rec_a[iterate].id,
+                        rec_a[iterate].name,
+                        rec_a[iterate].description,
+                        rec_a[iterate].dbxrefs,
+                        [],
+                        rec_a[iterate].annotations,
+                    )
+                )
+                rec_b_out.append(
+                    SeqRecord(
+                        rec_b[iterate].seq,
+                        rec_b[iterate].id,
+                        rec_b[iterate].name,
+                        rec_b[iterate].description,
+                        rec_b[iterate].dbxrefs,
+                        [],
+                        rec_b[iterate].annotations,
+                    )
+                )
+                iterate += 1
+                continue
 
-          a_neg = []
-          a_pos = []
-          b_neg = []
-          b_pos = []
-          tree_a = []
-          tree_b = []
-          if stranding == True:
-            for feat in rec_a_i.features:
-              if feat.type == 'remark' or feat.type == 'annotation':
-                continue
-              if feat.strand > 0:
-                a_pos.append(
-                    Interval(
-                        int(feat.location.start) - int(window),
-                        int(feat.location.end) + int(window),
-                        feat.id,
-                    )
-                )
-              else:
-                a_neg.append(
-                    Interval(
-                        int(feat.location.start) - int(window),
-                        int(feat.location.end) + int(window),
-                        feat.id,
-                    )
-                )
-            for feat in rec_b_i.features:
-              if feat.type == 'remark' or feat.type == 'annotation':
-                continue
-              if feat.strand > 0:
-                b_pos.append(
-                    Interval(
-                        int(feat.location.start) - int(window),
-                        int(feat.location.end) + int(window),
-                        feat.id,
-                    )
-                )
-              else:
-                b_neg.append(
-                    Interval(
-                        int(feat.location.start) - int(window),
-                        int(feat.location.end) + int(window),
-                        feat.id,
-                    )
-                )
-
-          else:
-            for feat in rec_a_i.features:
-              if feat.type == 'remark' or feat.type == 'annotation':
-                continue
-              tree_a.append(
-                  Interval(
-                      int(feat.location.start) - int(window),
-                      int(feat.location.end) + int(window),
-                      feat.id,
-                  )
-              )
-            for feat in rec_b_i.features:
-              if feat.type == 'remark' or feat.type == 'annotation':
-                continue
-              tree_b.append(
-                  Interval(
-                      int(feat.location.start) - int(window),
-                      int(feat.location.end) + int(window),
-                      feat.id,
-                  )
-              )
-          if stranding:
-            # builds interval tree from Interval objects of form (start, end, id) for each feature
-            # tree_a = IntervalTree(list(treeFeatures_noRem(rec_a_i.features, window)))
-            #tree_b = IntervalTree(list(treeFeatures_noRem(rec_b_i.features, window)))
-            #else:
-            tree_a_pos = IntervalTree(a_pos)
-            tree_a_neg = IntervalTree(a_neg)
-            tree_b_pos = IntervalTree(b_pos)
-            tree_b_neg = IntervalTree(b_neg)
-          else:
-            tree_a = IntervalTree(tree_a)
-            tree_b = IntervalTree(tree_b)
+            a_neg = []
+            a_pos = []
+            b_neg = []
+            b_pos = []
+            tree_a = []
+            tree_b = []
+            if stranding == True:
+                for feat in rec_a_i.features:
+                    if feat.type == "remark" or feat.type == "annotation":
+                        continue
+                    if feat.strand > 0:
+                        a_pos.append(
+                            Interval(
+                                int(feat.location.start) - int(window),
+                                int(feat.location.end) + int(window),
+                                feat.id,
+                            )
+                        )
+                    else:
+                        a_neg.append(
+                            Interval(
+                                int(feat.location.start) - int(window),
+                                int(feat.location.end) + int(window),
+                                feat.id,
+                            )
+                        )
+                for feat in rec_b_i.features:
+                    if feat.type == "remark" or feat.type == "annotation":
+                        continue
+                    if feat.strand > 0:
+                        b_pos.append(
+                            Interval(
+                                int(feat.location.start) - int(window),
+                                int(feat.location.end) + int(window),
+                                feat.id,
+                            )
+                        )
+                    else:
+                        b_neg.append(
+                            Interval(
+                                int(feat.location.start) - int(window),
+                                int(feat.location.end) + int(window),
+                                feat.id,
+                            )
+                        )
+
+            else:
+                for feat in rec_a_i.features:
+                    if feat.type == "remark" or feat.type == "annotation":
+                        continue
+                    tree_a.append(
+                        Interval(
+                            int(feat.location.start) - int(window),
+                            int(feat.location.end) + int(window),
+                            feat.id,
+                        )
+                    )
+                for feat in rec_b_i.features:
+                    if feat.type == "remark" or feat.type == "annotation":
+                        continue
+                    tree_b.append(
+                        Interval(
+                            int(feat.location.start) - int(window),
+                            int(feat.location.end) + int(window),
+                            feat.id,
+                        )
+                    )
+            if stranding:
+                # builds interval tree from Interval objects of form (start, end, id) for each feature
+                # tree_a = IntervalTree(list(treeFeatures_noRem(rec_a_i.features, window)))
+                # tree_b = IntervalTree(list(treeFeatures_noRem(rec_b_i.features, window)))
+                # else:
+                tree_a_pos = IntervalTree(a_pos)
+                tree_a_neg = IntervalTree(a_neg)
+                tree_b_pos = IntervalTree(b_pos)
+                tree_b_neg = IntervalTree(b_neg)
+            else:
+                tree_a = IntervalTree(tree_a)
+                tree_b = IntervalTree(tree_b)
 
-          # Used to map ids back to features later
-          rec_a_map = {f.id: f for f in rec_a_i.features}
-          rec_b_map = {f.id: f for f in rec_b_i.features}
-
-          rec_a_hits_in_b = []
-          rec_b_hits_in_a = []
-
-          for feature in rec_a_i.features:
-            # Save each feature in rec_a that overlaps a feature in rec_b
-            # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end)))
-
-            if feature.type == "remark" or feature.type == "annotation":
-              continue
-
-            if stranding == False:
-              hits = tree_b[int(feature.location.start) : int(feature.location.end)]
-
-
-              # feature id is saved in interval result.data, use map to get full feature
-              for hit in hits:
-                rec_a_hits_in_b.append(rec_b_map[hit.data])
-
-            else:
-              if feature.strand > 0:
-                hits_pos = tree_b_pos[
-                    int(feature.location.start) : int(feature.location.end)
-                ]
-                for hit in hits_pos:
-                  rec_a_hits_in_b.append(rec_b_map[hit.data])
-              else:
-                hits_neg = tree_b_neg[
-                    int(feature.location.start) : int(feature.location.end)
-                ]
-                for hit in hits_neg:
-                  rec_a_hits_in_b.append(rec_b_map[hit.data])
-
-          for feature in rec_b_i.features:
-            if feature.type == "remark" or feature.type == "annotation":
-              continue
-
-            if stranding == False:
-              hits = tree_a[int(feature.location.start) : int(feature.location.end)]
-
-              # feature id is saved in interval result.data, use map to get full feature
-              for hit in hits:
-                rec_b_hits_in_a.append(rec_a_map[hit.data])
-
-            else:
-              if feature.strand > 0:
-                hits_pos = tree_a_pos[
-                    int(feature.location.start) : int(feature.location.end)
-                ]
-                for hit in hits_pos:
-                  rec_b_hits_in_a.append(rec_a_map[hit.data])
-              else:
-                hits_neg = tree_a_neg[
-                    int(feature.location.start) : int(feature.location.end)
-                ]
-                for hit in hits_neg:
-                  rec_b_hits_in_a.append(rec_a_map[hit.data])
-
-          # Remove duplicate features using sets
-          rec_a_out.append(SeqRecord(rec_a[iterate].seq, rec_a[iterate].id, rec_a[iterate].name, rec_a[iterate].description, rec_a[iterate].dbxrefs, sorted(set(rec_a_hits_in_b), key=lambda feat: feat.location.start), rec_a[iterate].annotations))
-          rec_b_out.append(SeqRecord(rec_b[iterate].seq, rec_b[iterate].id, rec_b[iterate].name, rec_b[iterate].description, rec_b[iterate].dbxrefs, sorted(set(rec_b_hits_in_a), key=lambda feat: feat.location.start), rec_b[iterate].annotations))
-          iterate += 1
+            # Used to map ids back to features later
+            rec_a_map = {f.id: f for f in rec_a_i.features}
+            rec_b_map = {f.id: f for f in rec_b_i.features}
+
+            rec_a_hits_in_b = []
+            rec_b_hits_in_a = []
+
+            for feature in rec_a_i.features:
+                # Save each feature in rec_a that overlaps a feature in rec_b
+                # hits = tree_b.find_range((int(feature.location.start), int(feature.location.end)))
+
+                if feature.type == "remark" or feature.type == "annotation":
+                    continue
+
+                if stranding == False:
+                    hits = tree_b[
+                        int(feature.location.start) : int(feature.location.end)
+                    ]
+
+                    # feature id is saved in interval result.data, use map to get full feature
+                    for hit in hits:
+                        rec_a_hits_in_b.append(rec_b_map[hit.data])
+
+                else:
+                    if feature.strand > 0:
+                        hits_pos = tree_b_pos[
+                            int(feature.location.start) : int(feature.location.end)
+                        ]
+                        for hit in hits_pos:
+                            rec_a_hits_in_b.append(rec_b_map[hit.data])
+                    else:
+                        hits_neg = tree_b_neg[
+                            int(feature.location.start) : int(feature.location.end)
+                        ]
+                        for hit in hits_neg:
+                            rec_a_hits_in_b.append(rec_b_map[hit.data])
+
+            for feature in rec_b_i.features:
+                if feature.type == "remark" or feature.type == "annotation":
+                    continue
+
+                if stranding == False:
+                    hits = tree_a[
+                        int(feature.location.start) : int(feature.location.end)
+                    ]
+
+                    # feature id is saved in interval result.data, use map to get full feature
+                    for hit in hits:
+                        rec_b_hits_in_a.append(rec_a_map[hit.data])
+
+                else:
+                    if feature.strand > 0:
+                        hits_pos = tree_a_pos[
+                            int(feature.location.start) : int(feature.location.end)
+                        ]
+                        for hit in hits_pos:
+                            rec_b_hits_in_a.append(rec_a_map[hit.data])
+                    else:
+                        hits_neg = tree_a_neg[
+                            int(feature.location.start) : int(feature.location.end)
+                        ]
+                        for hit in hits_neg:
+                            rec_b_hits_in_a.append(rec_a_map[hit.data])
+
+            # Remove duplicate features using sets
+            rec_a_out.append(
+                SeqRecord(
+                    rec_a[iterate].seq,
+                    rec_a[iterate].id,
+                    rec_a[iterate].name,
+                    rec_a[iterate].description,
+                    rec_a[iterate].dbxrefs,
+                    sorted(set(rec_a_hits_in_b), key=lambda feat: feat.location.start),
+                    rec_a[iterate].annotations,
+                )
+            )
+            rec_b_out.append(
+                SeqRecord(
+                    rec_b[iterate].seq,
+                    rec_b[iterate].id,
+                    rec_b[iterate].name,
+                    rec_b[iterate].description,
+                    rec_b[iterate].dbxrefs,
+                    sorted(set(rec_b_hits_in_a), key=lambda feat: feat.location.start),
+                    rec_b[iterate].annotations,
+                )
+            )
+            iterate += 1
 
     else:
         # If one input is empty, output two empty result files.
         rec_a_out = [SeqRecord(Seq(""), "none")]
@@ -235,8 +280,8 @@ def intersect(a, b, window, stranding):
 
     with open(args.oa, "w") as handle:
         for rec in a:
-          gffWrite([rec], handle)
+            gffWrite([rec], handle)
 
     with open(args.ob, "w") as handle:
         for rec in b:
-          gffWrite([rec], handle)
+            gffWrite([rec], handle)
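
Review note: the intersect() rewrite above is almost entirely re-indentation; the mechanism underneath is an intervaltree lookup in which every feature is padded by the window argument before being loaded into the tree, so "adjacent within the window" and "overlapping" both count as hits. A self-contained sketch of that query pattern (coordinates are illustrative, not data from the tool):

    from intervaltree import Interval, IntervalTree

    window = 10
    features = [(100, 200, "geneA"), (500, 650, "geneB")]
    # Pad each interval by the window on both sides, as intersect() does.
    tree = IntervalTree(
        Interval(start - window, end + window, fid) for start, end, fid in features
    )
    # Which padded features overlap the span 190..520?
    hits = tree[190:520]
    print(sorted(hit.data for hit in hits))  # ['geneA', 'geneB']

The id stored in each Interval's data slot is mapped back to the full feature through the rec_a_map/rec_b_map dicts, which is why the loops above append rec_b_map[hit.data] rather than the interval itself.
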
diff --git a/gff3/lipoP_to_gff3.py b/gff3/lipoP_to_gff3.py
index a12e080..0f7dd34 100755
--- a/gff3/lipoP_to_gff3.py
+++ b/gff3/lipoP_to_gff3.py
@@ -24,18 +24,17 @@ def lipoP_gff(lipoIn, gff3In, jBrowseOut, filterSP2):
         rowElem = row.split("\t")
 
         orgID = rowElem[0]
-
+
         if filterSP2:
-          if rowElem[2] == "CleavII":
-            if not (orgID in orgIDs.keys()):
-              orgIDs[orgID] = []
-            orgIDs[orgID].append(int(rowElem[3]))  # , int(rowElem[4])))
+            if rowElem[2] == "CleavII":
+                if not (orgID in orgIDs.keys()):
+                    orgIDs[orgID] = []
+                orgIDs[orgID].append(int(rowElem[3]))  # , int(rowElem[4])))
         else:
-          if rowElem[2] in "CleavII":
-            if not (orgID in orgIDs.keys()):
-              orgIDs[orgID] = []
-            orgIDs[orgID].append(int(rowElem[3]))  # , int(rowElem[4])))
-
+            if rowElem[2] in "CleavII":
+                if not (orgID in orgIDs.keys()):
+                    orgIDs[orgID] = []
+                orgIDs[orgID].append(int(rowElem[3]))  # , int(rowElem[4])))
 
     # Rebase
     for gff in gffParse(gff3In):
@@ -61,7 +60,7 @@ def lipoP_gff(lipoIn, gff3In, jBrowseOut, filterSP2):
                 keepSeq.append(xRec)
             continue
 
-        #if jBrowseOut:
+        # if jBrowseOut:
         #    xRec.sub_features = []
 
         i = 0
@@ -103,7 +102,7 @@ def lipoP_gff(lipoIn, gff3In, jBrowseOut, filterSP2):
     )
     parser.add_argument(
         "--filterSP2",
-        action='store_true',
+        action="store_true",
         help="Filter for only SPII sites",
     )
     args = parser.parse_args()
diff --git a/gff3/shinefind.py b/gff3/shinefind.py
index c51665e..509cff3 100755
--- a/gff3/shinefind.py
+++ b/gff3/shinefind.py
@@ -53,10 +53,10 @@ def list_sds(self, sequence, sd_min=3, sd_max=17):
         for regex in self.sd_reg:
             for match in regex.finditer(sequence):
                 spacing = len(sequence) - len(match.group()) - match.start()
-                if sd_max >= spacing+sd_min and spacing+sd_min >= sd_min:
-                    #if the spacing is within gap limits, add
-                    #(search space is [sd_max+7 .. sd_min] so actual gap is spacing+sd_min)
-                    #print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min))
+                if sd_max >= spacing + sd_min and spacing + sd_min >= sd_min:
+                    # if the spacing is within gap limits, add
+                    # (search space is [sd_max+7 .. sd_min] so actual gap is spacing+sd_min)
+                    # print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min))
                     hits.append(
                         {
                             "spacing": spacing,
@@ -66,7 +66,7 @@ def list_sds(self, sequence, sd_min=3, sd_max=17):
                             "len": len(match.group()),
                         }
                     )
-        hits = sorted(hits, key= lambda x: (-x['len'],x['spacing']))
+        hits = sorted(hits, key=lambda x: (-x["len"], x["spacing"]))
         return hits
 
     @classmethod
@@ -80,7 +80,16 @@ def highlight_sd(cls, sequence, start, end):
         )
 
     @classmethod
-    def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd_min=3, sd_max=17):
+    def to_features(
+        cls,
+        hits,
+        strand,
+        parent_start,
+        parent_end,
+        feature_id=None,
+        sd_min=3,
+        sd_max=17,
+    ):
         results = []
         for idx, hit in enumerate(hits):
             # gene            complement(124..486)
@@ -90,7 +99,7 @@ def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd
             #     -1      491     501     2       3       5
             #     -1      491     501     1       3       5
             #     -1      491     501     0       3       5
-
+
             qualifiers = {
                 "source": "CPT_ShineFind",
                 "ID": "%s.rbs-%s" % (feature_id, idx),
@@ -108,7 +117,7 @@ def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd
             # minimum absolute value of these two will be the proper gap regardless of strand
             tmp = gffSeqFeature(
                 FeatureLocation(min(start, end), max(start, end), strand=strand),
-                #FeatureLocation(min(start, end), max(start, end), strand=strand),
+                # FeatureLocation(min(start, end), max(start, end), strand=strand),
                 type="Shine_Dalgarno_sequence",
                 qualifiers=qualifiers,
             )
@@ -133,7 +142,10 @@ def testFeatureUpstream(self, feature, record, sd_min=3, sd_max=17):
 
         # Create our temp feature used to obtain correct portion of
         # genome
-        tmp = gffSeqFeature(FeatureLocation(min(start, end), max(start, end), strand=strand), type="domain")
+        tmp = gffSeqFeature(
+            FeatureLocation(min(start, end), max(start, end), strand=strand),
+            type="domain",
+        )
         seq = str(tmp.extract(record.seq))
         return self.list_sds(seq, sd_min, sd_max), start, end, seq
 
@@ -175,6 +187,7 @@ def fix_gene_boundaries(feature):
         feature.location = FeatureLocation(fmin, fmax, strand=-1)
     return feature
 
+
 def shinefind(
     fasta,
     gff3,
diff --git a/phage/intron_detection.py b/phage/intron_detection.py
index ab28ced..8b4bf5f 100755
--- a/phage/intron_detection.py
+++ b/phage/intron_detection.py
@@ -17,7 +17,7 @@
 
 
 def parse_xml(blastxml, thresh):
-    """ Parses xml file to get desired info (genes, hits, etc) """
+    """Parses xml file to get desired info (genes, hits, etc)"""
     blast = []
     discarded_records = 0
     totLen = 0
@@ -64,7 +64,7 @@ def parse_xml(blastxml, thresh):
 
 
 def filter_lone_clusters(clusters):
-    """ Removes all clusters with only one member and those with no hits """
+    """Removes all clusters with only one member and those with no hits"""
     filtered_clusters = {}
     for key in clusters:
         if len(clusters[key]) > 1 and len(key) > 0:
@@ -78,7 +78,7 @@ def test_true(feature, **kwargs):
 
 
 def parse_gff(gff3):
-    """ Extracts strand and start location to be used in cluster filtering """
+    """Extracts strand and start location to be used in cluster filtering"""
     log.debug("parse_gff3")
     gff_info = {}
     _rec = None
@@ -113,12 +113,12 @@ def parse_gff(gff3):
 
 
 def all_same(genes_list):
-    """ Returns True if all gene names in cluster are identical """
+    """Returns True if all gene names in cluster are identical"""
     return all(gene["name"] == genes_list[0]["name"] for gene in genes_list[1:])
 
 
 def remove_duplicates(clusters):
-    """ Removes clusters with multiple members but only one gene name """
+    """Removes clusters with multiple members but only one gene name"""
     filtered_clusters = {}
     for key in clusters:
         if all_same(clusters[key]):
@@ -130,7 +130,7 @@ def remove_duplicates(clusters):
 
 
 class IntronFinder(object):
-    """ IntronFinder objects are lists that contain a list of hits for every gene """
+    """IntronFinder objects are lists that contain a list of hits for every gene"""
 
     def __init__(self, gff3, blastp, thresh):
         self.blast = []
@@ -142,7 +142,7 @@ def __init__(self, gff3, blastp, thresh):
         self.blast = parse_xml(blastp, thresh)
 
     def create_clusters(self):
-        """ Finds 2 or more genes with matching hits """
+        """Finds 2 or more genes with matching hits"""
         clusters = {}
         for gene in self.blast:
             for hit in gene:
@@ -163,7 +163,7 @@ def create_clusters(self):
         self.clusters = filter_lone_clusters(clusters)
 
     def check_strand(self):
-        """ filters clusters for genes on the same strand """
+        """filters clusters for genes on the same strand"""
         filtered_clusters = {}
         for key in self.clusters:
             pos_strand = []
diff --git a/util/cpt_convert_glimmer_to_gff3.py b/util/cpt_convert_glimmer_to_gff3.py
index ab18803..d077cb5 100755
--- a/util/cpt_convert_glimmer_to_gff3.py
+++ b/util/cpt_convert_glimmer_to_gff3.py
@@ -45,13 +45,13 @@ def glimmer3_to_gff3(glimmer, genome):
             start -= 1
 
             if start > end:
-                #gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome
-                #-------------start<=======|sequence end|========>end------
+                # gene found on boundary (ex [4000, 200]) from glimmer assuming circular genome
+                # -------------start<=======|sequence end|========>end------
                 if strand > 0:
                     end = len(current_record)
                 else:
                     start = 0
-                gene_id+="_truncated"
+                gene_id += "_truncated"
 
             cds_feat = gffSeqFeature(
                 FeatureLocation(start, end),
@@ -62,7 +62,7 @@ def glimmer3_to_gff3(glimmer, genome):
                     "source": "Glimmer3",
                     "ID": "%s.cds_%s" % (current_record.id, gene_id),
                 },
-                source="Glimmer3"
+                source="Glimmer3",
             )
 
             gene = gffSeqFeature(
@@ -74,7 +74,7 @@ def glimmer3_to_gff3(glimmer, genome):
                     "source": "Glimmer3",
                     "ID": "%s.%s" % (current_record.id, gene_id),
                 },
-                source="Glimmer3"
+                source="Glimmer3",
             )
             gene.sub_features = [cds_feat]
             current_record.features.append(gene)
diff --git a/util/cpt_convert_mga_to_gff3.py b/util/cpt_convert_mga_to_gff3.py
index e2d51d6..fddd73e 100755
--- a/util/cpt_convert_mga_to_gff3.py
+++ b/util/cpt_convert_mga_to_gff3.py
@@ -67,7 +67,7 @@ def mga_to_gff3(mga_output, genome):
                     "Source": "MGA",
                 },
                 phase=phase,
-                source="MGA"
+                source="MGA",
             )
 
             cds_feat = gffSeqFeature(
@@ -77,9 +77,9 @@ def mga_to_gff3(mga_output, genome):
                 qualifiers={
                     "Source": "MGA",
                     "ID": "%s.cds_%s" % (current_record.id, gene_id),
-                    },
+                },
                 phase=phase,
-                source="MGA"
+                source="MGA",
             )
 
             if rbs_feat is not None:
@@ -103,7 +103,7 @@ def mga_to_gff3(mga_output, genome):
                     "ID": "%s.%s" % (current_record.id, gene_id),
                 },
                 phase=phase,
-                source="MGA"
+                source="MGA",
             )
 
             gene.sub_features = [cds_feat]