-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathResCons.py
4637 lines (3778 loc) · 195 KB
/
ResCons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
'''
LICENSE:
Copyright (C) <2015> Manavalan Gajapathy, Joseph D. Ng
ResCons is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 or version 3 of the License.
ResCons is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with ResCons; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Author's Email id: [email protected]
'''
__author__ = "Manavalan Gajapathy, Joseph D. Ng"
__license__ = "Lesser General Public License"
__version__ = "28 beta"
__email__ = "[email protected]"
'''
Ver 28 (under development) major modifications:
1. File 'ClustalO_embl_api' is moved in to directory 'Rescon_Files' so as to contain all files to execute ResCons
in one location. Script modified to reflect the same for importing.
2. Added following checkpoints to user-provided Clustal omega command when used in 'web-server' mode:
a. Is parameter 'email' present?
b. Proper email id provided?
b. Is user providing parameters that needs to be set by ResCons?
3. Edited default command for Clustal Omega in webserver mode. Added '--iterations 3' to existing command.
4. Software title 'ResCon' changed to 'ResCons'
'''
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import AlignIO
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio import Phylo
from Tkinter import *
import tkMessageBox
from tkFileDialog import askopenfilename, askdirectory
from tkColorChooser import askcolor
import os, errno, logging, ast, shutil, re
import subprocess
import traceback # to raise exceptions in gui
import threading
from sys import platform as _platform # to determine OS under use
import csv
import operator
import StringIO
from ResCons_Files.clustalo_embl_api import clustalo_api # to run clustal omega through EMBL webserver
terminal_output = False # to show or not to show error/info in the terminal
# terminal_output = True
additional_imports = True # this enables to run script in case such features from thrid party libs are not desired.
if additional_imports:
natural_sorting = True # if native sorting is desired in output csv file from mismatch analyzer
if natural_sorting:
from natsort import natsorted
Image_in_GUI = False # if logos needed to be shown in GUI
if Image_in_GUI:
from PIL import Image, ImageTk # to add image to the GUI
matplot_reqd = False # to draw bar graph in output html using matplot; if set to false, javascript may be used to draw bar chart
if matplot_reqd:
import matplotlib.pyplot as plt
else:
natural_sorting = False
Image_in_GUI = False
matplot_reqd = False
# This is to enable actions specific to particular operating system
# win_os = False # for both Windows and Ubuntu OS
# linux_os = False # Keep it as 'False'. This is for threading purposes. But threading is currently causing trouble.
# mac_os = True # only for Mac OS
# ubuntu_os = False # only for Ubuntu OS
# win_os = True # for both Windows and Ubuntu OS
# linux_os = False # Keep it as 'False'. This is for threading purposes. But threading is currently causing trouble.
# mac_os = False # only for Mac OS
# ubuntu_os = False # only for Ubuntu OS
# This automatically detects the Operating system under use
if _platform == "linux" or _platform == "linux2": # for linux OS
ubuntu_os = True
mac_os = False
linux_os = False
win_os = False
elif _platform == "darwin": # for Mac
ubuntu_os = False
mac_os = True
linux_os = False
win_os = False
try: # prevents app from opening minimized (at least in OSx 10.9). Uses AppleEvents to give focus to Python app.
os.system('''/usr/bin/osascript -e 'tell app "Finder" to set frontmost of process "Python" to true' ''')
except Exception as e:
pass
elif _platform == "win32": # for Windows
ubuntu_os = False
mac_os = False
linux_os = False
win_os = True
# For logging. Logging module is utilized but sys.stderr is utilized for writing
# into output log file as it enables to record unexpected errors (stderr) from the program during its execution.
if win_os: # if __file__ not in quotes, it results in error after executable is installed in Windows
current_path = os.path.dirname( os.path.realpath("__file__") ) # gets filepath of this script file
else:
current_path = os.path.dirname( os.path.realpath(__file__) ) # gets filepath of this script file
if win_os and not ubuntu_os:
# In Windows, write privilages restricts writing log file in 'program files' folder
# This part writes log file to C:/temp folder. However if C: is not
# present, then this part will write into a drive that is present in that computer
if os.path.exists('C:'):
logger_filename = 'C://temp//Logs_ResCons.txt'
else:
drive_letter = map(chr, range(68, 91))
drive_letter = ['A', 'B'] + drive_letter # all alphabets except letter C
for letter in drive_letter:
drive = letter + ':'
if os.path.exists(drive):
logger_filename = letter + '://temp//Logs_ResCons.txt'
break
dir_path = os.path.dirname(logger_filename) # this part creates temp folder if it is not already present
try:
os.makedirs(dir_path)
except OSError as exception:
if exception.errno != errno.EEXIST: # if administrative privilages prevents from creating folder
tkMessageBox.showerror('Error', "Cannot create file '%s'. Check if you have privileges to create this file "
"in the path specified." %dir_path)
raise
else: # for mac and ubuntu, log file is created in the same folder as this script file
logger_filename = current_path + '/Logs_ResCons.txt'
if not terminal_output:
sys.stderr = open(logger_filename, 'w') # Logs and errors are directed to stderr which in turn gets logged.
# Following are dummy variables which will be removed when scripting is completed
# This is just to keep Pycharm's code inspection happy
# or otherwise code inspection shows error everywhere suggesting 'unresolved reference'
# html_log = logging.getLogger('value')
# genbank_log = logging.getLogger('value')
# runscript_log = logging.getLogger('value')
# blast_log = logging.getLogger('value')
# mismatch_log = logging.getLogger('value')
# clustal_log = logging.getLogger('value')
# gui_log = logging.getLogger('value')
# clades_log = logging.getLogger('value')
# descr_filter_log = logging.getLogger('value')
# idfilter_log = logging.getLogger('value')
# header_extractor_log = logging.getLogger('value')
# unexpected_error_log = logging.getLogger('value')
# user_error_log = logging.getLogger('value')
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)-19s %(levelname)-8s %(message)s',
datefmt= '%Y-%m-%d %H:%M:%S')
logging_dict = {'gui_log': 'GUI_Main', 'clustal_log': 'Clustal_Alignment', 'mismatch_log': 'Fetch_Mismatch',
'html_log': 'html_formatting', 'genbank_log': 'genbank2Fasta', 'clades_log': 'Extract_Clades',
'blast_log': 'blast_filter', 'runscript_log': 'Run_Script_Main', 'descr_filter_log': 'Filter_by_Descrip',
'idfilter_log': 'Filter_by_id' , 'header_extractor_log': 'Descrip/ID_Extractor',
'unexpected_error_log': 'Unexpected_error', 'user_error_log': 'User_error'}
for key, value in logging_dict.iteritems(): # defines multiple loggers using dictionary items
locals()[key] = logging.getLogger(value)
# This part gets settings from a text file (so as to allow user to modify certain settings)
settings_file_name = current_path + "/ResCons_Files/Settings_ResCons.txt"
with open(settings_file_name, 'Ur') as settings_data:
for line in settings_data:
line = line .replace('\n', '')
if line != '' and '##' not in line: # skips blank and description lines
for index, char in enumerate(line): # gets name of variable and also gets rest of the line
if char == ':':
var_name = line[0:index].replace(' ', '')
trimmed_1 = line[index+1 ::]
break
for index, char in enumerate(trimmed_1): # removes leading spaces and gets values provided
if char != ' ':
trimmed_2 = trimmed_1[index::]
break
for no in range(0, len(trimmed_2)): # removes spaces at end and gets values provided
if trimmed_2[-(no+1)] != '^':
if not no:
trimmed_3 = trimmed_2
else:
trimmed_3 = trimmed_2[:-no]
break
# settings for Mismatch Analyzer
if var_name == 'match_color': # gets color for matching residues
# match_color = line_list[1]
match_color = trimmed_3
elif var_name == 'similar_color': # gets color for similar residues
similar_color = trimmed_3
elif var_name == 'mismatch_color': # gets color for mismatching residues
mismatch_color = trimmed_3
elif var_name == 'color_gradient':
color_gradient = trimmed_3.replace(' ', '')
color_gradient = color_gradient.split(',')
elif var_name == 'conservation_method': # gets scoring method to calculate residue conservation
conserve_method = trimmed_3.lower()
elif var_name == 'ref_included_in_analysis': # gets if ref sequence need to be included in stats or not
if trimmed_3 == 'No':
ref_included = False
else:
ref_included = True
elif var_name == 'ClustalO_Command_local': # gets ClustalO command (for running locally)
line_list = line.split(' {')
clustalo_command_local_default = trimmed_3
elif var_name == 'ClustalO_Command_web': # gets clustalo command (for running locally)
clustalo_command_web_default = trimmed_3
elif var_name == 'id_delimiter':
id_delimiter = trimmed_3
elif var_name == 'aa_set': # gets amino acid grouping that user likes to use
if list(set(trimmed_1)) == [' ']:
aa_set = ['', '']
aa_set_str = ''
else:
trimmed_2 = trimmed_2.replace(' ', '')
line_list = trimmed_2.split(',')
aa_set_upper = [x.upper() for x in line_list] # makes upper and lower case as a set
aa_set_lower = [x.lower() for x in line_list]
aa_set = aa_set_upper + aa_set_lower
aa_set_str = ', '.join(aa_set_upper)
elif var_name == 'chart_method': # gets how to draw bar chart in html
chart_method = trimmed_3
# settings for GenPept to fasta converter tool
elif var_name == 'Fasta_ID_length': # gets fasta id length
fasta_id_length_default = trimmed_3
elif var_name == 'connector_id': # gets symbol that is used to connect different fields
connector_id = trimmed_2
elif var_name == 'symbols_to_be_replaced': # gets symbols that are to be replaced in fasta header id
symbols_to_be_replaced = trimmed_3.split()
symbols_to_be_replaced_str = ' '.join(symbols_to_be_replaced)
elif var_name == 'newick_sym_replace': # gets symbol to replace sensitive symbols
newick_sym_replace = trimmed_2
# settings for FASTA Description/ID Extractor
# elif var_name == 'symbol_replacing_comma': # gets symbol for use if user don't want comma in output
# symbol_replacing_comma = trimmed_2
elif var_name == 'regex_desc_extr': # regular expression to be used for description extractor
line_list = line.split(':')
regex_desc_extr = trimmed_2
elif var_name == 'regex_id_extr': # regular expression to be used for ID extractor
regex_id_extr = trimmed_2
# Settings for Filter fasta by Sequences' Description
elif var_name == 'regex_desc_fltr': # regular expression to be used for filtering by partial description
regex_desc_fltr = trimmed_2
# reads Similarity Matrix S from text file for Residue Conservation Scoring method by Liu et. al., 2008
simi_matrix_filename = "ResCons_Files/Similarity_Matrix_S.txt"
simi_matrix_handle = open(simi_matrix_filename, "Ur")
dict_similarity_matrix = {}
line_count = 1
for line in simi_matrix_handle:
line = line.replace("\n", "")
if line_count == 1:
aa_label = line.split()
else:
line_list = line.split()
dict_similarity_matrix[ line_list[0] ] = {}
for aa_no in range(0, len(aa_label)):
dict_similarity_matrix[ line_list[0] ][aa_label[aa_no]] = int( line_list[aa_no + 1] )
line_count += 1
simi_matrix_handle.close()
# chart_method = 'chart.js' # options: 'matplot', 'chart.js' or 'chartnew.js' (if commented, value is used from settings file)
# Function to verify existence of a path and create it if non-existent.
# Also raises warning when folder is already present as it may rewrite files in them.
def verify_filepath_exists(path):
try:
os.makedirs(path)
except OSError as exception:
runscript_log.error('Error that resulted when creating path "%s":\n %s' % (path, exception))
if exception.errno != errno.EEXIST: # if administrative privilages prevents from creating folder
temp = ("Cannot create file: \n '%s'. \n Check if you have privileges to create this file "
"in the path specified." %path)
runscript_log.error(temp)
popup_error(temp)
# raise
raise_enabler('stop')
# Mac & Ubuntu produce OSerror 17 when a folder already exists; for windows (8.1), it is 183
elif exception.errno == (17 or 183): # warns if folder pre-exists
temp = ('Folder "%s" already exists. All or some files in it may get overwritten by ResCons. '
'Do you wish to continue?' % path)
runscript_log.warning(temp)
rewrite_or_not = tkMessageBox.askyesno('Warning', temp, default='yes')
runscript_log.info("User has selected (True = Yes, False = No): %s" % rewrite_or_not)
if not rewrite_or_not:
runscript_log.info("ResCons will stop as user has requested.\n")
raise_enabler('User opted not to write in selected output folder!')
else:
runscript_log.info("ResCons will continue as user requested.")
# Warns user if they intend to write output files into 'Program Files' folder in Windows OS.
if not ubuntu_os and win_os and 'Program Files' in dir_path:
temp = "Windows OS may not allow you to write results into 'Program Files' folder unless you have " \
"administrator privileges. Click 'Yes' to continue and 'No' to cancel."
user_error_log.warning(temp)
ans = tkMessageBox.askyesno('Warning', temp, default = 'no')
if ans:
user_error_log.info("User clicked 'Yes'. ResCons continues further.")
else:
user_error_log.info("User clicked 'No'. ResCons stops here.")
raise_enabler('stop')
# Function to pop-up an error in gui
def popup_error(string):
global top
try:
tkMessageBox.showerror('Error', string, parent = top)
except Exception as e:
tkMessageBox.showerror('Error', string)
# Function that enables displaying errors in gui when unexpected errors come up during execution of script.
# Thanks to stack-overflow for this function's basic skeleton (@Jochen Ritzel)
def show_error(self, *args):
err = traceback.format_exception(*args)
err_string = '' # turning it a string makes it easier to read.
for x in err:
err_string += x
err_string += '\n'
if 'raise_enabler' not in err_string: # error will pop up only when unexpected error occurs
temp = "Unexpected error occured. Click 'Yes' for more details on this error."
unexpected_error_log.error(temp)
unexpected_error_log.error('Unexpected error that resulted: %s' % err_string)
try:
# 'parent' parameter will enable to have pop up window on top of respective windows
ans = tkMessageBox.askquestion('Error', temp, parent = top, default = 'no')
if ans == 'yes':
popup_error(err_string)
except Exception as e:
ans = tkMessageBox.askquestion('Error', temp, default = 'no')
if ans == 'yes':
popup_error(err_string)
else: # for error that can be corrected by user
user_error_log.error('Error that resulted: %s' % err_string)
raise_enabler('none')
# Re-enable convert/submit job buttons and hide 'processing' label when 'Raise' function needs to be called
# intentionally to avoid further execution of code when an error is noted.
# Also defines what kind of 'raise' function is called
def raise_enabler(string):
# to re-enable submit button and hide 'processing job label' of root window during unexpected errors
try:
global button_run_script
global processing, processing_clustal
button_run_script.configure(state=ACTIVE)
processing.grid_remove()
processing_clustal.grid_remove()
except Exception as e:
pass
# to re-enable submit buttons and hide 'processing job label' in child window during unexpected errors
try:
global Blast_Submit, genbank_Submit, Extract_Submit, descr_extractor_Submit, fasta_filter_by_name_Submit, fasta_filter_by_id_Submit
global blast_processing, genbank_processing, subtree_processing, descr_extract_processing, filter_name_processing, filter_id_processing
submit_buttons = [Blast_Submit, genbank_Submit, Extract_Submit, descr_extractor_Submit,
fasta_filter_by_name_Submit, fasta_filter_by_id_Submit]
processing_labels = [blast_processing, genbank_processing, subtree_processing, descr_extract_processing,
filter_name_processing, filter_id_processing]
for no in range(0, len(submit_buttons)):
submit_buttons[no].configure(state=ACTIVE)
processing_labels[no].grid_remove()
except Exception as e:
pass
if string == 'stop':
user_error_log.error("Resulted in an error that can be corrected by User. ResCons is ready for next job now!")
raise Exception("Error can be rectified by user (you)!. Controlled termination by ResCons!")
if string == 'OSError':
raise OSError
elif string == 'none': # in cases where submit buttons need to be activated but raise fn not to be called
pass
else:
raise Exception(string)
# Function to index each character (residue) in the Reference sequence
def indexing(string, res):
index_list = []
for index, char in enumerate(string):
if char == res:
index_list.append(index)
return index_list
# Works in mac and ubuntu but it is not currently implemented as catching exceptions from thread is not resolved yet.
# Function that runs tools in a new thread and shows a 'process under progress' label
# This function keeps gui stable while a tool is underway.
# this function works in mac and ubuntu but not in windows. In windows, when this function is run, tkmessagebox module
# inside the tool getting called poses trouble as tkinter have trouble being thread-safe.
def gen_thread(proc_label, tool_fn):
xyz = threading.Thread(target=tool_fn, args= ())
xyz.start()
proc_label.grid()
while xyz.isAlive():
try:
app.update()
except Exception as e:
top.update()
xyz.join()
proc_label.grid_remove()
# function to run a function in a new thread; intended to run clustal omega
def new_thread(tool_fn):
xyz = threading.Thread(target=tool_fn, args= ())
xyz.start()
while xyz.isAlive():
app.update()
xyz.join()
# function to check for reference seq in query seqs. Appends it if not present.
def is_ref_in_seqs_file():
global SeqQuery_file
global Reference
global Output_Path
global outfile_ref_added
global Aligned_Filename
global tree_filename
try:
all_seqs_input = list(SeqIO.parse(SeqQuery_file, "fasta"))
except Exception as e:
clustal_log.error('Error that resulted when reading Sequences file: %s' % e)
temp = ("Ran into problem reading file:\n '%s'" % SeqQuery_file)
clustal_log.error(temp)
popup_error(temp)
raise_enabler('stop')
if len(all_seqs_input) == 0:
temp = ('Sequences file seems to be empty. Make sure it is in FASTA format.'
'Try again after fixing this problem')
clustal_log.error(temp)
popup_error(temp)
raise_enabler('stop')
clustal_log.info("Number of sequences found in '%s' are '%i'" %(SeqQuery_file, len(all_seqs_input)))
# check for duplicate sequence identifiers in Sequences file
all_seqs_id = [item.id for item in all_seqs_input]
duplicate_ids_list = set([item for item in all_seqs_id if all_seqs_id.count(item) > 1])
if duplicate_ids_list:
duplicate_ids = '\t' + '\n\t'.join(duplicate_ids_list)
temp = "Duplicate Sequence IDs are present in Sequences file. \n Click 'Yes' if you would like ResCons to " \
"proceed as it is. \n Click 'No' to stop further execution." \
"\n\nFolowing are the duplicate sequence IDs: \n%s" %duplicate_ids
clustal_log.error(temp)
ans = tkMessageBox.askquestion('Error', temp, default = 'yes')
if ans == 'no':
gui_log.info("User clicked 'No'. ResCons will stop now.")
raise_enabler('stop')
else:
gui_log.info("User clicked 'Yes'. ResCons proceeds further.")
# Verifies if Reference sequence is present in query sequences and adds to it, if non-existent.
outfile_ref_added = None
if Reference.id not in all_seqs_id:
all_seqs_input.append(Reference)
OutputFile_temp = SeqQuery_FileName.replace('.fasta', '') + "_Reference_SeqAdded.fasta"
outfile_ref_added = Output_Path + OutputFile_temp
Output_handle = open(outfile_ref_added, 'w')
SeqIO.write(all_seqs_input, Output_handle, 'fasta')
Output_handle.close()
clustal_log.info("Reference seq is not present in file: '%s'. \n\tA new file '%s' is created "
"with Reference Seq appended to it." % (SeqQuery_file, outfile_ref_added))
SeqQuery_file = outfile_ref_added
Aligned_Filename = Output_Path + "Aligned_ClustalO_" + OutputFile_temp.replace('.fasta', '')
tree_filename = Output_Path + "Tree_" + OutputFile_temp.replace('.fasta', '') + ".newick"
else:
clustal_log.info('Reference seq is present in file: %s' % SeqQuery_file)
Aligned_Filename = Output_Path + "Aligned_ClustalO_" + SeqQuery_FileName.replace('.fasta', '')
tree_filename = Output_Path + "Tree_" + SeqQuery_FileName.replace('.fasta', '') + ".newick"
# verify sequence in Reference file matches to that in Sequences file
refseq_in_input = all_seqs_input[ all_seqs_id.index(Reference.id) ].seq
if str(Reference.seq) == str(refseq_in_input):
clustal_log.info("Reference sequence matches to corresponding sequence in Sequences file.")
else:
temp = "Reference sequence does not match to corresponding sequence in 'Sequences file'. " \
"Fix it and try again!"
clustal_log.error(temp)
popup_error(temp)
raise_enabler('stop')
# function to perform clustal alignment using EMBL-EBI webserver
def clustalo_webserver_fn():
global SeqQuery_file
global Output_Path
global Aligned_Filename
# verify if certain parameters are not provided by user as they will be set by ResCons
para_not_allowed = ['--outfmt', '--outformat', '--sequence', '--outfile']
para_problem = False
for para in para_not_allowed:
if para in clustal_web_user_command:
para_problem = True
break
temp = ''
# verify if certain parameters are not provided by user as they will be set by ResCons
if para_problem:
para_string = '\t' + '\n\t'.join(para_not_allowed)
temp = "Following parameters are not allowed in Clutal omega command:\n\n%s\n\n Fix it and try again!" % para_string
# verify email id address is present as part of clustal command for EMBL web server
elif '-email' not in clustal_web_user_command:
temp = "Parameter '--email' is required to utilize Clustal omega through EMBL webserver. Try again!"
# Raises error if email id provided is '[email protected]'
elif '[email protected]' in clustal_web_user_command:
temp = "Provide a valid email address instead of '[email protected]' in ClustalO command.\n\n" \
"Tip: To permanantly store your valid email id, change it in ResCons's settings file. It is available " \
"at bottom of the window of 'File menu --> Edit settings'"
if temp:
clustal_log.info(temp)
popup_error(temp)
raise_enabler('stop')
Output_Path_web = Output_Path + 'webserver_results/'
verify_filepath_exists(Output_Path_web)
Aligned_Filename = Output_Path_web + 'output.aln-fasta.fasta' # reads output aligned file. Had to hard code. Sorry!
# this removes clustal aligned file if it is pre-existing in the output folder. Done as a work around to check success of clustal omega alignment
if os.path.exists(Aligned_Filename):
os.remove(Aligned_Filename)
clustal_log.debug("Deleted pre-existing file: '%s'" % Aligned_Filename)
# All results from webserver will be stored with 'output' as filename with different extensions
# fasta is the only alignment output format allowed when in webserver mode.
# To avoid unnecessary clutter, only formats mentioned in 'outformat' are written as output
parameters_string = '%s --sequence "%s" --outfile "%s"output --outfmt fa --outformat "aln-fasta, phylotree, pim"' \
% (clustal_web_user_command, SeqQuery_file, Output_Path_web)
clustal_log.info('Parameters sent to Clustal omega server:\t"%s"' %parameters_string)
temp = 'Clustal alignment via webserver will begin now. Warning: It may take few seconds ' \
'to several minutes depending on sequences provided.'
clustal_log.info(temp)
tkMessageBox.showinfo('Info', temp)
clustal_log.info('Following info are obtained from Clustal omega webserver: ')
# calls clustal omega webserver api
processing_clustal.grid()
try:
# clustalo_api(parameters_string)
new_thread(lambda: clustalo_api(parameters_string)) # this doesn't catch exception though.
except Exception as e: # This will catch all the major errors (when threading is not used)
temp = 'It seems Clustal Omega webserver resulted in error.\n' \
' 1. Are you connected to internet?\n' \
' 2. Did you provide valid email id address in the command line?\n' \
' 3. Does no. of sequences in input exceed 2000? Webserver allows only 2000 or less sequences.\n' \
' 4. Check the command provided for errors\n\n' \
'Here is the error returned by Clustal Omega webserver:\n %s' %e
clustal_log.error(temp)
popup_error(temp)
button_run_script.configure(state = ACTIVE) # Re-enables submit job button while processing
raise_enabler('Clustal Omega resulted in error! So ultimately ResCons had to terminate the job!')
processing_clustal.grid_remove()
# This is if clustal omega webserve results in an error and previous test missed the error somehow.
# Done by veriying if output aligned file is created or not
if not os.path.exists(Aligned_Filename):
processing_clustal.grid_remove()
temp = 'It seems Alignment using Clustal Omega webserver resulted in error.\n' \
' 1. Are you connected to internet?\n' \
' 2. Did you provide valid email id address in the command line?\n' \
' 3. Does no. of sequences in input exceed 2000? Webserver allows only 2000 or less sequences.\n' \
' 4. Check the command provided for errors\n\n' \
'See log file for specific error resulted (File -> open log file)'
clustal_log.error(temp)
popup_error(temp)
button_run_script.configure(state = ACTIVE) # Re-enables submit job button while processing
raise_enabler('Clustal Omega resulted in error! So ultimately ResCons had to terminate the job!')
# Function to perform clustal alignment, if needed.
# Can accept commands from user
# (note: user has to supply such commands in dictionary format).
def clustal_alignment_local():
clustal_log.debug('Clustal Alignment (local) module initiated.')
global Aligned_Filename
global SeqQuery_FileName
global clustal_local_user_command_string
global tree_filename
# function to verify if a executable program is available in the system through shell.
# This function was shamelessly copied from Suraj Barkale's answer in stack exchange
# source: http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python
def which(program):
def is_exe(fpath):
return os.path.exists(fpath) and os.access(fpath, os.X_OK)
def ext_candidates(fpath):
yield fpath
for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
yield fpath + ext
fpath, fname = os.path.split(program)
if fpath:
if is_exe(program):
return program
else:
for path in os.environ["PATH"].split(os.pathsep):
exe_file = os.path.join(path, program)
for candidate in ext_candidates(exe_file):
if is_exe(candidate):
return candidate
return None
is_clustalo_present = which('clustalo') #tests if clustal omega is available in the system
if is_clustalo_present is None:
temp = 'It appears Clustal Omega is not available in your system. Click Cancel if you believe this is not true.'
clustal_log.error(temp)
response = tkMessageBox.askokcancel('Clustal Omega not installed', temp, default = 'ok')
clustal_log.info("User's response (True = OK, False = Cancel) : %s" %response)
if response:
clustal_log.info("User chose not to proceed further.")
raise_enabler('stop')
else:
clustal_log.info("User chose to proceed further despite warning on 'unavailability of Clustal Omega")
clustal_command_temp_1 = {'infile': SeqQuery_file} # Infile and outfile command is provided through script; not dependent on user
# obtain and verify syntax of user-provided Clustal-O command
try:
clustal_command_temp_2 = ast.literal_eval(clustal_local_user_command_string) # convert string to a dictionary
except Exception as e:
temp = 'Syntax error in Clustal-O command entered. Correct it and Try again!'
clustal_log.error(temp)
clustal_log.error('Error that resulted: %s' % e)
popup_error(temp)
raise_enabler('stop')
clustal_commandline = {} # Merge two dictionaries into one to get complete clustal command
clustal_commandline.update(clustal_command_temp_1)
clustal_commandline.update(clustal_command_temp_2)
clustal_log.info("Clustal commandline provided by user: \n\t%s" %clustal_commandline)
if 'infile' in clustal_command_temp_2:
temp = "'infile' parameter is not allowed to be part of Clustal-O command. Remove it and try again!"
clustal_log.error(temp)
popup_error(temp)
raise_enabler('stop')
# paramter 'outfile' is not allowed anymore as ResCons now allows to choose output folder. It is redundant
if "outfile" in clustal_command_temp_2:
temp = "Parameter 'outfile' is not allowed in ClustalO command. Output file instead will be stored in output folder chosen. Press yes to proceed further"
ans = tkMessageBox.askquestion('Error', temp, default = 'yes')
if ans == 'no':
clustal_log.info("User clicked 'No'. ResCons will stop now.")
raise_enabler('stop')
else:
clustal_log.info("User clicked 'Yes'. ResCons proceeds further.")
# reads user provided clustal omega output format
outfmt_allowed = ["a2m", "fa", "fasta", "clu", "clustal", "phy", "phylip", "vie", "vienna"]
if "outfmt" in clustal_commandline:
if clustal_commandline['outfmt'] in outfmt_allowed:
file_extension = clustal_commandline['outfmt']
else:
temp = ("'outfmt' parameter '%s' provided in Clustal Omega commandline is not allowed here. " \
"Choose among the following parameters:\n a2m, fa, fasta, clu, clustal, phy, phylip, vie or vienna" \
% clustal_commandline['outfmt'] )
clustal_log.error(temp)
popup_error(temp)
raise_enabler('stop')
else:
file_extension = 'fasta' # unless specified, clustal output will be in fasta format
# the following is decided to be unnecessary as ResCons now allows to choose output folder
# # if user requests to change default filename or filepath of aligned file, following will take care of it.
# if "outfile" in clustal_commandline:
# if clustal_commandline['outfile'] != 'default':
# Aligned_Filename = clustal_commandline['outfile']
# temp_path = os.path.dirname(Aligned_Filename)
# verify_filepath_exists(temp_path)
# else:
# Aligned_Filename += ('.' + file_extension)
# clustal_commandline['outfile'] = Aligned_Filename
# else:
# temp = "Parameter 'outfile' is missing in ClustalO command provided. It is required to execute ClustalO."
# clustal_log.error(temp)
# popup_error(temp)
# raise_enabler('stop')
# parameter 'outfile' is passed as part of clustal command
Aligned_Filename += ('.' + file_extension)
clustal_commandline['outfile'] = Aligned_Filename
# this removes clustal aligned file if it is pre-existing in the output folder. Done as a work around to check success of clustal omega alignment
if os.path.exists(Aligned_Filename):
os.remove(Aligned_Filename)
clustal_log.debug("Deleted pre-existing file: '%s'" % Aligned_Filename)
# if user requests to change default filename or filepath of phylogenetic tree file, following will take care of it.
if "guidetree_out" in clustal_commandline:
if clustal_commandline['guidetree_out'] != 'default.newick':
tree_filename = clustal_commandline['guidetree_out']
temp_path = os.path.dirname(tree_filename)
verify_filepath_exists(temp_path)
else:
clustal_commandline['guidetree_out'] = tree_filename
# Run ClustalO with user provided (partially) commandline
# clustalomega_cline = ClustalOmegaCommandline(infile=SeqQuery_file, outfile=Aligned_Filename, outfmt='clu',
# guidetree_out=tree_filename, auto=True, verbose=True, force=True)
try:
clustalomega_cline = ClustalOmegaCommandline(**clustal_commandline)
except Exception as e: # This will catch all the major errors
temp = 'Error in command directed to ClustalO. Check the command entered!'
clustal_log.error(temp)
clustal_log.error('Error that resulted: %s' % e)
popup_error(temp)
raise_enabler('stop')
clustal_log.info("Equivalent command prompt command that will be used to"
" execute clustal omega alignment: \n\t'%s'" %str(clustalomega_cline))
temp = 'Clustal alignment will begin now. Warning: It may take few seconds ' \
'to several minutes depending on sequences provided and your processor.'
clustal_log.info(temp)
tkMessageBox.showinfo('Info', temp)
# execute clustal omega in a new thread. As catching exceptions when using threading is problematic, ResCons
# looks for absence of clustal aligned output file as a workaround to verify if clustal omega ran successfully.
# For the above reason, ResCons deletes if a file with same name as clustal output file was already
# present in user's output folder.
processing_clustal.grid()
if linux_os:
clustalomega_cline()
else:
new_thread(clustalomega_cline)
processing_clustal.grid_remove()
# This is if clustal omega results in an error. Usage of threading makes it hard to catch exceptions(errors)
if not os.path.exists(Aligned_Filename):
processing_clustal.grid_remove()
temp = 'Error executing ClustalO command. Possible problems include: \n ' \
'1. Clustal omega is not installed in your computer or \n ' \
'2. Error in Clustal omega installation or \n ' \
'3. If already installed, correct path may not have been properly specified or \n ' \
'4. If this is Windows OS, Clustal omega may have trouble if Sequences file has lot of sequences.' \
" (This error is caused by Clustal Omega application; not ResCons) \n " \
'5. Error in your Sequences file or in its filename. \n ' \
'Note: This list is not inclusive of all problems though.'
clustal_log.error(temp)
popup_error(temp)
button_run_script.configure(state = ACTIVE) # Re-enables submit job button while processing
raise_enabler('Clustal Omega resulted in error! So ultimately ResCons had to terminate the job!')
# Function that reads clustal alignment file and extracts details of mismatches, if any, at the sites requested
# and writes detailed info into csv and txt files.
def fetch_mismatch():
global unique_residues_line
mismatch_log.debug("Module 'Parsing for Aligned residues' initiated.")
global resi_positions
global Aligned_Filename
global Output_Path
global query_in_Alignment
global query_site_residues
global query_site_actual
global identity_perc_list
# global similarity_perc_list
global conserve_score_list
global protein_mode
global dict_identifier_status
global match_seqs_total
global mismatched_seqs_total
global method_name
mismatches_text_output = False # Default: False. If True, writes a text output file along with csv output file.
mismatch_log.debug("Alignment file '%s' will be read." % Aligned_Filename)
# data_alignment = AlignIO.read(Aligned_Filename, "clustal")
# clustal_aligned = False
format_supported = False
try:
# this block identifies if alignment is in any of the supported formats
format_type_list = ['clustal','emboss','fasta','fasta-m10','ig','maf','nexus','phylip','phylip-sequential','phylip-relaxed','stockholm']
for format_type in format_type_list:
try:
data_alignment = AlignIO.read(Aligned_Filename, format_type)
# if format_type == "clustal":
# clustal_aligned = True
mismatch_log.info('Alignment format is detected as "%s" format' % format_type)
format_supported = True
break
except ValueError:
pass
except Exception as e:
mismatch_log.error("Error that resulted: %s" % e)
temp = ("Could not open file: \n\t'%s'" % Aligned_Filename)
mismatch_log.error(temp)
popup_error(temp)
raise_enabler('stop')
if not format_supported:
temp = "Your Alignment file does not have alignment format supported by ResCons"
mismatch_log.error(temp)
popup_error(temp)
raise_enabler('stop')
mismatch_log.debug("Alignment file '%s' was read successfully." % Aligned_Filename)
# Checks if Reference sequence ID is present in alignment file.
Reference_index = None
for serial_no in range(len(data_alignment)):
if Reference.id == data_alignment[serial_no].id:
Reference_index = serial_no
# Checks if Reference seq is present in MSA and if present, checks if they match or not
if Reference_index is None:
temp = 'Error: Alignment file provided does not have Sequence ID corresponding to your reference sequence. ' \
'Fix it and try again!'
mismatch_log.error(temp)
popup_error(temp)
raise_enabler('stop')
else:
refseq_in_alignment = data_alignment[Reference_index].seq
refseq_in_alignment = str(refseq_in_alignment).replace('-', '')
if str(Reference.seq) == refseq_in_alignment:
clustal_log.info("Reference sequence matches to corresponding sequence in Alignment file provided.")
else:
temp = "Reference sequence does not match to corresponding sequence in Alignment file provided. " \
"Fix it and try again!"
mismatch_log.error(temp)
popup_error(temp)
raise_enabler('stop')
mismatch_log.info("Reference sequence's ID is present in alignment file and their sequences match.")
# Indexes residues (characters) in reference sequence and also for reference sequence in alignment.
mismatch_log.debug('Begins indexing residues in reference seq and also of reference sequence in alignment')
reference_seq_aligned = str(data_alignment[Reference_index].seq)
# following line creates list of upper case alphabets.
# Includes non-amino acid chars as well to support nucleotides other than A, T, G and C
# works for both upper and lower case characters and considers non-amino acid and non-nucleotide residues as mismatch
aa_list = map(chr, range(65, 91)) + map(chr, range(97,123))
Dict_aa_Index_Reference_Seq = {k: [] for k in aa_list}
Dict_aa_Index_Reference_Seq_inAlignment = {k: [] for k in aa_list}
for aa in aa_list:
Dict_aa_Index_Reference_Seq_inAlignment[aa] = indexing(reference_seq_aligned, aa)
Dict_aa_Index_Reference_Seq[aa] = indexing(Reference.seq, aa)
mismatch_log.debug('Done indexing residues in Reference seq and also of Reference sequence in alignment')
# This section is to get appropriate number of 'name components' in the title row.
# Also this accounts even if the clustal record ids have excessive number of pipes without any info in them.
aligned_ids_list = []
pipes_total_list = []
justified_list = []
for record in data_alignment:
aligned_ids_list.append(record.id)
pipes = (record.id).split(id_delimiter)
if len(pipes) == 1:
justified_list.append(len(pipes[0]))
pipes_temp = pipes
for no in range(1, len(pipes)+1):
if pipes[-no] == '':
pipes_temp = pipes[:(-no)]
else:
break
pipes_total_list.append(len(pipes_temp))
pipes_most = max(pipes_total_list)
mismatch_log.debug('Done determining number of title components in the sequence record IDs.')
# If sequence IDs are made only of few components, this will enable reasonable alignment in the text file output
if pipes_most == 1:
justified = max(justified_list)
elif pipes_most == 2:
justified = 45
elif pipes_most ==3:
justified = 33
else:
justified = 23
# Writes the title line for csv file and txt file
Title_components_csv = 'S.No '
Title_components_txt = '\t\t' + 'S.No'.ljust(7) + '\t'
for no in range(0, pipes_most):
Title_components_csv += (',' + 'Title_' + str(no+1))
Title_components_txt += (('Title_' + str(no + 1)).ljust(justified) + '\t')
Title_components_csv += ',Sequence Length'
Title_components_csv += ',No. of Mismatches'
Title_components_txt += ('Mismatch'.ljust(10)+ '\t' + 'Expected'.ljust(10) + '\n')
mismatch_log.debug('Determined the title to be used in csv and tect files.')
if checkboxval_Clustal.get(): # To account for input file used depending on clustal alignment was selected or not
input_file = SeqQuery_file
else:
input_file = Aligned_Filename
# Extracts residues present at query sites in the reference sequence and also reference sequence in alignment
mismatch_log.debug('Begins the part to extract residues at the requested query sites.')
query_site_actual = resi_positions # query_site_actual refers to actual positioning whereas query_site to python indexes.
query_site_actual = sorted(query_site_actual)
query_site = []
query_site_residues = []
for n in range(0, len(query_site_actual)):
query_site.append(int(query_site_actual[n]) - 1)
query_site_residues.append(Reference.seq[query_site[n]])
mismatch_log.info('Query sites requested: %s' % (str(query_site_actual)))
mismatch_log.info('Residues at those query sites: %s' % str(query_site_residues))
# Finds the position of query site residues in reference's alignment sequence
query_in_Alignment = []
for aa in aa_list:
for query_site_element in query_site: