-
Notifications
You must be signed in to change notification settings - Fork 0
/
ssScrape.py
105 lines (89 loc) · 3.65 KB
/
ssScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Web scraper to query ScanSite with an amino acid sequence and get all motif predictions
# and their confidence scores so they can be imported into Peptide Depot.
# Python 2.* compatible
import mechanize
import bs4
import re
import sys
import string
# Command-line interface: ssScrape.py <protein_name> <sequence> <output_file>
# Exit with a usage message rather than an IndexError traceback when an
# argument is missing. Any extra arguments are ignored.
if len(sys.argv) < 4:
    sys.exit('usage: ssScrape.py <protein_name> <sequence> <output_file>')
prot_name = sys.argv[1]    # value entered in the form's 'protein_id' field
seq = sys.argv[2]          # amino acid sequence entered in the 'sequence' field
output_file = sys.argv[3]  # path of the tab-separated results file
# test params:
#prot_name = 'CD3zeta'
#seq = 'KNPQEGLYNELQKDK'
#seq = 'mkwkalftaailqaqlpiteaqsfglldpklcylldgilfiygviltalflrvkfsrsadapayqqgqnqlynelnlgrreeydvldkrrgrdpemggkpqrrknpqeglynelqkdkmaeayseigmkgerrrgkghdglyqglstatkdtydalhmqalppr'
# Pulls site, score, percentile, z-score and matched sequence out of the query
# string of a motif-hit link. Compiled once because it is tried on every link.
HIT_URL_PATTERN = re.compile(
    r'.*site=([a-zA-Z][0-9]*)&.*score=([0-9]*\.[0-9]*)&.*percentile=([0-9.]*)'
    r'&.*zscore=(\-{0,1}[0-9.]*)&.*sequence=([A-Z]*)')


def _strip_bold(tag):
    """Return the markup of *tag* as a string with <b> and </b> removed."""
    return str(tag).replace('<b>', '').replace('</b>', '')


def _fetch_results_page(protein_id, sequence, stringency):
    """Submit one ScanSite motif-scan query and return the response body."""
    br = mechanize.Browser()
    br.open('http://scansite.mit.edu/motifscan_seq.phtml')
    br.form = list(br.forms())[0]
    #1. set protein ID
    br.form.find_control('protein_id').value = protein_id
    #2. set peptide sequence
    br.form.find_control('sequence').value = sequence
    #3. set search to look for ALL motifs
    br.form.find_control('motif_option').value = ['all']
    #4. set the stringency for this attempt
    br.form.find_control('stringency').value = [stringency]
    #5. submit form and get response
    return br.submit().read()


def _parse_hits(html, query_seq):
    """Extract motif hits from a ScanSite results page.

    Returns a list with one entry per hit, each of the form
    [motif family, motif, interactor, site, score, percentile, zscore,
    matched sequence, query_seq].

    Raises ValueError if a hit link is found before the header cells that
    name its motif family, motif and interactor.
    """
    #6. pass HTML response to BeautifulSoup for parsing
    soup = bs4.BeautifulSoup(html)
    #7. iterate over all rows & parse. header cells (identified by their
    #background color) set the motif context for the hit rows that follow.
    found = []
    # Reset per page so values from an earlier, failed attempt cannot leak in.
    motif_fam = motif = interactor = None
    for row in soup.find_all('tr'):
        for child in row.children:
            # Text nodes between cells have no attributes or descendants.
            if not isinstance(child, bs4.Tag):
                continue
            bgcolor = child.get('bgcolor')
            # NOTE(review): this code was reconstructed from a copy that had
            # lost its indentation, so it is unclear whether the <u>/<a>
            # checks originally applied to cells without a bgcolor or to
            # cells with some other bgcolor. Both are accepted here --
            # confirm against a live results page.
            if bgcolor == 'red':
                #cells with red background define motif type
                motif_fam = _strip_bold(child.font.b)
            elif bgcolor == 'blue':
                #first column of a blue row identifies the motif
                motif = _strip_bold(child.font.b)
            elif bgcolor == '6BBBFF':
                #second column is light blue and identifies the interacting protein
                interactor = _strip_bold(child.a.b)
            elif child.select('u'):
                #the u tag marks cells that just hold descriptive headers
                continue
            elif child.select('a'):
                #the a tag carries a URL that can be parsed to get the motif match scores
                url = child.a.get('href')
                if url is None:
                    continue
                matched = HIT_URL_PATTERN.match(url)
                #if there is no match, this link is not a motif hit
                if matched is None:
                    continue
                if motif_fam is None or motif is None or interactor is None:
                    raise ValueError('motif hit found before its header cells')
                site, score, percentile, zscore, sequence = matched.groups()
                found.append([motif_fam, motif, interactor, str(site), score,
                              percentile, zscore, str(sequence), query_seq])
    return found


# Try each stringency in turn (Low, then Medium, then High) and keep the
# results of the first attempt that is fetched and parsed without error.
hits = None
for stringency in ['Low', 'Medium', 'High']:
    try:
        hits = _parse_hits(_fetch_results_page(prot_name, seq, stringency), seq)
    except Exception as err:
        #If an error is encountered, report it and repeat the request with
        #the next stringency instead of silently discarding the failure.
        sys.stderr.write('ScanSite query at %s stringency failed: %r\n'
                         % (stringency, err))
        continue
    break
if hits is None:
    # Every attempt failed: stop with a clear message rather than going on
    # with missing or partial results.
    sys.exit('ScanSite query failed at every stringency; no results written')
#8. write the results to the file specified in the arguments: one hit per
#line, fields separated by tabs. The with-block closes the file even if a
#write fails, and sys.stdout is left untouched.
with open(output_file, 'w') as out:
    for hit in hits:
        out.write('\t'.join(str(field) for field in hit) + '\n')