bioinformatics_kegg_spider.py
#!/usr/bin/env python
import json
# import os
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
import pandas as pd
import time
def get_page(url):
    """Fetch a page and return its HTML text, or None on failure."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/63.0.3239.132 Safari/537.36'}
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            return r.text
        return None
    except ConnectionError:
        print('connection error occurred')
        return None
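# A minimal sketch, not part of the original script: get_page() gives up after a
# single failed request, so one transient network error loses that KO entry. A
# hedged alternative is a retry wrapper; the name get_page_with_retry and the
# retries/backoff parameters are illustrative choices, not requests or KEGG APIs.
def get_page_with_retry(url, retries=3, backoff=2.0):
    """Call get_page() up to `retries` times, sleeping between attempts."""
    for attempt in range(retries):
        html = get_page(url)
        if html is not None:
            return html
        time.sleep(backoff * (attempt + 1))  # linearly increasing pause
    return None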
# The KEGG entry page is regular HTML, so BeautifulSoup is used to parse it
# and a small regex pulls the text out from between the tags.
def parse_page_detail(html):
    soup = BeautifulSoup(html, 'lxml')
    pattern = re.compile('>.*?<')
    # Field names sit in <th class="th40"> / <th class="th41"> cells ...
    th40 = soup.find_all('th', attrs={'class': 'th40'})
    th41 = soup.find_all('th', attrs={'class': 'th41'})
    th = th40 + th41
    Key_th = []
    for TH in th:
        Key_th.append(TH.nobr.string)
    # ... and the matching values in <td class="td40"> / <td class="td41"> cells.
    td40 = soup.find_all('td', attrs={'class': 'td40'})
    td41 = soup.find_all('td', attrs={'class': 'td41'})
    td = td40 + td41
    # Definition
    Definition = {}
    if 'Definition' in Key_th:
        Def = td[Key_th.index('Definition')]
        Def_filter = re.findall(pattern, str(Def))[2]
        Definition['Definition'] = Def_filter
    else:
        Definition['Definition'] = 'No such KO'
    # Brite
    Brite = {}
    if 'Brite' in Key_th:
        Bri = td[Key_th.index('Brite')]
        Brite['Brite'] = Bri.nobr
    else:
        Brite['Brite'] = ''
    return Definition, Brite
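# A hedged alternative to scraping the dbget HTML, which depends on class names
# like th40/td40 that KEGG may change: the KEGG REST service returns each entry
# as a plain-text flat file whose DEFINITION line can be read directly. This is
# a sketch assuming the documented behavior of https://rest.kegg.jp/get/, and
# it is not the method the original script uses.
def get_definition_via_rest(ko):
    r = requests.get('https://rest.kegg.jp/get/ko:' + ko)
    if r.status_code != 200:
        return None
    for line in r.text.splitlines():
        if line.startswith('DEFINITION'):
            return line[len('DEFINITION'):].strip()
    return None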
def extract_ko_all(ko, Definition, Brite):
    """Append one {KO: 'Definition:... Brite:...'} record as a JSON line."""
    ko_des = {}
    inf = 'Definition:' + str(Definition['Definition']) + ' Brite:' + str(Brite['Brite'])
    ko_des[ko] = inf
    # Raw string so the backslashes in the Windows path are not treated
    # as escape sequences.
    with open(r'E:\CR_genome\ko_alldes.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(ko_des, ensure_ascii=False) + '\n')
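# The file written above is JSON Lines: one JSON object per line. A minimal
# sketch of reading it back into a single dict; the default path matches the
# one hard-coded above, everything else is illustrative.
def load_ko_descriptions(path=r'E:\CR_genome\ko_alldes.txt'):
    ko_map = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            ko_map.update(json.loads(line))  # each line holds one {KO: description}
    return ko_map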
def main():
    # Input: a tab-separated table with a "KO" column of KEGG Orthology IDs.
    df_gene_ko = pd.read_csv(r'E:\CR_genome\only_cre_ko.txt', sep='\t')
    KO = df_gene_ko['KO']
    KO_Def = []
    for ko in KO:
        url = 'https://www.kegg.jp/dbget-bin/www_bget?ko:' + ko
        html = get_page(url)
        if html is None:
            # Skip entries whose page could not be fetched, keeping the
            # Definition list aligned with the input rows.
            KO_Def.append('')
            continue
        Definition, Brite = parse_page_detail(html)
        extract_ko_all(ko, Definition, Brite)
        KO_Def.append(Definition['Definition'])
        time.sleep(2)  # be polite to the KEGG server between requests
    df_gene_ko['Definition'] = KO_Def
    df_gene_ko.to_csv(r'E:\CR_genome\cre_ko_name.txt', sep='\t', index=False)
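# For reference, only_cre_ko.txt is expected to be a tab-separated table with
# a "KO" column, e.g. (the "gene" column name and values are illustrative):
#
#   gene          KO
#   Cre01.g001    K00001
#   Cre01.g002    K00002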
if __name__ == '__main__':
    main()
# The hard-coded paths could also be replaced with command-line parameters via
# the argparse module, which would make the script easier to package, e.g.:
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument('echo', help='echo the string')
# args = parser.parse_args()
# print(args.echo)
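# A minimal, hedged sketch of that suggestion (not part of the original
# script): expose the input and output paths as arguments instead of the
# hard-coded E:\CR_genome\ paths. The flag names --input/--output are
# illustrative choices.
def parse_args():
    import argparse
    parser = argparse.ArgumentParser(description='Fetch KEGG KO definitions.')
    parser.add_argument('--input', required=True,
                        help='tab-separated file with a KO column')
    parser.add_argument('--output', required=True,
                        help='path for the annotated output table')
    return parser.parse_args()
# Usage: python bioinformatics_kegg_spider.py --input only_cre_ko.txt --output cre_ko_name.txt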