import urllib.request
import urllib.parse
import re
import json
import traceback

from lxml import etree
import pymongo


class Medical_spider:
    def __init__(self):
        # MongoClient connects to the local MongoDB instance by default;
        # a connection URL and database name can also be passed explicitly.
        self.conn = pymongo.MongoClient()
        # conn.list_database_names() can be used to check whether a database already exists
        self.db = self.conn['medical']  # the 'medical' database
        self.col = self.db['data']      # the 'data' collection
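        # A minimal connectivity check (a sketch, assuming a local mongod on the
        # default port); server_info() raises if the server cannot be reached:
        # try:
        #     self.conn.server_info()
        # except pymongo.errors.ServerSelectionTimeoutError:
        #     raise RuntimeError('MongoDB is not reachable')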
    def get_html(self, url):
        """Request a url and return the page as a UTF-8 decoded string."""
        # request headers that disguise the spider as a regular browser
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/51.0.2704.63 Safari/537.36'}
        req = urllib.request.Request(url=url, headers=headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        return html
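    # Note: the urlopen call in get_html blocks on the default socket timeout;
    # a bounded variant (a sketch, with an arbitrary 10-second limit) would be:
    # res = urllib.request.urlopen(req, timeout=10)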
    def get_str(self, ps):
        """Collect the text of all given elements into one whitespace-cleaned string."""
        str1 = ''
        for i in ps:
            try:
                str1 += i.text
            except TypeError:  # i.text is None for elements without direct text
                pass
        str2 = (str1.replace('\r', '').replace('\n', '').replace('\xa0', '')
                    .replace(' ', '').replace('\t', '').replace('\u3000', ' '))
        return str2
    def get_jibin_list(self):
        """Collect the sub-page links of all diseases."""
        url = 'http://jbk.39.net/'
        html = self.get_html(url)
        selector = etree.HTML(html)
        jbk_list = []  # the sub-page link of every disease
        for i in selector.xpath('//p[@class="menu_item_box_txt"]/a/@href'):
            # drop the links for departments, examinations ('jiancha') and surgeries ('shoushu')
            if 'http' in i and 'jiancha' not in i and 'shoushu' not in i:
                jbk_list.append(i)
        return jbk_list
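    # Each returned link is expected to end with a trailing slash, so that
    # data_spider below can append sub-paths such as 'jbzs/' or 'zztz/' to it.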
    def basicinfo_spider(self, url):
        """Parse the basic-information ('jbzs') page of a disease."""
        # url = 'http://jbk.39.net/pnz1/jbzs/'
        html = self.get_html(url)
        # for the parts of the HTML that xpath cannot handle, strip the spaces
        # and fall back to regular expressions on this copy
        html1 = html.replace(' ', '').replace('/n', '')
        selector = etree.HTML(html)
        # title, e.g. 脾囊肿 (splenic cyst)
        pattern_biaoti = re.compile(r'<h1>(.*)</h1>')
        title = pattern_biaoti.findall(html1)[0]
        # department (就诊科室)
        # category = selector.xpath('//div[3]/ul/li[1]/span[2]/a/text()')
        # short description
        desc = selector.xpath('//p[@class = "introduction"]')[0].text
        # medical insurance coverage (医保), e.g. 是 (yes)
        pattern_yibao = re.compile(r'是否属于医保:</span>\r\n<spanclass="disease_basic_txt">([\u4e00-\u9fa5]+)</span>')
        medical_insurance = '是否属于医保:%s' % (pattern_yibao.findall(html1)[0])
        # contagiousness (传染性)
        pattern_chuanran = re.compile(r'传染性:</span>\r\n<span>([\u4e00-\u9fa5]+)</span>')
        contagious = '传染性:%s' % (pattern_chuanran.findall(html1)[0])
        # affected body part (发病部位)
        pattern_buwei = re.compile(r'发病部位:</span>\r\n<span>\r\n<ahref=".*">([\u4e00-\u9fa5]+)</a>')
        incidence_site = "发病部位: %s" % (pattern_buwei.findall(html1)[0])
        # susceptible population (多发人群)
        pattern_renqun = re.compile(r'多发人群:</span>\r\n<span>(.*)</span>')
        people = "多发人群 : %s" % (pattern_renqun.findall(html1)[0])
        # treatment period (治疗周期)
        try:
            pattern_zhouqi = re.compile(r'治疗周期:</span>\r\n<spanclass="disease_basic_txt">(.*)</span>')
            time = "治疗周期 : %s" % (pattern_zhouqi.findall(html1)[0])
        except IndexError:  # the field is missing on some pages
            time = "治疗周期 : %s" % ('无')
        # cure rate (治愈率)
        try:
            pattern_zhiyulv = re.compile(r'治愈率:</span>\r\n<span>(.*)</span>')
            cure_rate = "治愈率: %s" % (pattern_zhiyulv.findall(html1)[0])
        except IndexError:
            cure_rate = "治愈率: %s" % ('无')
        # alias (别名)
        try:
            pattern_bieming = re.compile(r'别名:</span>\r\n<spanclass="disease_basic_txt">(.*)</span>')
            alias = '别名 :%s' % (pattern_bieming.findall(html1)[0])
        except IndexError:
            alias = '别名 :%s' % ('无')
        attributes_list = [medical_insurance, contagious, incidence_site, people, time, cure_rate, alias]
        infobox = []
        infobox.extend(attributes_list)
        basic_data = {}
        basic_data['name'] = title
        # basic_data['category'] = category
        basic_data['desc'] = desc
        basic_data['attributes'] = infobox
        return basic_data
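    # These patterns depend on the exact '\r\n' layout of the space-stripped
    # markup. The fields wrapped in try/except fall back to the '无' (none)
    # placeholder when absent; the remaining ones raise an IndexError, which
    # data_spider catches per disease.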
    def symptom_spider(self, url):
        """Parse the prose pages: causes, prevention, differential diagnosis, treatment, examination, diet and nursing."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//p[@class= "article_content_text" or @class= "article_text" or @class = "article_title_num"]')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '').replace('\u3000', ' ')
            if info:
                infobox.append(info)
        return '\n'.join(infobox)
        # return infoboxs.split('二、')[0].split('一、')[1], infoboxs.split('二、')[1]
    def bingfazheng(self, url):
        """Parse the complications ('bfbz') page: returns (names, descriptions)."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps1 = selector.xpath('//div[1]/div[2]/p[2]/a')  # complication names
        ps2 = selector.xpath('//div[1]/div[2]/div/p')   # complication descriptions
        infobox1 = []
        infobox2 = []
        for m, n in zip(ps1, ps2):
            info_m = m.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '').replace('\u3000', ' ')
            info_n = n.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '').replace('\u3000', ' ')
            if info_m:
                infobox1.append(info_m)
            if info_n:
                infobox2.append(info_n)
        return ','.join(infobox1), ','.join(infobox2)
    def zhengzhuang(self, url):
        """Parse the symptoms ('zztz') page into early, late, typical and related symptoms."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//div[1]/div[2]/p')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '').replace('\u3000', ' ')
            if info:
                infobox.append(info)
        early_sym = ''
        last_sym = ''
        related_sym = ''
        typical_sym = ''
        for i in infobox:
            if '早期症状' in i:  # early symptoms
                early_sym = i
            if '晚期症状' in i:  # late symptoms
                last_sym = i
            if '相关症状' in i:  # related symptoms
                related_sym = i
            if '典型症状' in i:  # typical symptoms
                typical_sym = i
        return early_sym, last_sym, typical_sym, related_sym
    def jiancha_spider(self, url):
        """Collect the examination items from the examination ('jcjb') page."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//table/tbody/tr/td[1]/a')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '').replace('\u3000', ' ')
            if info:
                infobox.append(info)
        return '\n'.join(infobox)
    def food(self, url):
        """Parse the diet ('ysbj') page: returns (foods to eat, foods to avoid)."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps1 = selector.xpath('//table[1]/tbody/tr/td[1]')  # recommended foods
        infobox_1 = []
        ps2 = selector.xpath('//table[2]/tbody/tr/td[1]')  # foods to avoid
        infobox_2 = []
        for p in ps1:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '').replace('\u3000', ' ')
            if info:
                infobox_1.append(info)
        for p in ps2:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '').replace('\u3000', ' ')
            if info:
                infobox_2.append(info)
        return '\n'.join(infobox_1), '\n'.join(infobox_2)
    def drug_spider(self, url):
        """Collect the common drug names from the drug ('cyyp') page."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        infobox = selector.xpath('//div/ul[@class="drug-list"]/li/a/@title')
        return infobox
    def jiuzhen_spider(self, url):
        """Parse the consultation guide ('jzzn') page into (question, answer) pairs."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        infobox = []
        for i in range(1, 10):  # the page is assumed to hold at most nine dl blocks
            infobox_sub = []
            ps_title = selector.xpath('//*[@id="contentText"]/dl[%s]/dt' % (i))
            ps_answer = selector.xpath('//*[@id="contentText"]/dl[%s]/dd' % (i))
            infobox_sub.append(self.get_str(ps_title))
            infobox_sub.append(self.get_str(ps_answer))
            infobox.append(infobox_sub)
        return infobox
    def data_spider(self, url_list):
        """Crawl every sub-page of every disease in url_list and return the records."""
        database = []
        count = 0
        max_count = len(url_list)
        for i in url_list:
            basic_url = i + 'jbzs/'         # basic information
            zhengzhuang_url = i + 'zztz/'   # symptoms
            bingyin_url = i + 'blby/'       # causes
            bingfazheng_url = i + 'bfbz/'   # complications
            yufang_url = i + 'yfhl/'        # prevention
            jianbie_url = i + 'jb/'         # differential diagnosis
            zhiliao_url = i + 'yyzl/'       # treatment
            jiuzheng_url = i + 'jzzn/'      # consultation guide
            jiancha_desc_url = i + 'jcjb/'  # examinations
            huli_url = i + 'hl/'            # nursing
            yinshi_url = i + 'ysbj/'        # diet
            drug = i + 'cyyp/'              # common drugs
            try:
                data = {}
                data['url'] = basic_url
                data['basic_info'] = self.basicinfo_spider(basic_url)
                data['early_sym'], data['last_sym'], data['typical_sym'], data['related_sym'] = self.zhengzhuang(zhengzhuang_url)
                data['bingyin'] = self.symptom_spider(bingyin_url)
                data['bingfazheng'], data['bingfazheng_desc'] = self.bingfazheng(bingfazheng_url)
                data['yufang'] = self.symptom_spider(yufang_url)
                data['jianbie'] = self.symptom_spider(jianbie_url)
                data['zhiliao'] = self.symptom_spider(zhiliao_url)
                data['jiuzheng'] = self.jiuzhen_spider(jiuzheng_url)
                data['jiancha_desc'] = self.symptom_spider(jiancha_desc_url)
                data['jiancha'] = self.jiancha_spider(jiancha_desc_url)
                data['huli'] = self.symptom_spider(huli_url)
                data['yinshi_desc'] = self.symptom_spider(yinshi_url)
                data['eat'], data['no_eat'] = self.food(yinshi_url)
                data['drug'] = self.drug_spider(drug)
                # self.col.insert(data)
                database.append(data)
                count += 1
                print(count / max_count)  # progress as a fraction of all diseases
            except Exception:
                # log the failing disease and carry on with the next one
                traceback.print_exc()
                print(data['url'])
        return database
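    # A sketch of persisting each record to MongoDB as it is scraped, assuming
    # a local mongod is running; pymongo's insert_one supersedes the deprecated
    # Collection.insert in the commented-out line inside the loop above:
    # self.col.insert_one(data)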
if __name__ == "__main__":
    med = Medical_spider()
    jbk_list = med.get_jibin_list()
    # test run: crawl every disease and dump the result to JSON
    database = med.data_spider(jbk_list)
    with open("./database_new.json", "w") as f:
        json.dump(database, f)
    print("Finished writing the file...")