"""
@QQ_VX: 240942649
@Date: 2019.08.09
@Author: 小伟科技工作室
@Project: 农产品交易
"""
from gevent import monkey
monkey.patch_socket()  # patch sockets before importing requests so its I/O is cooperative

import re
import execjs
import requests
from lxml import etree
from retry import retry
from openpyxl import Workbook
from loguru import logger
from gevent import pool
from gevent.lock import Semaphore

sem = Semaphore(1)  # serialises worksheet writes from concurrent greenlets


class LongSpider(object):
    """
    Crawler for agricultural product rights trading records (农产品交易).
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            # The anti-bot cookie is set dynamically by set_cookie(), e.g.:
            # 'Cookie': 'YwnBCHQI8xgWI5a=KMITdgVuivvmbO9tpasiosENKZeKWKc8DNGpo9B.dpWZ0H5hL45Bo03jKp6e5yCPqmLf88MFbafEblBLLmLNGJSxJw16SFG4vWNGlmr_OaHIN'
        }
        self.wb = Workbook()
        self.ws = self.wb.active
        # Column headers (original Chinese labels kept; note the source had a
        # typo, 公式日期, for 公示日期 "announcement date"):
        self.ws.append(['Project ID (项目编号)', 'Project Name (项目名称)', 'Trading Location (交易地点)',
                        'Announcement Date (公示日期)', 'Transfer Method (流转方式)', 'Transfer Term (流转期限)',
                        'Renewal (是否续租)', 'Transfer Purpose (流转用途)', 'Trading Area (交易面积)',
                        'Trading Method (交易方式)', 'Transaction Price (成交价)'])
        self.pool = pool.Pool(10)  # at most 10 concurrent detail-page fetches
        # script/js.js holds the reverse-engineered cookie algorithm for the botgate
        with open('script/js.js', 'r', encoding='utf-8') as file:
            js = file.read()
        self.ctx = execjs.compile(js)

    @staticmethod
    def extract_first(values):
        """Return the first non-empty value with all whitespace removed, or a tab placeholder."""
        for value in values:
            if value != '' and value is not None:
                return re.sub(r'\s', '', value)
        return '\t'
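
    # Behaviour sketch (illustrative, not part of the original source):
    #     LongSpider.extract_first(['', None, ' 12 34 '])  -> '1234'
    #     LongSpider.extract_first([])                     -> '\t' (placeholder)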

    @retry(ValueError, tries=10)
    def search_list(self, page):
        """
        Fetch one page of the search listing and schedule detail fetches.
        :param page: 1-based page number of the listing
        :return:
        """
        url = 'http://www.jsnc.gov.cn/nccqjy/portal.do?method=province_cj_gg_list'
        post_data = {
            'unitId': '1',
            'page': str(page),
            'selectName': '江苏省',  # Jiangsu Province
            'unitID': '',
            'proType': '40286f8147d283fc0147d2a637c00001',
            'proArea': '',
            'proCode': '',
            'startTime': '2018-01-01',
            'endTime': '2018-12-31',
        }
        response = requests.post(url, data=post_data, headers=self.headers)
        logger.debug(f'status={response.status_code}')
        # The botgate serves a JS challenge page instead of results when the
        # cookie is missing or expired; refresh it and let @retry re-run the request.
        if 'JLyKZlWgYjpTkAsEt9LnA' in response.text:
            self.set_cookie()
            raise ValueError('refresh cookie')
        html = etree.HTML(response.content)
        items = html.xpath('//table[@class="show_data"]/tr')[1:]  # skip header row
        for item in items:
            p_id = self.extract_first(item.xpath('./td[1]/text()'))
            name = self.extract_first(item.xpath('./td[2]/a/text()'))
            address = self.extract_first(item.xpath('./td[3]/span/text()'))
            date = self.extract_first(item.xpath('./td[4]/text()'))
            # The detail path is the first single-quoted string in the onclick attribute
            onclick = self.extract_first(item.xpath('./td[2]/a/@onclick'))
            found = re.findall(r"'(.+?)'", onclick)
            if not found:
                continue
            link = 'http://www.jsnc.gov.cn' + found[0]
            line = [p_id, name, address, date]
            # self.get_info(link, line)  # synchronous alternative for debugging
            self.pool.spawn(self.get_info, link, line)
        self.wb.save('info.xlsx')  # checkpoint after every listing page

    @retry(ValueError, tries=10)
    def get_info(self, link, s_line):
        """
        Fetch a detail page and append the completed row to the worksheet.
        :param link: absolute URL of the detail page
        :param s_line: partial row [project id, name, location, date] from the listing
        :return:
        """
        response = requests.get(link, headers=self.headers)
        if 'JLyKZlWgYjpTkAsEt9LnA' in response.text:
            self.set_cookie()
            raise ValueError('refresh cookie')
        html = etree.HTML(response.text)
        # Transfer method (流转方式)
        flow_way = self.extract_first(html.xpath('//td[contains(text(), "流转方式")]/following-sibling::td[1]/text()'))
        # Transfer term (流转期限)
        flow_limit = self.extract_first(html.xpath('//td[contains(text(), "流转期限")]/following-sibling::td[1]/text()'))
        # Renewal (是否续租)
        is_zu = self.extract_first(html.xpath('//span[contains(text(), "是否续租")]/../text()'))
        # Transfer purpose (流转用途)
        desc = self.extract_first(html.xpath('//span[contains(text(), "流转用途")]/../text()'))
        # Trading area (交易面积)
        trad_area = self.extract_first(html.xpath('//span[contains(text(), "交易面积")]/../text()'))
        # Trading method (交易方式)
        trad_way = self.extract_first(html.xpath('//span[contains(text(), "交易方式")]/../text()'))
        # Transaction price (成交价)
        price = self.extract_first(html.xpath('//span[contains(text(), "成交价")]/../text()'))
        line = s_line + [flow_way, flow_limit, is_zu, desc, trad_area, trad_way, price]
        with sem:  # greenlets share one worksheet, so serialise the appends
            self.ws.append(line)
        logger.info(f'fetched record: {line[1]}')

    def set_cookie(self):
        """
        Solve the JS challenge and install the anti-bot cookie on self.headers.
        :return:
        """
        js_url = 'http://www.jsnc.gov.cn/mRnE3GFBhtb7/MXC5cdd/a6a1a7'
        response = requests.get(js_url, headers=self.headers)
        # The challenge page defines _$g0(), whose return value seeds the cookie
        go = re.findall(r'function _\$g0\(\){return "(.+?)"', response.text)
        if go:
            cookie = self.ctx.call('get_cookie', go[0])
            cookie = f'YwnBCHQI8xgWI5a={cookie}'
            logger.info(f'set cookie={cookie}')
            self.headers['Cookie'] = cookie
        else:
            raise ValueError('failed to set cookie')
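
    # Illustrative shape of the challenge page (this token is made up):
    #     <script>function _$g0(){return "KMITdgVuiv..."}</script>
    # script/js.js re-implements the browser-side transform, so
    # get_cookie(seed) yields the value the server expects for YwnBCHQI8xgWI5a.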

    def crawl(self):
        self.set_cookie()
        for page in range(1, 1340):  # 1339 listing pages for the 2018 window
            logger.info(f'current page: {page}')
            self.search_list(page)
        self.pool.join()  # wait for all in-flight detail fetches
        self.wb.save('info.xlsx')


if __name__ == '__main__':
    spider = LongSpider()
    spider.crawl()
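
# Usage sketch (assumes script/js.js sits next to this file; package names are
# the usual PyPI ones for these imports):
#     pip install requests lxml retry openpyxl loguru gevent PyExecJS
#     python spider.py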