from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from lxml import etree
import pandas
import json
import datetime
import os
import time
import re
import selenium.common.exceptions
# from get_cookie import get_cookie
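
# Interactive Weibo search scraper: prompts for a keyword, scope and time
# window, walks the s.weibo.com result pages with a logged-in cookie session,
# opens each post and appends its details to weibo_spider.csv. Assumes
# cookies.txt (a JSON list of Selenium cookie dicts) sits next to this script;
# re-run get_cookie() if the cookies have expired.


# Possible refactor, sketched here but not called below: the repost/comment/
# like blocks in auto_search share one normalisation pattern. zero_label is
# the literal placeholder Weibo shows when a count is zero (e.g. '转发').
def parse_count(texts, zero_label):
    """Join xpath text nodes, map the zero placeholder to 0, expand '万' (x10,000)."""
    text = ''.join(texts).strip()
    if text == zero_label:
        return 0
    if '万' in text:
        return int(float(text[:-1]) * 10000)
    return text
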
class GetWeibo:
    browser_options = Options()
    # Run Chrome headless (no visible browser window)
    browser_options.add_argument("--headless")
    # Disable the sandbox (sometimes needed in container environments)
    # browser_options.add_argument("--no-sandbox")
    browser = webdriver.Chrome(options=browser_options)
    # Note: this header dict is never used by the Selenium flow below
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
                             '/103.0.0.0 Safari/537.36'}
    print("Browser created successfully.")
    def __init__(self):
        self.base_url = 'https://s.weibo.com/weibo'
        self.keywords = None
        self.origin = None
        self.time_judge = None
        # If the cookies have expired, run get_cookie() again:
        # get_cookie()
        self.main()
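
    # open_search: interactively collects the keyword, scope (original posts
    # only or all posts), cut-off time and start page, logs in via cookies.txt,
    # and returns the first result-list URL plus a zeroed post counter.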
    def open_search(self):
        self.browser.get(self.base_url)
        self.browser.delete_all_cookies()
        time.sleep(8)
        print(f'Weibo search page {self.browser.current_url} opened successfully...')
        kw = self.browser.find_element(By.XPATH, ('//div[@class="searchbox"]/div[@class="search-input"]/'
                                                  'input[@type="text"]'))
        self.keywords = input('Enter the Weibo search keyword and press Enter to confirm: ')
        print(f'Search keyword: {self.keywords}.')
        while True:
            self.origin = input('Enter 1 to search all posts, or just press Enter to search original posts only: ')
            if self.origin == '':
                self.origin = '&scope=ori'
                print('Searching original posts only.')
                break
            elif self.origin == '1':
                self.origin = '&typeall=1'
                print('Searching all posts.')
                break
            else:
                print('Invalid input, please try again.')
                continue
        while True:
            date_time = input('Enter the cut-off time for posts as year-month-day-hour (e.g. 2022-08-03-07) '
                              'and press Enter, or just press Enter to use the current time: ')
            if date_time == '':
                date_format = '%Y-%m-%d-%H'
                date_time = datetime.datetime.now().strftime(date_format)
                date_time = (datetime.datetime.strptime(date_time, date_format) + (
                    datetime.timedelta(hours=+1))).strftime(date_format)
                print('Cut-off time: now.')
                break
            # Validate YYYY-MM-DD-HH with per-month day ranges (leap days past
            # Feb 28 are rejected) and hour 00-23
            elif re.match(r'(2\d{3})-'
                          r'('
                          r'((0[13578]|1[02])-(0[1-9]|[12]\d|3[01])-)|'
                          r'((0[469]|11)-(0[1-9]|[12]\d|30)-)|'
                          r'(02-(0[1-9]|1\d|2[0-8])-)'
                          r')'
                          r'([01]\d|2[0-3])$', date_time) is None:
                print('Invalid time format, please try again!')
                continue
            else:
                print(f'Cut-off time: {date_time}.')
                break
        self.time_judge = datetime.datetime.strptime(date_time, '%Y-%m-%d-%H')
        while True:
            page_begin = input('Enter the start page of the result list (1 to 50) and press Enter, '
                               'or just press Enter to start from page 1: ')
            if page_begin == '':
                print('Start page: page 1.')
                break
            elif re.match(r'([1-9]|[1-4]\d|50)$', page_begin) is None:
                print('Invalid start page, please try again!')
                continue
            else:
                print(f'Start page: page {page_begin}.')
                page_begin = '&page=' + str(page_begin)
                break
        kw.send_keys(self.keywords)
        click_search = self.browser.find_element(By.XPATH, '//div[@class="searchbox"]/button[@class="s-btn-b"]')
        click_search.click()
        time.sleep(1)
        click_list = self.browser.find_element(By.XPATH, '//div[@class ="m-main-nav"]/ul/li[2]/a')
        click_list.click()
        time.sleep(1)
        print(f'Post list page {self.browser.current_url} opened successfully; the list is sorted newest first.')
        with open('cookies.txt', 'r') as f:
            cookies_list = json.load(f)
        for cookie in cookies_list:
            # Selenium requires an integer expiry; cookies saved with float expiries break add_cookie
            if isinstance(cookie.get('expiry'), float):
                cookie['expiry'] = int(cookie['expiry'])
            self.browser.add_cookie(cookie)
        self.browser.refresh()
        date_format = '%Y-%m-%d-%H'
        # Query a 31-day window ending at the cut-off time
        date_past = (datetime.datetime.strptime(date_time, date_format) + datetime.timedelta(days=-31)).strftime(
            date_format)
        url = self.browser.current_url
        url_change = re.search(r'(.*)(?=q=)', url)
        url = url_change.group() + f'q={self.keywords}{self.origin}&suball=1&timescope=custom:{date_past}:{date_time}&Refer=g{page_begin}'
        print(f'Scrape started at: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
        search_times = 0
        return url, search_times
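
    # auto_search: scrapes one result-list page, visiting every post on it and
    # appending the details to weibo_spider.csv. When the cut-off is exceeded
    # or the "next" button is missing (taken as rate limiting), it sleeps just
    # over an hour; on page 50 it shifts the timescope window back to keep
    # paging. Returns the next URL to fetch and the running post counter.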
    def auto_search(self, url, search_times):
        if url != self.browser.current_url:
            self.browser.get(url)
        print(f'Post list page {self.browser.current_url} opened, scraping...')
        time.sleep(1)
        data = etree.HTML(self.browser.page_source)
        post_url = data.xpath('//p[@class="from"]/a[1]/@href')
        if len(post_url) == 0:
            post_url = data.xpath('//div[@class="from"]/a[1]/@href')
        df = pandas.DataFrame(
            columns=['account', 'post_time', 'platform', 'content', 'reposts', 'comments', 'likes', 'post_url'])
        for index, url_single in enumerate(post_url):
            url = 'https:' + url_single
            print(url)
            while True:
                self.browser.get(url)
                time.sleep(1)
                post = etree.HTML(self.browser.page_source)
                names = post.xpath('//a[@usercard]/span[@title]/text()')
                print(names)
                time_ = post.xpath('//a[@title][@href][@class][1]/text()')
                # The page shows a two-digit year, so prepend the century
                time_ = f'20{"".join(time_).strip()}'
                if time_ == '20':
                    # An empty timestamp means the page did not load properly; retry
                    print('Parse error, retrying...')
                    time.sleep(60)
                    self.browser.back()
                    continue
                elif index == 0:
                    try:
                        time_mark = datetime.datetime.strptime(time_, '%Y-%m-%d %H:%M')
                        if time_mark > self.time_judge:
                            # A timestamp past the cut-off suggests the timescope filter
                            # was dropped (rate limited); sleep just over an hour
                            next_time = (datetime.datetime.now() + (datetime.timedelta(seconds=+3601))).strftime(
                                '%Y-%m-%d %H:%M:%S')
                            _target_time = datetime.datetime.strptime(next_time, '%Y-%m-%d %H:%M:%S')
                            _time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
                            print(f'Hourly request limit reached. It is now {_time}, {search_times} posts '
                                  f'scraped so far; next run at {next_time}, sleeping...')
                            while datetime.datetime.now() < _target_time:
                                time.sleep(60)
                            self.browser.back()
                            click_next = None
                            while True:
                                try:
                                    click_next = self.browser.find_element(By.XPATH,
                                                                           '//div[@class="m-page"]/div/a[@class="next"]')
                                    break
                                except selenium.common.exceptions.NoSuchElementException as E:
                                    # No "next" button: still rate limited, sleep another hour
                                    print(repr(E))
                                    next_time = (
                                            datetime.datetime.now() + (datetime.timedelta(seconds=+3601))).strftime(
                                        '%Y-%m-%d %H:%M:%S')
                                    _target_time = datetime.datetime.strptime(next_time, '%Y-%m-%d %H:%M:%S')
                                    _time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
                                    print(f'Hourly request limit reached. It is now {_time}, {search_times} posts '
                                          f'scraped so far; next run at {next_time}, sleeping...')
                                    while datetime.datetime.now() < _target_time:
                                        time.sleep(120)
                                    self.browser.back()
                                    continue
                            click_next.click()
                            url = self.browser.current_url
                            return url, search_times
                    except ValueError as VE:
                        print(repr(VE))
                        time.sleep(60)
                        self.browser.back()
                        continue
                break
            print(time_)
            # The posting platform/source is split across two sibling nodes
            from1 = post.xpath('//div[@class="woo-box-flex"]/div[@title]/text()')
            from2 = post.xpath('//div[@class="woo-box-flex"]/div[contains(@class, "head-info_cut")]/text()')
            from1 = ''.join(from1)
            from2 = ''.join(from2)
            from_all = from1 + from2
            blogs = post.xpath('//div[contains(@class, "detail_text")]/div/text()')
            blogs = ''.join(blogs)
            forward = post.xpath('//span[@class="woo-pop-ctrl"]/div/span/text()')
            # The toolbar shows the literal label " 转发 " (repost) when the count is zero
            forward = ''.join('0' if i == ' 转发 ' else i for i in forward)
            # Expand counts like "1.2万" (万 = 10,000) into plain integers
            if '万' in forward:
                forward = int(float(forward[0:-1]) * 10000)
            comments = post.xpath('//div[contains(@class, "woo-box-item-flex toolbar_item")]'
                                  '/div[contains(@class, "woo-box-flex")]/span/text()')
            # The label " 评论 " (comment) appears when the count is zero
            comments = ''.join('0' if i == ' 评论 ' else i for i in comments)
            if '万' in comments:
                comments = int(float(comments[0:-1]) * 10000)
            likes = post.xpath('//div[contains(@class, "toolbar_likebox")]/button/span[@class="woo-like-count"]/text()')
            # The label "赞" (like) appears when the count is zero
            likes = ''.join('0' if i == '赞' else i for i in likes)
            if '万' in likes:
                likes = int(float(likes[0:-1]) * 10000)
            key_list = ['account', 'post_time', 'platform', 'content', 'reposts', 'comments', 'likes', 'post_url']
            info_list = [names, time_, from_all, blogs, forward, comments, likes, url]
            csv_info = dict(zip(key_list, info_list))
            df1 = pandas.DataFrame(csv_info, columns=key_list)
            df = pandas.concat([df, df1])
            time.sleep(.5)
            self.browser.back()
            search_times += 1
        # Append this page's rows; write the header only when creating the file
        df.to_csv('weibo_spider.csv', mode='a', encoding='utf_8_sig',
                  header=not os.path.exists('weibo_spider.csv'), index=False)
        url = self.browser.current_url
        page_num = url.split('page=')[-1]
        if page_num.isdigit() is False:
            page_num = '1'
        page_num = int(page_num)
        _time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
        print(f'Page {page_num} scraped and appended to the CSV file! It is now {_time}, '
              f'{search_times} posts scraped so far.')
        if page_num == 50:
            post = etree.HTML(self.browser.page_source)
            time_last = post.xpath('//p[@class="from"]/a[1]/text()')
            if len(time_last) == 0:
                time_last = post.xpath('//div[@class="from"]/a[1]/text()')
            time_last = time_last[-1]
            # List timestamps come in three shapes: "2021年8月3日 07:00" (other
            # years), "今天 07:00" (today) and "8月3日 07:00" (this year)
            nums = re.findall(r'\d+', time_last)
            if '年' in time_last:
                year_num, mon_num, day_num, hour_num, min_num = nums[0], nums[1], nums[2], nums[3], nums[4]
            elif '今天' in time_last:
                year_num = str(datetime.datetime.today().year)
                mon_num = str(datetime.datetime.today().month)
                day_num = str(datetime.datetime.today().day)
                # A "今天 HH:MM" text carries only the hour and minute digits
                hour_num, min_num = nums[0], nums[1]
            else:
                year_num = str(datetime.datetime.today().year)
                mon_num, day_num, hour_num, min_num = nums[0], nums[1], nums[2], nums[3]
            time_last = f'{year_num}-{mon_num}-{day_num} {hour_num}:{min_num}'
            time_last = datetime.datetime.strptime(time_last, '%Y-%m-%d %H:%M')
            time_last = datetime.datetime.strftime(time_last, '%Y-%m-%d-%H')
            date_format = '%Y-%m-%d-%H'
            # Shift the search window back: end just after the oldest post seen,
            # start 31 days earlier, and resume from page 1
            time_begin = (datetime.datetime.strptime(time_last, date_format) +
                          datetime.timedelta(days=-31)).strftime(date_format)
            time_end = (datetime.datetime.strptime(time_last, date_format) +
                        datetime.timedelta(hours=+1)).strftime(date_format)
            print(f'This is page 50; the last post on it is from {time_last}. It is now {_time}, '
                  f'{search_times} posts scraped so far. Shifting the time window...')
            url = self.browser.current_url
            url_change = re.search(r'(.*)(?=q=)', url)
            url = url_change.group() + f'q={self.keywords}{self.origin}&suball=1&timescope=custom:{time_begin}:{time_end}&Refer=g&page=1'
            return url, search_times
        click_next = None
        while True:
            try:
                click_next = self.browser.find_element(By.XPATH,
                                                       '//div[@class="m-page"]/div/a[@class="next"]')
                break
            except selenium.common.exceptions.NoSuchElementException as E:
                # No "next" button: treat as rate limiting and sleep just over an hour
                print(repr(E))
                next_time = (datetime.datetime.now() + (datetime.timedelta(seconds=+3601))).strftime(
                    '%Y-%m-%d %H:%M:%S')
                _target_time = datetime.datetime.strptime(next_time, '%Y-%m-%d %H:%M:%S')
                _time = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
                print(f'Hourly request limit reached. It is now {_time}, {search_times} posts scraped so far; '
                      f'next run at {next_time}, sleeping...')
                while datetime.datetime.now() < _target_time:
                    time.sleep(60)
                self.browser.back()
                continue
        click_next.click()
        url = self.browser.current_url
        return url, search_times
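
    # main: runs the interactive setup once, then keeps following result pages
    # until the process is interrupted (Ctrl+C); there is no normal exit.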
    def main(self):
        url, search_times = self.open_search()
        while True:
            url, search_times = self.auto_search(url, search_times)


if __name__ == '__main__':
    gt = GetWeibo()
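
# Usage: have a ChromeDriver compatible with your Chrome available (recent
# Selenium versions can fetch one automatically) and put cookies.txt (a JSON
# list of Selenium cookie dicts from a logged-in session) next to this script,
# then run:
#     python weiboSpider_v1.0.3.1.py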