-
Notifications
You must be signed in to change notification settings - Fork 0
/
qidian.py
56 lines (46 loc) · 1.72 KB
/
qidian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import xlwt
import requests
from lxml import etree
import time
import re
# Shared accumulator: get_info() appends one
# [title, author, style, complete, introduce] row per scraped book.
all_info_list = []

# HTTP headers sent with every page request; the User-Agent string
# identifies the client as a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    ),
}
def _first_text(node, path):
    """Return the first result of ``node.xpath(path)``, or '' if there is none."""
    matches = node.xpath(path)
    return matches[0] if matches else ''


def get_info(url, timeout=10):
    """Scrape one listing page and append its books to ``all_info_list``.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds ``requests`` waits on the server before raising,
            so a stalled connection cannot hang the crawl indefinitely.

    Every ``<li>`` inside ``<ul class="all-img-list cf">`` produces one row
    ``[title, author, style, complete, introduce]``. A field whose node is
    absent from the page is recorded as an empty string rather than raising
    ``IndexError`` and aborting the whole run.

    Returns:
        None. Rows are appended to the module-level ``all_info_list``.
    """
    html = requests.get(url, headers=headers, timeout=timeout)
    selector = etree.HTML(html.text)
    for info in selector.xpath('//ul[@class="all-img-list cf"]/li'):
        title = _first_text(info, 'div[2]/h4/a/text()')
        author = _first_text(info, 'div[2]/p[1]/a[1]/text()')
        style_1 = _first_text(info, 'div[2]/p[1]/a[2]/text()')
        style_2 = _first_text(info, 'div[2]/p[1]/a[3]/text()')
        # Category and sub-category joined with a middle dot; a missing
        # part is dropped so no dangling separator is written.
        style = '·'.join(part for part in (style_1, style_2) if part)
        complete = _first_text(info, 'div[2]/p[1]/span/text()')
        introduce = _first_text(info, 'div[2]/p[2]/text()').strip()
        # The word count ('div[2]/p[3]/span/text()') is marked as dynamic
        # content in the original code and is intentionally not scraped.
        all_info_list.append([title, author, style, complete, introduce])
    # Pause once per page to avoid hammering the server.
    time.sleep(1)
if __name__ == '__main__':
    # Crawl listing pages 1 through 49.
    # Note: an .xls sheet holds at most 65536 rows; writing more rows
    # requires the openpyxl library (.xlsx) instead of xlwt.
    urls = ['https://www.qidian.com/all?page=%s' % i for i in range(1, 50)]
    for url in urls:
        get_info(url)
        # Raw string: '\d' inside a plain literal is an invalid escape
        # sequence. The pattern picks the trailing page number off the URL.
        page = re.search(r'\d+$', url)
        print('page', page.group(), 'has crawled!')

    # Dump everything collected into a single-sheet workbook:
    # row 0 is the header, rows 1..n are the scraped books.
    header = ['title', 'author', 'style', 'complete', 'introduce']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for col, name in enumerate(header):
        sheet.write(0, col, name)
    for row, data_list in enumerate(all_info_list, start=1):
        for col, data in enumerate(data_list):
            sheet.write(row, col, data)
    book.save('novels_qidian.xls')