# douban.py
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
import json
from bs4 import BeautifulSoup
import re
import requests
import ContactRatio, datetime, pytz
import sqlite3
def index():
    """Collect Douban user ids from a movie's review pages into SQLite.

    Requests 255 listing pages of movie subject 1295644 (the ``start``
    query value advances by 20 per page), extracts user ids from links of
    the form ``https://www.douban.com/people/<id>/`` and, the first time an
    id is seen, inserts it into table ``Leon`` of ``test.db`` together with
    a running sequence number that starts at 1.  When the crawl is done the
    list of unique ids and its length are printed.

    Returns:
        None.

    Raises:
        urllib.error.URLError: propagated from ``askurl`` when a page
            cannot be fetched.
        sqlite3.Error: if an insert fails; this function does not create
            the ``Leon`` table itself.
    """
    base_url = 'https://movie.douban.com/subject/1295644/reviews?start='
    # Compiled once up front: the pattern is identical for every page.
    user_link = re.compile(r'href="https://www.douban.com/people/(.*?)/">')
    insert_sql = '''
        insert into Leon (num,id)
        values(?,?);
    '''

    final = []    # unique ids in first-seen order (printed at the end)
    seen = set()  # the same ids, kept for O(1) duplicate checks

    # One connection for the whole crawl instead of one per inserted row;
    # the finally clause guarantees it is closed even if a request or an
    # insert raises part-way through.
    conn = sqlite3.connect("test.db")
    try:
        for i in range(255):
            html = askurl(base_url + str(i * 20))
            soup = BeautifulSoup(html, "html.parser")

            # find_all() without a filter yields every tag in the page.
            # Only the first link found in each tag's markup is considered,
            # exactly as before.
            for tag in soup.find_all():
                matches = user_link.findall(str(tag))
                if not matches:
                    continue
                user_id = matches[0]
                if user_id in seen:
                    continue
                seen.add(user_id)
                final.append(user_id)

                # Add the row.  ``with conn`` commits on success and rolls
                # back on error, so every stored row is durable immediately
                # (same per-row commit behaviour as before).  The sequence
                # number equals the count of unique ids found so far.
                with conn:
                    conn.execute(insert_sql, (len(final), user_id))

            # Pause after every 10th page (including the very first one).
            # NOTE(review): ContactRatio is not a standard-library module;
            # presumably its sleep() delays like time.sleep — confirm.
            if i % 10 == 0:
                ContactRatio.sleep(20)
    finally:
        conn.close()

    print(final)
    print(len(final))
def askurl(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request is sent with a desktop-browser ``user-agent`` header.

    Args:
        url: Address to request.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: if the request fails (``HTTPError`` for
            HTTP error statuses).
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.68'
    }
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response (and its socket) even when
    # read() or decode() raises, instead of leaving it to the garbage
    # collector.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    index()