-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathtwitter_user.py
150 lines (128 loc) · 5.15 KB
/
twitter_user.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import urllib
import os
import eventlet
import time
import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from config import *
eventlet.monkey_patch()
chrome_options = Options()
chrome_options.add_argument('--headless') # 使用无头谷歌浏览器模式
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument("--proxy-server=" + proxy_server)
driver = webdriver.Chrome(options=chrome_options)
base_url = 'https://twitter.com/'
# urllib 获取图片并保存
def save_img(path_img, url, save_name):
file_name_full = path_img + save_name + '.png'
if os.path.exists(file_name_full) and os.path.getsize(file_name_full) > 0:
print('pic:' + save_name + '.png exists')
return 1
with eventlet.Timeout(TIME_OUT, False): # 设置超时时间为10秒
bytes = urllib.request.urlopen(url)
f = open(file_name_full, 'wb')
f.write(bytes.read())
f.flush()
f.close()
print('pic:' + save_name + '.png save success...')
return 1
return 0
# 图片保存命名
def save_img_src(img_src):
if 'profile_banners' in str(img_src):
save_name = username + '_banner'
elif 'profile_images' in str(img_src):
if '400x400' in str(img_src):
save_name = username + '_avatar'
else:
return 1
elif 'emoji' in str(img_src):
save_name = 'emoji_' + str(str(img_src).split('/')[-1][:-4])
else:
save_name = username + '_' + str(str(img_src).split('/')[-1][:-4])
return save_img(path_img, img_src, save_name)
# 插入图片src到数据库
def insert_user_info(username, img_src):
try:
cue.execute("insert ignore into twitter_img "
"(username, img_src) "
"values (%s,%s)",
[username, img_src])
except Exception as e:
print('Insert error:', e)
con.rollback()
else:
con.commit()
# selenium driver 获取用户meida下图片链接
def get_users_media_driver(username):
start_url = base_url + username + '/media'
print('driver get url:', start_url)
driver.get(start_url)
timeline_ok = driver.find_elements_by_xpath('//div[@class="ProfileTimeline "]')
print(timeline_ok)
if len(timeline_ok) < 1:
print(username, '媒体页面加载失败')
lst_usernames.append(username) # 放到序列后面再次尝试
return
# 判断是否滑到最后了
lst_end = driver.find_elements_by_xpath('//div[@class="timeline-end has-items"]')
lst_no_end = driver.find_elements_by_xpath('//div[@class="timeline-end has-items has-more-items"]')
count_scroll = 0
while len(lst_end) < 1 or len(lst_no_end) > 0 and count_scroll < 200: # 限制滑动次数
# 滑倒底
js = "window.scrollTo(0,document.body.scrollHeight)"
driver.execute_script(js)
count_scroll += 1
print(username, 'scroll:', count_scroll)
time.sleep(2)
lst_end = driver.find_elements_by_xpath('//div[@class="stream-end-inner"]')
lst_no_end = driver.find_elements_by_xpath('//div[@class="timeline-end has-items has-more-items"]')
# 获取所有图片链接
lst_img = [img.get_attribute('src') for img in driver.find_elements_by_xpath('//img')]
lst_img = list(set(lst_img))
count_all = len(lst_img)
print(username, 'unique img count:', count_all)
for count in range(count_all):
img_src = lst_img[count]
print(username, count, '/', count_all, img_src)
if img_src != None:
# 插入到数据库
insert_user_info(username, img_src)
def down_img(username=None):
if username:
cue.execute("select count(*) from twitter_img where username = '" + username + "' and is_crawled = 0")
else:
cue.execute("select count(*) from twitter_img where is_crawled = 0")
count_all = cue.fetchone()[0]
print('down img count all:', count_all)
count = 0
while count < count_all:
# 随机返回一条未爬取的图片链接
if username:
cue.execute(
"select img_src from twitter_img where username = '" + username + "' and is_crawled = 0 ORDER BY RAND() limit 1")
else:
cue.execute("select img_src from twitter_img where is_crawled = 0 ORDER BY RAND() limit 1")
img_src = cue.fetchone()[0]
print(username, count, '/', count_all, img_src)
r = save_img_src(img_src)
if r:
# 标记为已爬取
cue.execute("update twitter_img set is_crawled = 1 where img_src = (%s)", img_src)
con.commit()
count += 1
if __name__ == '__main__':
con = pymysql.connect(host=host, user=user, passwd=psd, db=db, charset=c,
port=port)
cue = con.cursor()
if TYPE_DB_OR_IMG == 1:
for username in lst_usernames:
get_users_media_driver(username)
elif TYPE_DB_OR_IMG == 2:
for username in lst_usernames:
path_img = PATH_IMGS + username + '/'
if not os.path.exists(path_img):
os.makedirs(path_img)
down_img(username)