crawling.py
from selenium import webdriver as wd
from bs4 import BeautifulSoup
import time
import argparse
import json


def getText(html_tag):
    # Remove newlines, tabs, and spaces from the scraped text.
    return html_tag.replace('\n', '').replace('\t', '').replace(' ', '')


def getJson(tag):
    # Use a local name other than "json" so the imported json module is not shadowed.
    comment = {}
    user_channel = tag.select('a.ytd-comment-renderer')[0].get('href')
    comment['user_channel'] = user_channel
    user_text = tag.select('a.ytd-comment-renderer > span')[0].text
    comment['user_text'] = getText(user_text)
    user_image = tag.select('img.yt-img-shadow')[0].get('src')
    comment['user_image'] = user_image
    user_time = tag.select('a.yt-formatted-string')[0].text
    comment['user_time'] = getText(user_time)
    user_comment_good_count = tag.select('span.ytd-comment-action-buttons-renderer')[0].text
    comment['user_comment_good_count'] = getText(user_comment_good_count)
    user_comment = tag.select('div.ytd-expander')[0]
    comment['user_comment'] = str(user_comment)
    # user_expander_comment = tag.select('div.ytd-comment-replies-renderer')
    # if user_expander_comment:
    #     comment['user_expander_comment'] = getText(user_expander_comment[0].text)
    print(comment)
    return comment


parser = argparse.ArgumentParser(description='Crawl the comments of a YouTube video.')
parser.add_argument('--url', required=True, help='URL of the YouTube video to crawl')
args = parser.parse_args()
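# Example invocation (VIDEO_ID is a placeholder, not a real video id):
#   python crawling.py --url "https://www.youtube.com/watch?v=VIDEO_ID"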
# Assumes chromedriver is installed at this path (Selenium 3-style executable_path argument).
driver = wd.Chrome(executable_path="/usr/local/bin/chromedriver")
url = args.url
driver.get(url)

# Keep scrolling to the bottom until the page height stops growing,
# so that every comment is loaded before the page source is captured.
last_page_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(3.0)
    new_page_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_page_height == last_page_height:
        break
    last_page_height = new_page_height
html_source = driver.page_source
driver.close()

soup = BeautifulSoup(html_source, 'lxml')
result_list = []
# Each ytd-comment-thread-renderer element holds one top-level comment thread.
youtube_tags = soup.select('div.ytd-item-section-renderer ytd-comment-thread-renderer.ytd-item-section-renderer')
for tag in youtube_tags:
    # getJson already prints the parsed comment, so call it only once per tag.
    result_list.append(getJson(tag))
    print("===========")

print(result_list)

# Write the results to a JSON file; ensure_ascii=False keeps Korean text readable.
with open("student_file.json", "w", encoding="utf-8") as json_file:
    json.dump(result_list, json_file, ensure_ascii=False)
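
# A quick sanity check (a sketch, not part of the original script): re-read the
# file that was just written and report how many comments were saved.
with open("student_file.json", encoding="utf-8") as check_file:
    saved_comments = json.load(check_file)
print("Saved", len(saved_comments), "comments to student_file.json")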