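"""Quora crawler (Python 2, Selenium + ChromeDriver, BeautifulSoup).

Starting from the Preventive-Medicine topic, the script:
  1. crawlTopicHierarchy()  - walks the child-topic tree and writes
     topic_names.txt and topic_urls.txt
  2. crawlTopicQuestions()  - visits each topic page and writes question URLs
     to question_urls.txt
  3. crawlQuestionData()    - visits each question page and writes answer
     records to answers.csv and the users it encounters to users.txt
  4. crawlUser()            - visits each unique user profile and writes user
     records to users.csv

The CSS classes and string splits below target the Quora page layout this
crawler was written against and may need updating if the site has changed.
"""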
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os
DEBUG = 1
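
# Every crawl function below repeats the same idiom for Quora's infinite-scroll
# pages: scroll to the bottom until the page source stops changing. The helper
# below collects that idiom in one place as a sketch; the functions below still
# inline the loop themselves.
def scrollUntilStable(browser, delay=0.5):
    src_updated = browser.page_source
    src = ""
    while src != src_updated:
        time.sleep(delay)
        src = src_updated
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        src_updated = browser.page_source
    return src_updated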

def crawlTopicHierarchy():
    if (DEBUG): print "In crawlTopicHierarchy()..."
    # Create files for topic names and topic URLs
    file_topic_names = open("topic_names.txt", mode = 'w')
    file_topic_urls = open("topic_urls.txt", mode = 'w')
    # Starting node link
    url = 'http://www.quora.com/Preventive-Medicine?share=1'
    depth = 0
    topic_names_hierarchy = ""
    # Create stacks to keep track of links to visit and links visited
    urls_to_visit = []
    urls_visited = []
    # Add root to stack
    urls_to_visit.append([url, depth])
    #if (DEBUG): print urls_to_visit
    while (len(urls_to_visit)):
        # Pop the top of the stack to get URL and current depth
        url, current_depth = urls_to_visit.pop()
        if (DEBUG): print 'Current url:{0} current depth:{1} depth:{2}'.format(url, str(current_depth), str(depth))
        # Page name is the URL path after "http://www.quora.com/"
        page_name = url[21:].split('?')[0]
        if (DEBUG): print page_name
        urls_visited.append([url, page_name])
        # Moving back up the tree: trim one level off the hierarchy string per step
        if (current_depth < depth):
            for i in range(depth - current_depth):
                j = topic_names_hierarchy.rfind(" ")
                if (j != -1):
                    topic_names_hierarchy = topic_names_hierarchy[:j]
            depth = current_depth
        # Record topic name
        if (depth == 0):
            file_topic_names.write((page_name + '\n').encode('utf-8'))
        else:
            file_topic_names.write((topic_names_hierarchy + " " + page_name + '\n').encode('utf-8'))
        depth += 1
        # Record topic URL
        file_topic_urls.write((url + '\n').encode('utf-8'))
        url_about = url.split('?')[0] + "/about?share=1"
        chromedriver = "chromedriver" # Needed?
        os.environ["webdriver.chrome.driver"] = chromedriver # Needed?
        browser = webdriver.Chrome()
        browser.get(url_about)
        # Fetch /about page, scrolling until the page source stops changing
        src_updated = browser.page_source
        src = ""
        while src != src_updated:
            time.sleep(.5)
            src = src_updated
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            src_updated = browser.page_source
        html_source = browser.page_source
        soup = BeautifulSoup(html_source)
        raw_topics = soup.find_all(attrs={"class":"topic_name"})
        #print raw_topics
        # Split to get just child topics
        split_html = html_source.split('<strong>Child Topics</strong>')
        if (len(split_html) == 1):
            # No child topics on this page
            browser.quit()
        else:
            split_html = split_html[1]
            # Split to separate child topics
            split_child = split_html.split('<div class="topic_list_item"')
            child_count = 0
            for i in range(1, len(split_child)):
                part = split_child[i].split('class="light"')[0]
                part_soup = BeautifulSoup(part)
                for link in part_soup.find_all('a', href=True):
                    link_url = "http://www.quora.com" + link['href'] + "?share=1"
                    urls_to_visit.append([link_url, depth])
                    child_count += 1
            browser.quit()
            # Descending: append this page to the hierarchy string
            if (topic_names_hierarchy):
                topic_names_hierarchy += " " + page_name
            else:
                topic_names_hierarchy += page_name
            if (DEBUG): print "Links read: " + str(child_count)
    # File cleanup
    file_topic_names.close()
    file_topic_urls.close()
    return urls_visited
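
# crawlTopicHierarchy() returns a list of [url, page_name] pairs; this is the
# topic_urls argument consumed by crawlTopicQuestions() below.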

# Crawl each topic URL and save each question URL
def crawlTopicQuestions(topic_urls):
    if (DEBUG): print "In crawlTopicQuestions()...", topic_urls
    # Open each topic page and save all question URLs
    file_question_urls = open("question_urls.txt", mode = 'w')
    file_topic_urls = open("topic_urls.txt", mode = 'r')
    total = 0
    for topic in range(len(topic_urls)):
        current_url = topic_urls[topic][0]
        current_topic = topic_urls[topic][1]
        if (not current_url): # Needed?
            break
        # Open browser
        chromedriver = "chromedriver" # Needed?
        os.environ["webdriver.chrome.driver"] = chromedriver # Needed?
        browser = webdriver.Chrome()
        browser.get(current_url)
        # Fetch current page, scrolling until the page source stops changing
        #fw = open("page", mode = 'w')
        src_updated = browser.page_source
        src = ""
        while src != src_updated:
            time.sleep(.5)
            src = src_updated
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            src_updated = browser.page_source
        html_source = browser.page_source
        #fw.write(html_source.encode('utf8'))
        #fw.close()
        browser.quit()
        # Question links sit inside <h3> blocks on the topic page
        split_html = html_source.split("<h3>")
        for i in range(1, len(split_html)):
            part = split_html[i].split('</h3>')[0]
            part_soup = BeautifulSoup(part)
            if "<div" in part:
                #print part_soup.get_text()
                for link in part_soup.find_all('a', href=True):
                    link_url = "http://www.quora.com" + link['href'] + "?share=1"
                    file_question_urls.write((link_url + " " + current_topic + '\n').encode('utf-8'))
                    total += 1
    # File cleanup (question_urls.txt is read again by crawlQuestionData)
    file_question_urls.close()
    file_topic_urls.close()
    print "Total questions:{0}".format(str(total))
    return 0
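
# Each line of question_urls.txt is "<question_url> <topic_page_name>", which
# crawlQuestionData() below splits on the first space.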

# Crawl a question URL and save data into a csv file
def crawlQuestionData(file):
    if (DEBUG): print ("In crawlQuestionData...")
    # Open question url file
    file_question_urls = open(file, mode = 'r')
    file_data = open("answers.csv", mode = 'w')
    file_users = open("users.txt", mode = 'w')
    current_line = file_question_urls.readline()
    while (current_line):
        if (DEBUG): print "***", current_line
        question_id = current_line.split(" ")[0]
        current_topic = current_line.split(" ")[1].rstrip('\n')
        if (DEBUG): print question_id, "-", current_topic
        # Open browser to current_question_url
        chromedriver = "chromedriver" # Needed?
        os.environ["webdriver.chrome.driver"] = chromedriver # Needed?
        browser = webdriver.Chrome()
        browser.get(question_id)
        # Fetch page
        src_updated = browser.page_source
        src = ""
        while src != src_updated:
            time.sleep(.5)
            src = src_updated
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            src_updated = browser.page_source
        # Load "more" of the users who upvoted
        more_link = browser.find_elements_by_partial_link_text("ore")
        #if (DEBUG): print more_link
        if (DEBUG): print "Number of clicks:", len(more_link)
        for each in more_link:
            if (DEBUG): print "Click on:", each
            each.click()
            time.sleep(.5)
        html_source = browser.page_source
        browser.quit()
        # Find topics tagged on question
        topic_string = ""
        topic_list_soup = BeautifulSoup(html_source)
        topic_raw = topic_list_soup.find_all(attrs={"class":"topic_list_item"})
        for x in range(len(topic_raw)):
            #if (DEBUG): print topic_raw[x]
            topic_raw_soup = BeautifulSoup(str(topic_raw[x]))
            for topic in topic_raw_soup.find_all('a', href=True):
                if (topic_string):
                    topic_string += ", " + topic['href'].split("/")[1]
                else:
                    topic_string += topic['href'].split("/")[1]
        topic_string = "{{{" + topic_string + "}}}"
        if (DEBUG): print "Topic List:{0}".format(topic_string)
        # Find question text
        question_text = html_source.split("<h1>")[1]
        question_text = question_text.split("</h1>")[0]
        question_text = question_text.split(">")
        question_text = "{{{" + question_text[len(question_text)-1] + "}}}"
        if (DEBUG): print "Question text:{0}".format(question_text)
        # Split html to parts
        split_html = html_source.split('<div class="e_col w5 answer_border answer_text_wrapper">')
        if (DEBUG): print "Length of split_html:{0}".format(len(split_html))
        for i in range(1, len(split_html)):
            part = split_html[i]
            if (DEBUG): print part
            part_soup = BeautifulSoup(part)
            # Find number of upvotes
            upvote = part_soup.find(attrs={"class":"numbers"})
            upvote = (str(upvote).split("</span")[0]).split(">")[1]
            if (DEBUG): print "Upvote:{0}".format(upvote)
            # Find user id
            user_id_raw = part_soup.find(attrs={"class":"answer_user_wrapper"})
            user_id_soup = BeautifulSoup(str(user_id_raw))
            user_id = user_id_soup.find('a', href=True)
            if (not user_id):
                continue
            user_id = "http://www.quora.com" + user_id['href'] + "?share=1"
            file_users.write((user_id + '\n').encode('utf8'))
            if (DEBUG): print user_id
            # Set answer id as question url + user id
            answer_id = question_id + "-" + user_id
            # Find users (user id) who voted
            users_voted = ""
            users_voted_raw = part_soup.find_all(attrs={"class":"user"})
            #if (DEBUG): print users_voted_raw
            for x in range(1, len(users_voted_raw)):
                users_voted_soup = BeautifulSoup(str(users_voted_raw[x]))
                for user in users_voted_soup.find_all('a', href=True):
                    if (users_voted):
                        users_voted += ", " + "http://www.quora.com" + user['href'] + "?share=1"
                    else:
                        users_voted += "http://www.quora.com" + user['href'] + "?share=1"
                    file_users.write(("http://www.quora.com" + user['href'] + "?share=1" + '\n').encode('utf-8'))
            users_voted = "{{{" + users_voted + "}}}"
            # Find answer text
            answer_text = ""
            answer_text = part_soup.find(attrs={"class":"answer_content"}).text
            answer_text = answer_text.split("Embed Quote")[0]
            if (DEBUG): print answer_text
            answer_text = "{{{" + answer_text + "}}}"
            # Find date
            date = ""
            date = part_soup.find(attrs={"class":"answer_permalink"}).text
            if (DEBUG): print "Date:", date
            # Write to csv file
            s = answer_id + ", " + question_id + ", " + user_id + ", " + str(date) + ", " + str(upvote) + ", " + users_voted + ", " + topic_string + ", " + current_topic + ", " + question_text + ", " + answer_text
            file_data.write((s + '\n').encode('utf8'))
        current_line = file_question_urls.readline()
    file_question_urls.close()
    file_data.close()
    file_users.close()
    return 0

# Gather user data and save into csv file
def crawlUser():
    if (DEBUG): print "In crawlUser..."
    # Deduplicate the collected user URLs
    unique_users = set(open("users.txt").readlines())
    open('temp.txt', 'w').writelines(unique_users)
    file_users = open("temp.txt", mode='r')
    file_users_csv = open("users.csv", mode='w')
    total = 0
    current_line = file_users.readline()
    while (current_line):
        current_line = current_line.strip() # readline() keeps the trailing newline
        # Open browser to the current user URL
        chromedriver = "chromedriver" # Needed?
        os.environ["webdriver.chrome.driver"] = chromedriver # Needed?
        browser = webdriver.Chrome()
        browser.get(current_line)
        # Fetch page, scrolling until the page source stops changing
        src_updated = browser.page_source
        src = ""
        while src != src_updated:
            time.sleep(.5)
            src = src_updated
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            src_updated = browser.page_source
        # Find user id
        user_id = browser.current_url
        html_source = browser.page_source
        browser.quit()
        # Parse the profile counters (Topics, Blogs, Questions, Answers, Edits)
        source_soup = BeautifulSoup(html_source)
        part = source_soup.find_all(attrs={"class":"link_label"})
        part_soup = BeautifulSoup(str(part))
        raw_info = part_soup.text.split(",")
        if (DEBUG): print raw_info
        for x in range(1, len(raw_info)):
            #if (DEBUG): print raw_info[x]
            key = raw_info[x].split(" ")[1]
            value = raw_info[x].split(" ")[2]
            if key == "Topics":
                num_topics = value
                if (DEBUG): print "num_topics:", num_topics
            elif key == "Blogs":
                num_blogs = value
                if (DEBUG): print "num_blogs:", num_blogs
            elif key == "Questions":
                num_questions = value
                if (DEBUG): print "num_questions:", num_questions
            elif key == "Answers":
                num_answers = value
                if (DEBUG): print "num_answers:", num_answers
            elif key == "Edits":
                value = value.split("]")[0]
                num_edits = value
                if (DEBUG): print "num_edits:", num_edits
        # Find followers
        followers_url = user_id.split('?')[0] + "/followers?share=1"
        browser = webdriver.Chrome()
        browser.get(followers_url)
        src_updated = browser.page_source
        src = ""
        while src != src_updated:
            time.sleep(.5)
            src = src_updated
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            src_updated = browser.page_source
        followers_html_source = browser.page_source
        browser.quit()
        followers_soup = BeautifulSoup(followers_html_source)
        followers_raw = followers_soup.find_all(attrs={"class":"user"})
        if (DEBUG): print "num of followers:", len(followers_raw)
        followers = ""
        count = 0
        for x in range(1, len(followers_raw)):
            followers_soup = BeautifulSoup(str(followers_raw[x]))
            for follower in followers_soup.find_all('a', href=True):
                count += 1
                if (followers):
                    followers += ", " + "http://www.quora.com" + follower['href'] + "?share=1"
                else:
                    followers += "http://www.quora.com" + follower['href'] + "?share=1"
        if (DEBUG): print "Followers count:", count
        followers = "{{{" + followers + "}}}"
        # Find following
        following_url = user_id.split('?')[0] + "/following?share=1"
        browser = webdriver.Chrome()
        browser.get(following_url)
        src_updated = browser.page_source
        src = ""
        while src != src_updated:
            time.sleep(.5)
            src = src_updated
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            src_updated = browser.page_source
        following_html_source = browser.page_source
        browser.quit()
        following_soup = BeautifulSoup(following_html_source)
        following_raw = following_soup.find_all(attrs={"class":"user"})
        if (DEBUG): print "num of following:", len(following_raw)
        following = ""
        count = 0
        for x in range(1, len(following_raw)):
            following_soup = BeautifulSoup(str(following_raw[x]))
            for each_following in following_soup.find_all('a', href=True):
                count += 1
                if (following):
                    following += ", " + "http://www.quora.com" + each_following['href'] + "?share=1"
                else:
                    following += "http://www.quora.com" + each_following['href'] + "?share=1"
        if (DEBUG): print "Following count:", count
        following = "{{{" + following + "}}}"
        # Write one user record to the csv file
        s = user_id + ", " + str(num_topics) + ", " + str(num_blogs) + ", " + str(num_questions) + ", " + str(num_answers) + ", " + str(num_edits) + ", " + followers + ", " + following
        if (DEBUG): print s
        file_users_csv.write((s + '\n').encode('utf8'))
        current_line = file_users.readline()
        total += 1
    # File cleanup
    file_users.close()
    file_users_csv.close()
    print "Total users:{0}".format(str(total))
    return 0
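
# Row layout of users.csv as written by crawlUser() above:
#   user_id, num_topics, num_blogs, num_questions, num_answers, num_edits,
#   {{{follower urls}}}, {{{following urls}}}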

# Reads a line in the users.csv format and returns the fields as separate variables
def parseUsersFile(line):
    parts = line.split(',', 6)
    user_id = parts[0]
    number_of_upvotes = parts[1] # NOTE: crawlUser() writes the topics count in this column
    number_of_blogs = parts[2]
    number_of_questions = parts[3]
    number_of_answers = parts[4]
    number_of_edits = parts[5]
    rest = parts[6]
    followers = rest.split('}}}', 2)[0].split('{{{')[1]
    following = rest.split('}}}', 2)[1].split('{{{')[1]
    return user_id, number_of_upvotes, number_of_blogs, number_of_questions, number_of_answers, number_of_edits, followers, following
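
# Row layout of answers.csv as written by crawlQuestionData() above:
#   answer_id, question_id, user_id, date, num_upvotes, {{{users who upvoted}}},
#   {{{question topics}}}, crawled topic, {{{question text}}}, {{{answer text}}}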

# Reads a line in the answers.csv format and returns the fields as separate variables
def parseAnswersFile(line):
    parts = line.split(',', 5)
    answer_id = parts[0]
    question_id = parts[1]
    user_id = parts[2]
    date = parts[3]
    number_of_upvotes = parts[4]
    rest = parts[5]
    users_who_upvoted = (rest.split('}}}')[0]).split('{{{')[1]
    topics = (rest.split('}}}', 3)[1]).split('{{{')[1]
    if (DEBUG): print topics
    current_topics = (rest.split('}}}', 3)[2]).split(',', 2)[1].split(',', 2)[0]
    if (DEBUG): print current_topics
    question_text = (rest.split('}}}', 4)[2]).split('{{{')[1]
    if (DEBUG): print question_text
    answer_text = (rest.split('}}}', 5)[3]).split('{{{')[1]
    if (DEBUG): print answer_text
    return answer_id, question_id, user_id, number_of_upvotes, users_who_upvoted, topics, current_topics, question_text, answer_text

def main():
    topics = crawlTopicHierarchy()
    crawlTopicQuestions(topics)
    crawlQuestionData("question_urls.txt")
    crawlUser()
    return 0

if __name__ == "__main__": main()
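
# Example (hypothetical) of reusing the parsers on the output of an earlier
# crawl without re-running the crawler:
#
#   for line in open("answers.csv"):
#       fields = parseAnswersFile(line)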