'''
Crawler application for social media.
Executable part of the whole project. Passing the name of a social medium as a parameter starts a process for it.
Exactly one medium is demanded, because only one social medium can be explored per execution. The second parameter
is a unique identifier for the running instance. The last parameter, 'init', is optional and indicates that the
application should build the database from scratch.
Args:
- facebook, twitter, youtube
- unique identifier for the running instance
- init (optional)
Example:
$ python graph_retrieving.py facebook <identifier> (without building a new database)
$ python graph_retrieving.py facebook <identifier> init (building a new one)
'''
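# A hedged sketch of the config/config.yml layout this module expects, inferred
# from the keys read in BFS_retrieving below; the values are placeholders, not
# the project's real configuration:
#
#   facebook:
#       access_token: "<token>"
#       url: "<'%s'-style url template>"
#   youtube:
#       key: "<api key>"
#       url: "<'%s'-style url template>"
#   twitter:
#       Authentication:
#           consumer_key: "<key>"
#           consumer_secret: "<secret>"
#           access_token_key: "<token key>"
#           access_token_secret: "<token secret>"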
import urllib, json, yaml, twitter
import fifo, sys, Queue
import guess_language, databases
class BFS_retrieving:
    '''Implements the BFS graph algorithm and its helper methods in order to collect data from the specified social medium.
    Correctly executed, the class methods yield a large amount of data stored in the specified database.
'''
def __init__(self, channel, identifier, init=None):
        '''Loads the configuration file and initialises the databases that store output data, plus the queue of page ids.
        :param channel: channel specified in the command line arguments
        :param identifier: current process name
        :param init: None by default. Tells whether the database has to be built from scratch.
        '''
config = self.load_config_file()
self.channel = channel
if init is None:
self.manage_data = databases.data(config, self.channel)
self.fifo = fifo.Fifo_queue(config, self.channel, identifier)
else:
self.manage_data = databases.data(config, self.channel, init)
self.fifo = fifo.Fifo_queue(config, self.channel, identifier, init)
if channel == 'facebook':
self.set_access_token(config[channel]['access_token'])
self.url = config[channel]['url']
elif channel == 'youtube':
self.app_key = config[channel]['key']
self.url = config[channel]['url']
elif channel == 'twitter':
self.authentication(config)
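    # A minimal construction sketch (hedged; assumes a valid config file and that
    # the project's databases/fifo backends are reachable, and 'crawler-1' is just
    # a hypothetical instance name):
    #
    #   b = BFS_retrieving('twitter', 'crawler-1')          # reuse the existing database
    #   b = BFS_retrieving('twitter', 'crawler-1', 'init')  # rebuild it from scratch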
    def load_config_file(self):
        '''Loads the YAML configuration file, trying config.yml first and config.yaml as a fallback.'''
        try:
            stream = open('config/config.yml', 'r')
        except IOError:
            try:
                stream = open('config/config.yaml', 'r')
            except IOError as e:
                print e
                raise
        return yaml.load(stream)
    def authentication(self, config):
        '''Performs the authentication used by future requests.
        Args:
            config (dict): loaded configuration file
        '''
        try:
            self.api = twitter.Api(consumer_key=config[self.channel]['Authentication']['consumer_key'],
                                   consumer_secret=config[self.channel]['Authentication']['consumer_secret'],
                                   access_token_key=config[self.channel]['Authentication']['access_token_key'],
                                   access_token_secret=config[self.channel]['Authentication']['access_token_secret'],
                                   sleep_on_rate_limit=True)
        except twitter.TwitterError as e:
            print e
    def get_followers(self, id, cursor):
        '''Retrieves one page of up to 200 accounts from twitter with their data and
        pagination cursors. Note that GetFriendsPaged returns the accounts the user
        follows ("friends"), not the user's followers.
        Args:
            id (int): specific id of the account
            cursor (int): starting point of the retrieval
        Returns:
            tuple: index 0 is the next cursor, index 1 the previous cursor,
            index 2 a list of up to 200 user objects
        '''
        connection = True
        while connection:
            try:
                self.followers = self.api.GetFriendsPaged(user_id=id, cursor=cursor)
                connection = False
            except OSError:
                connection = True
        return self.followers
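    # A hedged sketch of how this pages through an account: python-twitter's
    # cursoring starts at -1 and marks the last page with 0, which is how
    # retrieve_twitter below drives it (123456 is a made-up user id):
    #
    #   cursor = -1
    #   while cursor != 0:
    #       cursor, prev_cursor, users = b.get_followers(123456, cursor)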
    def get_user_lookup(self, follower):
        '''Extracts useful data about a single user retrieved from twitter.
        Args:
            follower: a twitter user object
        '''
        self.user = follower.AsDict()
        self.followers_count = self.user.get('followers_count')  # accounts following this user
        self.friends_count = self.user.get('friends_count')      # accounts this user follows
        self.id = self.user['id']
        self.verified = self.user.get('verified')
        self.location = self.user.get('location')
        if 'status' in self.user and 'urls' in self.user['status'] and len(self.user['status']['urls']) != 0:
            self.expanded_url = self.user['status']['urls'][0]['expanded_url']
        else:
            self.expanded_url = None
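    # For reference, the fields consumed from follower.AsDict() above form roughly
    # this shape (a hedged sketch with made-up values, not real api output):
    #
    #   {'id': 123, 'followers_count': 1500, 'friends_count': 300, 'verified': False,
    #    'location': 'Warsaw', 'status': {'urls': [{'expanded_url': 'http://example.com'}]}}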
    def get_url(self, id=None):
        '''Creates the URL address used to make a request to the facebook graph api or the google api.
        Args:
            id (int, optional): None by default. Specific id of the currently retrieved page or youtube channel.
        Returns:
            url (string): URL address necessary to make the api request.
        '''
        if self.channel == 'facebook':
            if id is None:
                url = self.url % ('sotrender', self.access_token)
            else:
                self.id = id
                url = self.url % (str(id), self.access_token)
            url = str(url)
            print url
            return url
        elif self.channel == 'youtube':
            url = self.url % (id, self.app_key)
            url = str(url)
            print url
            return url
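    # Hedged examples of what the '%s'-style templates substituted above might look
    # like; these exact endpoints are assumptions, not taken from the project's config:
    #
    #   facebook: "https://graph.facebook.com/%s/likes?access_token=%s"
    #   youtube:  "https://www.googleapis.com/youtube/v3/channels?id=%s&key=%s&part=brandingSettings"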
def set_access_token(self, access_token):
self.access_token = access_token
    def get_json(self, url):
        '''Makes a request to the facebook graph api or the youtube api and loads the returned json into memory.
        Args:
            url (string): address to make the request on
        '''
        data = urllib.urlopen(url)
        self.json = json.load(data)
        if self.json.get('error'):
            print 'Probably your access token is invalid, trying one more time'
            self.get_json(self.get_url(self.id))
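    # The 'error' key checked above corresponds to the graph api's failure body,
    # roughly of this form (hedged; field names recalled from the api docs):
    #
    #   {"error": {"message": "...", "type": "OAuthException", "code": 190}}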
    def check_conditions(self):
        '''Checks whether the given user meets the demanded conditions (twitter only).
        Returns:
            bool: True if the conditions are met, False otherwise
        '''
        if self.verified is True:
            return True
        elif self.followers_count > 1000 and self.expanded_url is not None and self.location is not None:
            return True
        else:
            return False
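    # A hedged illustration of the rule above with made-up users:
    #
    #   verified=True,  followers_count=10                             -> kept
    #   verified=False, followers_count=5000, url and location present -> kept
    #   verified=False, followers_count=5000, url missing              -> skipped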
    def retrieve(self, id=None):
        '''In the facebook case, retrieving is based on the ids of particular pages.
        For each page included in the data loaded from the json file the following steps are performed:
            check if that page is already in the database,
            (if not) put its id into the queue,
            check the language of the page,
            insert it into the output database.
        In the twitter case, retrieving is based on the ids of particular accounts that meet certain conditions.
        In the youtube case, retrieving is based on the featured channels' url ids.
        For each channel included in the data loaded from the json file the following steps are performed:
            check if the channel is already in the database,
            (if not)
                check whether the channel contains a non-empty 'featuredChannelsUrls' parameter,
                (if yes) add all of its entries into the queue and insert the channel into the output database,
                (if not) insert the channel into the database.
        Args:
            id (string): specific identifier of the account
        '''
        if self.channel == "facebook":
            self.retrieve_facebook()
        elif self.channel == "twitter":
            self.retrieve_twitter(id)
        elif self.channel == "youtube":
            self.retrieve_youtube()
    def retrieve_facebook(self):
        self.data = self.json['data']
        for page in self.data:
            name = page['name']
            id = page['id']
            check = self.manage_data.check_item(id)
            if check is not None:
                print "I've been here"
                continue
            self.fifo.put(id)
            about = page.get('about')
            description = page.get('description')
            language = self.language_check(name, about, description)
            self.manage_data.insert_into(page, language)
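    # The facebook listing walked above looks roughly like this (hedged sketch;
    # only the fields actually read are shown):
    #
    #   {"data": [{"id": "...", "name": "...", "about": "...", "description": "..."}]}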
def retrieve_youtube(self):
items = self.json['items']
for item in items:
check = self.manage_data.check_item(item['id'])
if check is not None:
print "I've been here"
break
else:
if 'featuredChannelsUrls' in item['brandingSettings']['channel']:
featuredChannelsUrls = item['brandingSettings']['channel']['featuredChannelsUrls']
for url in featuredChannelsUrls:
self.fifo.put(url)
self.manage_data.insert_into(item)
else:
print 'empty parameter featuredChannelsUrls'
self.manage_data.insert_into(item)
break
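    # The youtube response consumed above looks roughly like this (hedged sketch;
    # only the fields actually read are shown):
    #
    #   {"items": [{"id": "...",
    #               "brandingSettings": {"channel": {"featuredChannelsUrls": ["..."]}}}]}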
    def retrieve_twitter(self, id):
        next_cursor = -1
        while next_cursor != 0:
            try:
                print "retrieving users from twitter"
                followers_list = self.get_followers(id=id, cursor=next_cursor)
                next_cursor = followers_list[0]
                for follower in followers_list[2]:
                    self.get_user_lookup(follower)
                    check = self.manage_data.check_item(self.id)
                    if check is not None:
                        print "I've been here"
                    else:
                        if self.check_conditions():
                            # print "Conditions met, user added to database"
                            self.fifo.put(self.id)
                            self.manage_data.insert_into(self.user)
                        else:
                            print "Conditions not met"
            except twitter.TwitterError as e:
                print e
    def language_check(self, name, about=None, description=None):
        '''Guesses the language of each page, based primarily on the two available
        parameters 'about' and 'description' pulled from the json file. If neither
        of them is included, the guessing operates on the page name.
        Args:
            name (string): page name
            about (string, optional): None by default. Information about the page.
            description (string, optional): None by default. A different source of information about the page.
        Returns:
            lang (string): guessed language
        '''
        if about is not None and description is not None:
            text = about + " " + description
        elif description is not None:
            text = description
        elif about is not None:
            text = about
        else:
            text = name
        try:
            lang = guess_language.guessLanguage(text)
        except IndexError:
            lang = None
        return lang
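    # A hedged illustration of the guess_language call above; the return values are
    # recalled from the library's behaviour and should be treated as assumptions:
    #
    #   guess_language.guessLanguage(u'To jest strona o programowaniu')  # -> 'pl'
    #   guess_language.guessLanguage(u'12')                              # -> 'UNKNOWN'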
def run():
    '''Basic function which performs the steps to retrieve all of the wanted data from facebook, twitter or youtube.
    It loops until the fifo queue is empty.
    There are two options to start the process:
    FACEBOOK:
        1. Start from the sotrender node (id of sotrender)
        2. Start from the previously stopped node (page id stored in the queue)
    YOUTUBE:
        1. Start from the 'z dupy' channel node (id of that channel)
        2. Start from the previously stopped node (channel id stored in the queue)
    TWITTER:
        1. Start from the verified_pages account, which gathers all verified accounts on twitter (id of that account)
        2. Start from the previously stopped node (account id)
    '''
    if 'init' in sys.argv:
        b = BFS_retrieving(sys.argv[1], sys.argv[2], 'init')
    else:
        b = BFS_retrieving(sys.argv[1], sys.argv[2])
    next_id = 0
    internal_queue = Queue.Queue()
    if not b.fifo.is_empty():
        while not b.fifo.is_empty():  ## exploring the graph
            tuple_id = b.fifo.get()
            for id in tuple_id:
                internal_queue.put(id[0])
            while not internal_queue.empty():
                last_id = next_id
                next_id = internal_queue.get()
                if next_id != last_id:
                    b.fifo.remove(next_id)
                    if b.channel == "twitter":
                        b.retrieve(next_id)
                    else:
                        b.get_json(b.get_url(id=next_id))
                        b.retrieve()
                    print "current id", next_id
    else:
        if b.channel == "twitter":
            id_verified_pages = '63796828'  # id of the account gathering verified twitter pages
            b.retrieve(id_verified_pages)
        elif b.channel == "youtube":
            id = "UCNhtAn1MT600xiNiX8ErnyQ"  # id of the 'z dupy' channel
            b.get_json(b.get_url(id))
            b.retrieve()
        elif b.channel == "facebook":
            b.get_json(b.get_url())
            b.retrieve()
        while not b.fifo.is_empty():  ## exploring the graph
            tuple_id = b.fifo.get()
            for id in tuple_id:
                internal_queue.put(id[0])
            while not internal_queue.empty():
                last_id = next_id
                next_id = internal_queue.get()
                if next_id != last_id:
                    b.fifo.remove(next_id)
                    if b.channel == "twitter":
                        b.retrieve(next_id)
                    else:
                        b.get_json(b.get_url(id=next_id))
                        b.retrieve()
                    print "current id", next_id
    b.fifo.close()
if __name__ == '__main__':
run()