-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathSQL.py
322 lines (251 loc) · 11.2 KB
/
SQL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
import MySQLdb
import logging
import urllib2
from django.utils.encoding import smart_str, smart_unicode
class SQL:
def __init__(self, host, db_user, db_pass, shorteners_file = None):
#default shorteners
self.shorteners = ['bit.ly', 'goo.gl', 'tiny.cc',
't.co', 'tinyurl.com', 'fb.me']
#load shorteners from file
if shorteners_file is not None:
regex = re.compile(' +|,+|\n')
with open(file) as shorteners_file:
ids = regex.split(shorteners_file.read().rstrip())
ids = filter(None, ids)
self.logger = logging.getLogger('TwitterCollector')
try:
self.db_con = MySQLdb.Connect(host, db_user, db_pass)
self.db_con.autocommit(True)
except Exception, e:
print e, "\nCheck DB pass/user in config file.\nExiting..."
self.logger.error(str(e)+"\nCheck DB pass/user in config file."+
"\nExiting...")
exit(0)
###################################################################################
############################# START insert methods ################################
###################################################################################
def __insert_user(self, tweet):
""" Insert user into db.
Args:
tweet: Tweet object containing parsed tweet
"""
try:
sql = "INSERT INTO users (user_id, created_at) VALUES"\
"(%d, '%s')"\
%(
tweet['user']['user_id'],
tweet['user']['created_at'].strftime('%Y-%m-%d %H:%M:%S')
)
#exec stms
self.cursor.execute(sql)
except Exception, e:
if "or read-only" not in str(e) and "1062" not in str(e):
self.self.print_err('__insert_user', str(e))
def __insert_user_info(self, tweet):
""" Insert user info into db.
Args:
tweet: Tweet object containing parsed tweet
"""
try:
sql = "INSERT INTO user_info VALUES"\
"(%d, '%s', '%s', %d, %d, '%s', '%s', '%s', '%s')"\
%(
tweet['user']['user_id'],
MySQLdb.escape_string(tweet['user']['screen_name'].encode('utf-8').strip()),
MySQLdb.escape_string(tweet['user']['name'].encode('utf-8').strip()),
tweet['user']['followers_count'],
tweet['user']['friends_count'],
MySQLdb.escape_string(tweet['user']['description'].encode('utf-8').strip()),
MySQLdb.escape_string(tweet['user']['image_url'].encode('utf-8').strip()),
tweet['user']['created_at'].strftime('%Y-%m-%d %H:%M:%S'),
MySQLdb.escape_string(tweet['user']['location'].encode('utf-8').strip())
)
self.cursor.execute(sql)
except Exception, e:
if "or read-only" not in str(e) and "1062" not in str(e):
self.print_err('__insert_info', str(e))
def __insert_tweet(self, tweet):
""" Insert tweet into db.
Args:
tweet: Tweet object containing parsed tweet
"""
try:
sql = "INSERT INTO tweets VALUES"\
"(%d, '%s', '%s', %d, %d, %d, '%s', %d, %d)"\
%(
tweet['tweet']['tweet_id'],
MySQLdb.escape_string(tweet['tweet']['tweet_text'].encode('utf-8').strip()),
tweet['tweet']['created_at'].strftime('%Y-%m-%d %H:%M:%S'),
tweet['tweet']['geo_lat'],
tweet['tweet']['geo_long'],
tweet['user']['user_id'],
MySQLdb.escape_string(tweet['tweet']['tweet_url'].encode('utf-8').strip()),
tweet['tweet']['retweet_count'],
tweet['tweet']['original_tweet_id']
)
self.cursor.execute(sql)
except Exception, e:
if "or read-only" not in str(e) and "1062" not in str(e):
self.print_err('__insert_tweet', str(e))
def __insert_hashtags(self, tweet):
""" Insert hashtags into db.
Args:
tweet: Tweet object containing parsed tweet
"""
try:
sql = "INSERT INTO tweet_hashtags VALUES(%d, '%s', '%s', %d)"
for hashtag in tweet['tweet']['hashtags']:
tmp_sql = sql\
%(
tweet['tweet']['tweet_id'],
#MySQLdb.escape_string(hashtag['text']).encode('utf-8').strip(),
MySQLdb.escape_string(smart_str(hashtag['text'])).strip(),
tweet['tweet']['created_at'].strftime('%Y-%m-%d %H:%M:%S'),
tweet['user']['user_id']
)
self.cursor.execute(tmp_sql)
except Exception, e:
if "or read-only" not in str(e) and "1062" not in str(e):
self.print_err('__insert_hashtags', str(e))
def __insert_mentions(self, tweet):
""" Insert mentions info into db.
Args:
tweet: Tweet object containing parsed tweet
"""
sql = "INSERT INTO tweet_mentions VALUES(%d, %d, %d)"
#insert mentions
try:
for mention in tweet['tweet']['mentions']:
tmp_sql = sql\
%(
tweet['tweet']['tweet_id'],
tweet['user']['user_id'],
mention['id']
)
self.cursor.execute(tmp_sql)
except Exception, e:
if "or read-only" not in str(e) and "1062" not in str(e):
self.print_err('__insert_mentions', str(e))
def __insert_urls(self, tweet):
""" Insert urls into db.
Args:
tweet: Tweet object containing parsed tweet
"""
sql = "INSERT INTO tweet_links VALUES(%d, %d, '%s', '%s', '%s')"
#insert urls
try:
for url in tweet['tweet']['urls']:
if url is None or url == '':
continue
tmp_sql = sql\
%(
tweet['tweet']['tweet_id'],
tweet['user']['user_id'],
MySQLdb.escape_string(self.__expand_url(url['expanded_url'])),
MySQLdb.escape_string(url['url']),
tweet['tweet']['created_at'].strftime('%Y-%m-%d %H:%M:%S')
)
self.cursor.execute(tmp_sql)
except Exception, e:
if "or read-only" not in str(e) and "1062" not in str(e):
self.print_err('__insert_urls', str(e))
def __insert_raw_JSON(self, tweet):
if tweet['raw_json'] is not None:
insert_raw_json = "INSERT INTO tweet_json_cache VALUES(%d, \"%s\")"\
%(tweet['tweet']['tweet_id'],
MySQLdb.escape_string(tweet['raw_json']
.encode('utf-8').strip()))
self.cursor.execute(insert_raw_json)
def insert_into(self, db, tweet):
""" Insert tweet info into db.
Args:
db: Name of db to use
tweet: Tweet object containing parsed tweet
"""
try:
#select db
self.db_con.select_db(db)
self.cursor = self.db_con.cursor()
except Exception, e:
self.self.print_err('insert_into', str(e))
self.__insert_user(tweet)
self.__insert_user_info(tweet)
self.__insert_tweet(tweet)
self.__insert_hashtags(tweet)
self.__insert_mentions(tweet)
self.__insert_urls(tweet)
self.__insert_raw_JSON(tweet)
def insert_into_userList(self, list, db):
""" Insert user info into db.
Args:
db: Name of db to use
list: list object containing parsed users and list
"""
if list is None:
return
try:
#select db
self.db_con.select_db(db)
self.cursor = self.db_con.cursor()
sql = "INSERT INTO user_list VALUES(%d, '%s', '%s')"
print 'Inserting users into user list'
self.logger.info('Inserting users into user list')
for subList in list:
for user in subList['users']:
try:
tmpSql = sql\
%(
user.id,
subList['slug'],
subList['owner']
)
self.cursor.execute(tmpSql)
except Exception, e: pass
except Exception, e:
self.print_err('insert_into_userlist', str(e))
###################################################################################
################################ END insert methods ###############################
###################################################################################
def print_err(self, func, msg):
err = "\nDB insert error\n"\
"@ %s IN SQL.py\n"\
"MSG( %s )"%(func, msg)
print err;
self.logger.error(err)
def testDB(self, db):
try:
#select db
self.db_con.select_db(db)
self.cursor = self.db_con.cursor()
#test db
sql = "SELECT * FROM users LIMIT 1"
self.cursor.execute(sql)
except Exception, e:
print e, "\nCheck DB name in config file.\nExiting..."
self.logger.error(str(e)+"\nCheck DB name in config file.\nExiting...")
exit(0)
def __expand_url(self, url):
if url is None:
return "N/A"
url = url.replace("https://", "")
url = url.replace("http://", "")
url = url.replace("www.", "")
#check for known shorteners
isShort = [shortener for shortener in self.shorteners
if shortener.lower() in url.lower()]
if len(isShort) == 0 \
and len(url[:url.index(".")]) > 7:
return url
url = "http://"+url
try:
#send request using short url
req = urllib2.Request(url)
res = urllib2.urlopen(req)
#return final url
url = res.geturl()
if "http://" not in url and "https://" not in url:
url = "http://"+url
return url
except Exception, e:
return "404/Invalid URL(",url,")"