preprocess_dataset.py
# NOTE: This script creates the pre-processed dataset, so it should be run only once!
# The script extends the dataset by adding information about links (URLs) in tweets.
# For every tweet that includes a link (e.g. http://t.co/3ImaomknnA) we reveal
# the original URL (e.g. https://www.facebook.com/walkercotoday/posts/9894095)
# and fetch the HTML title tag from that web page.
# We handle the first 3 links in every tweet (links beyond the third are ignored).
# The processed dataset is an extension of the original dataset, with 9 new
# fields: link_url1, link_uri1, link_title1,
#         link_url2, link_uri2, link_title2,
#         link_url3, link_uri3, link_title3
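#
# Illustrative example (the title value is hypothetical) - for the sample link
# above, the first link's three fields would look like:
#   link_url1   = 'https://www.facebook.com/walkercotoday/posts/9894095'
#   link_uri1   = 'https://www.facebook.com/'
#   link_title1 = whatever that page's <title> tag contains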
import csv
import logging
import urllib2
from urlparse import urlparse

from bs4 import BeautifulSoup
from ttp import ttp
from ttp import utils
# Progressbar and its update_progress() appear to come from a project-local
# helper module rather than the PyPI progressbar package.
from progressbar import Progressbar

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
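
# A minimal sketch of what following a shortlink amounts to (a hypothetical
# fallback, not used below): urllib2 follows HTTP redirects automatically,
# so the final URL can be read back from the opened response.
def _resolve_shortlink_sketch(url):
    """Return the final URL after redirects (illustrative only)."""
    response = urllib2.urlopen(url)
    return response.geturl()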
# ORIGINAL_DATASET_PATH = 'dataset/socialmedia-disaster-tweets-DFE.csv'
# PROCESSED_DATASET_PATH = 'dataset/socialmedia-disaster-tweets-DFE-extended.csv'
# ORIGINAL_DATASET_PATH = 'dataset/recent_tweets_test/chicago_tweets.csv'
# PROCESSED_DATASET_PATH = 'dataset/recent_tweets_test/chicago_tweets-extended.csv'
# ORIGINAL_DATASET_PATH = 'dataset/recent_tweets_test/houston_tweets.csv'
# PROCESSED_DATASET_PATH = 'dataset/recent_tweets_test/houston_tweets-extended.csv'
ORIGINAL_DATASET_PATH = 'dataset/recent_tweets_test/miami_tweets.csv'
PROCESSED_DATASET_PATH = 'dataset/recent_tweets_test/miami_tweets-extended.csv'
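
# A hedged variant of the title-fetching step below (hypothetical helper, not
# wired in): fetching arbitrary pages can hang indefinitely, so a timeout on
# urlopen is safer, and soup.title can be None when a page has no <title> tag.
def _fetch_title_sketch(url, timeout=10):
    """Return the page's <title> text, or None (illustrative only)."""
    soup = BeautifulSoup(urllib2.urlopen(url, timeout=timeout), features='lxml')
    return soup.title.string if soup.title is not None else None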
# Read original dataset
logger.info('Reading from %s' % ORIGINAL_DATASET_PATH)
with open(ORIGINAL_DATASET_PATH, 'rb') as infile:
    orig_dataset = list(csv.DictReader(infile))
total_tweets = len(orig_dataset)
logger.info('%d rows fetched' % total_tweets)
# Write new dataset
logger.info('Writing to %s' % PROCESSED_DATASET_PATH)
pb = Progressbar('Building extended dataset')
with open(PROCESSED_DATASET_PATH, 'wb') as csvfile:
    dataset_keys = orig_dataset[0].keys() + ['link_url1', 'link_uri1', 'link_title1',
                                             'link_url2', 'link_uri2', 'link_title2',
                                             'link_url3', 'link_uri3', 'link_title3']
    csvwriter = csv.DictWriter(csvfile, fieldnames=dataset_keys)
    csvwriter.writeheader()
    for i in xrange(total_tweets):
        pb.update_progress(i, total_tweets)
        tweet = orig_dataset[i]['text']
        try:
            # Step 1 - Check if there are any links in the tweet
            tweet_parser = ttp.Parser().parse(tweet)
            for j in range(1, 4):
                # Step 2 - Follow URL redirections until the final URL is revealed
                if len(tweet_parser.urls) >= j:
                    shortened_url = tweet_parser.urls[j - 1]
                    logger.info('Following URL %s' % shortened_url)
                    redirected_url = utils.follow_shortlink(shortened_url)[-1]
                    orig_dataset[i]['link_url%d' % j] = unicode(redirected_url).encode('utf-8')
                    parsed_uri = urlparse(redirected_url)
                    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                    orig_dataset[i]['link_uri%d' % j] = unicode(domain).encode('utf-8')
                    # Step 3 - Get the HTML title from the URL
                    soup = BeautifulSoup(urllib2.urlopen(redirected_url), features='lxml')
                    orig_dataset[i]['link_title%d' % j] = unicode(soup.title.string).encode('utf-8')
        except Exception:
            # Network and parsing failures are tolerated: the row is written
            # with whichever link fields were filled in before the error.
            logger.exception('Failed to process links for tweet #%d' % i)
        # Step 4 - Write the (possibly extended) record to the new dataset.
        # csv.DictWriter fills any missing link fields with '' (its default restval).
        csvwriter.writerow(orig_dataset[i])
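
# For reference, the URL-handling steps above applied to the header's example
# (illustrative walkthrough):
#   ttp.Parser().parse('... http://t.co/3ImaomknnA').urls
#       -> ['http://t.co/3ImaomknnA']
#   urlparse('https://www.facebook.com/walkercotoday/posts/9894095')
#       -> scheme='https', netloc='www.facebook.com'
#   '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
#       -> 'https://www.facebook.com/'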