gather.py
#
# authors:
# Paul Galatic
#
# description:
# Helper functions related to the gathering and storage of data.
#

# STD LIB
import os
import re
import csv
import pdb
import sys
import time
import pprint
import shutil
from datetime import datetime, timedelta

# EXTERNAL LIB
import tweepy

# PROJECT LIB
from extern import *

# CONSTANTS
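# NOTE: RAW_DIR, GATHER_FIELDNAMES, GATHER_FILTER, GATHER_MAX_CHARS,
# GATHER_MAX_TWEETS, and log() are not defined in this module; they are
# expected to come from the star import of extern above.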

def already_downloaded():
    '''
    Returns the names of files in RAW_DIR, WITHOUT their extensions. Example:
        raw/
            #ImpeachTrump.csv
            #WednesdayWisdom.csv
        returns: {#ImpeachTrump, #WednesdayWisdom}
    '''
    topics = set()
    for fname in os.listdir(RAW_DIR):
        topics.add(' '.join(os.path.splitext(fname)[0].split('_')))
    return topics

def trending_tweets(api, woeid, num_topics):
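    '''
    Downloads recent tweets for each new trending topic in the region given
    by woeid and writes them to one CSV file per topic under RAW_DIR.

    api         -- an authenticated tweepy API handle
    woeid       -- Yahoo! Where On Earth ID of the region to pull trends from
    num_topics  -- maximum number of new topics to gather
    '''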
    # Create the data folder if it doesn't exist.
    if not os.path.isdir(RAW_DIR):
        os.mkdir(RAW_DIR)

    # Grab all trending topics whose names contain at least one letter of
    # the alphabet (which also excludes empty names) and that have not
    # already been downloaded.
    topics = []
    redundant = already_downloaded()
    for trend in api.trends_place(woeid)[0]['trends']:
        if (
            re.search('[a-zA-Z]', trend['name']) and
            trend['name'] not in redundant
        ):
            topics.append(trend)
    if len(topics) < 1:
        log('WARN: No new trending topics were found.')

    # Process the first N topics.
    for trend in topics[:num_topics]:
        hashtag = trend['name']
        proper_name = '_'.join(hashtag.split(' '))
        log(f'Gathering tweets for {proper_name}...')

        # Search for tweets matching the hashtag.
        total_length = 0
        total_tweets = 0

        # Make a file to store the tweets in.
        fname = str(RAW_DIR / (proper_name + '.csv'))
        with open(fname, 'w+', newline='', encoding='utf-8') as topicfile:
            # Use a cursor to find tweets and a csv writer to record them.
            # Retweets would lead to data duplication, so those are skipped.
            wtr = csv.DictWriter(topicfile, fieldnames=GATHER_FIELDNAMES)
            wtr.writeheader()
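            # Note: Twitter's standard search API only indexes roughly the
            # last seven days of tweets, so restricting `since` to yesterday
            # keeps results recent without shrinking them much.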
            cursor = tweepy.Cursor(
                api.search,
                q=hashtag + GATHER_FILTER,
                count=100,
                lang='en',
                since=datetime.strftime(datetime.now() - timedelta(1), '%Y-%m-%d'),
                tweet_mode='extended'
            )
            try:
                for tweet in cursor.items():
                    tweet = tweet._json
                    # Record the body of the tweet and its metadata.
                    wtr.writerow({
                        'index': total_tweets,
                        'text': tweet['full_text'],
                        'timestamp': tweet['created_at'],
                        'fav_count': tweet['favorite_count'],
                        'ret_count': tweet['retweet_count'],
                        'username': tweet['user']['name'],
                        'at_tag': tweet['user']['screen_name'],
                        'id': tweet['id']
                    })
                    total_length += len(tweet['full_text'])
                    total_tweets += 1
                    # Stop once either per-topic quota is exceeded.
                    if total_length > GATHER_MAX_CHARS or total_tweets > GATHER_MAX_TWEETS:
                        log('...Quota met.')
                        break
            except KeyboardInterrupt:
                # Allow the user to cut a topic short without losing the
                # tweets gathered so far.
                pass

        log(f'stats for {proper_name}')
        log(f'\ttotal_length:\t{total_length}')
        log(f'\ttotal_tweets:\t{total_tweets}')
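
# A minimal usage sketch (not part of the original module), assuming tweepy
# 3.x-style authentication; the key/token strings and WOEID below are
# placeholders, not values from this project.
if __name__ == '__main__':
    auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
    auth.set_access_token('ACCESS_TOKEN', 'ACCESS_SECRET')
    api = tweepy.API(auth, wait_on_rate_limit=True)
    # WOEID 1 requests worldwide trends.
    trending_tweets(api, woeid=1, num_topics=5)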