-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_tweets.py
106 lines (85 loc) · 3.45 KB
/
get_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import simplejson
from twisted.web import client
from twisted.internet import reactor
import datetime
import base64
from ConfigParser import ConfigParser
# Module-wide settings, loaded once at import time from CONFIG.DAT in the
# current working directory. The [twitter] section must provide the
# "username" and "password" options read by GetStream.
# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
# CONFIG.DAT only shows up later as an error from config.get().
config = ConfigParser()
config.read("CONFIG.DAT")
class GetStream:
    """Read the Twitter sample stream and tally hashtags into a Stats object.

    The instance hands itself to ``client.downloadPage`` as the output file,
    so it provides the file-like methods ``open``/``write``/``read``/``close``;
    every ``write`` forwards the received data to ``process``.
    """

    # Credentials come from the [twitter] section of the module-level config.
    username = config.get("twitter", "username")
    password = config.get("twitter", "password")

    def __init__(self):
        # Build all state *before* opening the connection so that no stream
        # callback can ever see a half-initialised instance.
        self.chunk = ""          # trailing, not yet complete piece of the stream
        self.stats = Stats()
        self.start()

    def __makeAuthHeader(self, headers=None):
        """Return ``headers`` with an HTTP Basic ``Authorization`` entry added.

        A dict passed by the caller is updated in place and returned; when
        none is given a fresh dict is created (no shared mutable default).
        """
        if headers is None:
            headers = {}
        # b64encode never inserts line breaks, unlike encodestring, which
        # wraps its output every 76 characters and would corrupt the header
        # for long credentials.
        authorization = base64.b64encode('%s:%s'
                                         % (self.username, self.password))
        headers['Authorization'] = "Basic %s" % authorization
        return headers

    def start(self):
        """Open the streaming request; ``stopped`` runs when it ends or fails."""
        client.downloadPage("http://stream.twitter.com/1/statuses/sample.json",
                            self,  # used as the output file
                            headers=self.__makeAuthHeader()
                            ).addBoth(self.stopped)

    def stopped(self, data):
        """Drop any partial data and schedule a reconnect in 10 seconds.

        ``data`` is whatever the deferred delivers (result or failure); it is
        ignored.
        """
        self.chunk = ""
        reactor.callLater(10.0, self.start)

    # --- minimal file-like interface -------------------------------------
    def write(self, b):
        self.process(b)

    def close(self):
        pass

    def open(self):
        pass

    def read(self):
        return None

    def process(self, s):
        """Parse the statuses contained in ``s`` and update ``self.stats``.

        ``s`` is a raw piece of the stream. Statuses are separated by '\\r';
        the text after the last separator is kept in ``self.chunk`` and
        prepended to the next call, so a status split across two writes is
        still parsed as a whole.
        """
        statuses = s.split('\r')
        statuses[0] = self.chunk + statuses[0]
        self.chunk = statuses[-1]
        for status_json in statuses[:-1]:
            # Whitespace-only segments (e.g. blank lines between statuses)
            # hold no JSON; skip them instead of triggering the error dump.
            if not status_json.strip():
                continue
            try:
                status = simplejson.loads(status_json)
                # Rate-limit and deletion notices are not tweets.
                if 'limit' in status or 'delete' in status:
                    continue
                self.stats.nb_of_tweets += 1
                text = safe_str(status["text"])
                if '#' not in text:
                    continue
                # "entities" may be absent or null; treat both as empty.
                entities = status.get("entities") or {}
                hashtags = [safe_str(e.get("text")).lower()
                            for e in entities.get("hashtags", [])
                            if "text" in e]
                seen_at = datetime.datetime.utcnow()
                for hashtag in hashtags:
                    counts = self.stats._hashtags
                    counts[hashtag] = counts.get(hashtag, 0) + 1
                    # Newest timestamp goes first; Stats.clean_up() expires
                    # entries from the end of the list.
                    timeline = self.stats._hashtags_timeline.setdefault(
                        hashtag, [])
                    timeline.insert(0, seen_at)
            except Exception as e:
                # Best effort: one bad status must not stop the stream, so
                # report it together with the offending payload and go on.
                print(50 * "*")
                print(e)
                print(50 * "-")
                print(status_json)
                print(50 * "*")
        # Expire old hashtag entries at most once per minute.
        if (datetime.datetime.utcnow() - self.stats.last_cleanup_time
                > datetime.timedelta(minutes=1)):
            self.stats.clean_up()
class Stats:
    """Running hashtag statistics.

    Attributes:
        nb_of_tweets: number of tweets counted so far.
        _hashtags: maps hashtag -> number of tracked occurrences.
        _hashtags_timeline: maps hashtag -> list of UTC datetimes, one per
            occurrence; clean_up() treats the END of each list as the
            oldest entry.
        last_cleanup_time: UTC datetime of the most recent clean_up() run
            (initially the creation time).
    """

    def __init__(self):
        self.nb_of_tweets = 0
        self._hashtags = dict()
        self._hashtags_timeline = dict()
        self.last_cleanup_time = datetime.datetime.utcnow()

    def clean_up(self, max_age=datetime.timedelta(hours=1)):
        """Forget occurrences older than ``max_age`` and refresh the counts.

        For every hashtag, timestamps are removed from the end of its
        timeline while they are strictly older than ``max_age``. The count
        in ``_hashtags`` is then set to the number of remaining timestamps;
        hashtags with none left are removed from both dicts.

        Args:
            max_age: ``datetime.timedelta`` retention window (default 1 hour).
        """
        now = datetime.datetime.utcnow()
        # Iterate over a snapshot: entries are deleted inside the loop, which
        # is an error when iterating a live dict view.
        for hashtag, timeline in list(self._hashtags_timeline.items()):
            while timeline and now - timeline[-1] > max_age:
                timeline.pop()
            if timeline:
                self._hashtags[hashtag] = len(timeline)
            else:
                # pop() with a default tolerates a key that is already gone.
                self._hashtags.pop(hashtag, None)
                self._hashtags_timeline.pop(hashtag, None)
        self.last_cleanup_time = now
def safe_str(obj):
    """Convert obj to a byte string, escaping it when str() cannot encode it."""
    try:
        converted = str(obj)
    except UnicodeEncodeError:
        # str() could not encode the text with the default codec, so fall
        # back to its backslash-escaped, ASCII-only representation.
        converted = unicode(obj).encode('unicode_escape')
    return converted