-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy paththreadparser.py
175 lines (148 loc) · 6.65 KB
/
threadparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
## Thread Parser
## Charles Seife, [email protected]
##
## Takes a threadified flat file and attempts to impute parent-->child relationships
## for use in a directed graph. Again, crude and needs improvement.
import twitterinfrastructure as TI
import datetime
def stringify_list(inlist, delimiter="\t"):
    """Join the items of an iterable into one delimited string.

    Every item is converted with ``str()`` before joining, so the input may
    mix strings, numbers, datetimes, and other printable values.

    Args:
        inlist: Iterable of values to serialize.
        delimiter: Separator placed between consecutive items (default: tab).

    Returns:
        The string forms of the items separated by ``delimiter``; an empty
        string when ``inlist`` is empty.
    """
    # str.join assembles the result in one pass instead of re-allocating a
    # growing string on every ``+=``.
    return delimiter.join(str(item) for item in inlist)
### MAIN BODY ###

# Column separator used when splitting the input rows.
delimiter = "\t"

# File names are built as <user-supplied prefix> + one of these suffixes.
originalfilesuffix = ".threadedlogfile.tsv"
threadedfilesuffix = ".parsedthreadfile.tsv"

# Names of the input columns this script reads.
threadidheader = "threadnum"
tweetidheader = "id_str"
tweetinreplytotidheader = "in_reply_to_status_id_str"
tweetquotingtidheader = "quoted_status_id_str"
tweeternameheader = "screen_name_tweeter"
rtusernameheader = "rtuserscreen_name"
replyusernameheader = "in_reply_to_screen_name"
tweettimeheader = "created_at"
# Same column as threadidheader; both names are referenced by the script.
threadnumheader = "threadnum"

# Names of the columns this script appends to every row.
threadmultheader = "thread_multiplicity"
threadearliestheader = "earliest_thread_timestamp"
threadtimedeltaheader = "thread_age_seconds"
userstreammultiplicityheader = "user_stream_multiplicity"
proximateparentheader = "proximate_parent"

# Per-user column names. The original list split
# "profile_use_background_image_tweeter" into two entries with a stray '","';
# it is restored to a single header here.
userheaderlist = [
    "id_str_tweeter",
    "name_tweeter",
    "screen_name_tweeter",
    "url_tweeter",
    "description_tweeter",
    "protected_tweeter",
    "verified_tweeter",
    "followers_count_tweeter",
    "friends_count_tweeter",
    "listed_count_tweeter",
    "favourites_count_tweeter",
    "statuses_count_tweeter",
    "created_at_tweeter",
    "utc_offset_tweeter",
    "time_zone_tweeter",
    "geo_enabled_tweeter",
    "lang_tweeter",
    "profile_background_color_tweeter",
    "profile_link_color_tweeter",
    "profile_sidebar_border_color_tweeter",
    "profile_sidebar_fill_color_tweeter",
    "profile_use_background_image_tweeter",
    "withheld_in_countries_tweeter",
    "withheld_scope_tweeter",
]

# tweet id -> list of column values for that tweet's row.
statusdict = {}
# NOTE(review): nodedict and edgedict are initialised but never used in the
# visible script; presumably reserved for later graph output -- confirm.
nodedict = {}
edgedict = {}
# Ask for the file prefix and derive the input and output file names from it.
fileprefix = input("What is the file prefix? ")
infilename = fileprefix + originalfilesuffix
outfilename = fileprefix + threadedfilesuffix

# Load every row of the threaded log file into statusdict, keyed by tweet id.
# The context manager closes the file even when a row raises part-way through.
with open(infilename, "r", encoding="utf-8") as infilepointer:
    # The first line is the header; names are lower-cased and trimmed so they
    # match the lower-case header constants used for lookups.
    headerline = infilepointer.readline().strip()
    headerlist = [rawheader.lower().strip() for rawheader in headerline.split(delimiter)]

    for line in infilepointer:
        # NOTE(review): the line terminator is not stripped before splitting,
        # so the last field keeps its "\n" unless TI.ensure_string removes
        # it -- confirm against twitterinfrastructure.
        datalist = [TI.ensure_string(rawdatum) for rawdatum in line.split(delimiter)]
        tweetid = TI.lookup_value(tweetidheader, headerlist, datalist, cast="str")
        # A later row with the same tweet id replaces the earlier one.
        statusdict[tweetid] = datalist
# Calculate thread multiplicities, the earliest timestamp in each thread,
# and user multiplicities.
threadmultdict = {}      # thread id -> number of rows in that thread
threadearliestdict = {}  # thread id -> smallest created_at seen in that thread
usermultdict = {}        # tweeter screen name -> number of rows by that tweeter
for tweetkey, row in statusdict.items():
    threadid = TI.lookup_value(threadidheader, headerlist, row, cast="str")
    tweettime = TI.lookup_value(tweettimeheader, headerlist, row, cast="datetime")
    if threadid not in threadmultdict:
        # First row seen for this thread.
        threadmultdict[threadid] = 1
        threadearliestdict[threadid] = tweettime
    else:
        threadmultdict[threadid] += 1
        if tweettime < threadearliestdict[threadid]:
            threadearliestdict[threadid] = tweettime

    tweetername = TI.lookup_value(tweeternameheader, headerlist, row, cast="str")
    usermultdict[tweetername] = usermultdict.get(tweetername, 0) + 1
# Extend the header with the derived columns, then append the matching
# values to every row in the same order.
headerlist.extend([
    threadmultheader,
    threadearliestheader,
    threadtimedeltaheader,
    userstreammultiplicityheader,
])
for row in statusdict.values():
    threadid = TI.lookup_value(threadidheader, headerlist, row, cast="str")
    row.append(threadmultdict[threadid])
    earliest = threadearliestdict[threadid]
    row.append(earliest)
    tweettime = TI.lookup_value(tweettimeheader, headerlist, row, cast="datetime")
    # Age of this tweet relative to the earliest tweet of its thread, stored
    # as a whole number of seconds in string form.
    ageseconds = int((tweettime - earliest).total_seconds())
    row.append(str(ageseconds))
    tweetername = TI.lookup_value(tweeternameheader, headerlist, row, cast="str")
    row.append(usermultdict[tweetername])

# Free memory: the values now live in the rows themselves.
threadmultdict, threadearliestdict, usermultdict = {}, {}, {}
# Calculate the proximate parent of every status.
# Precedence: reply-to screen name, retweeted user's screen name, author of
# the quoted tweet, author of the replied-to tweet, the tweeter itself (when
# the row is at thread age 0), and finally a per-thread placeholder.
proximateparentdict = {}
for tweetkey, row in statusdict.items():
    tweetername = TI.lookup_value(tweeternameheader, headerlist, row, cast="str").strip()
    irtname = TI.lookup_value(replyusernameheader, headerlist, row, cast="str").strip()
    rtname = TI.lookup_value(rtusernameheader, headerlist, row, cast="str").strip()
    quotedid = TI.lookup_value(tweetquotingtidheader, headerlist, row, cast="str").strip()
    irtid = TI.lookup_value(tweetinreplytotidheader, headerlist, row, cast="str").strip()
    threaddelta = TI.lookup_value(threadtimedeltaheader, headerlist, row, cast="int")

    if irtname:
        parent = irtname
    elif rtname:
        parent = rtname
    elif quotedid or irtid:
        # Only a tweet id is known; the quoted id wins over the reply id.
        referencedid = quotedid if quotedid else irtid
        if referencedid in statusdict:
            # The referenced tweet is in this file, so use its tweeter.
            referencedrow = statusdict[referencedid]
            parent = TI.lookup_value(tweeternameheader, headerlist, referencedrow, cast="str").strip()
        else:
            # Unknown tweet: fall back to a placeholder built from its id.
            parent = "author-" + referencedid
    elif threaddelta == 0:
        parent = tweetername
    else:
        parent = "author-thread-" + TI.lookup_value(threadnumheader, headerlist, row, cast="str")

    proximateparentdict[tweetkey] = parent
# Add the proximate parent to the statuses as one more trailing column.
headerlist.append(proximateparentheader)
for tweetkey, row in statusdict.items():
    row.append(proximateparentdict[tweetkey])

# Free memory: every value has been copied into its row.
proximateparentdict = {}
# Print results to the flat file: the extended header first, then one line
# per status in statusdict's iteration order.
# The context manager flushes and closes the file on exit, including when a
# row fails to serialize; the original only flushed and never closed it.
with open(outfilename, "w", encoding="utf-8") as outfilepointer:
    print(stringify_list(headerlist), file=outfilepointer)
    for status in statusdict.values():
        print(stringify_list(status), file=outfilepointer)