forked from jimmoffitt/rbSearchAPI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch_api.rb
296 lines (235 loc) · 12.8 KB
/
search_api.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#A command-line wrapper to the pt_search class.
#Uses the 'optparse' gem for parsing command-line options. For better or worse...
#Makes minute count request, then navigates those results, consolidating minute counts up to 500 for data requests.
#Loads up rules, and loops through them.
#Writes to database or to files.
#Example usage: see README.md
require_relative "./pt_search.rb"
#=======================================================================================================================
if __FILE__ == $0 #This script code is executed when running this file.
require 'optparse'
require 'base64'
#-------------------------------------------------------------------------------------------------------------------
#Example command-lines
#Options:
# Pass in configuration and rules files.
# Pass in everything on command-line.
# Pass in configuration file and all search parameters.
# Pass in configuration parameters and rules file.
#Pass in two files, the Gnip Search API config file and a Rules file.
# $ruby ./search_api.rb -c "./SearchConfig.yaml" -r "./rules/mySearchRules.yaml"
# $ruby ./search_api.rb -c "./SearchConfig.yaml" -r "./rules/mySearchRules.json"
#Typical command-line usage.
# Specifying URL. Passing in ISO formatted dates.
# $ruby ./search_api.rb -a "http://search.gnip.com/" -u user@example.com -pMyPassword -r "rain OR weather (profile_region:colorado)" -s "2013-10-18 06:00" -e "2013-10-20 06:00"
#Get minute counts. Returns JSON time-series of minute, hour, or day counts.
# $ruby ./search_api.rb -l -d "minute"
# -a "http://search.gnip.com/" -u user@example.com -pMyPassword
# -r "rain OR weather (profile_region:colorado)" -s "2013-10-18 06:00" -e "2013-10-20 06:00"
# Passing in configuration file to specify Search API URL, dates as YYYYMMDDHHMM. Passing in search parameters.
#$ruby ./search_api.rb -c "./SearchConfig.yaml"
# -r "rain OR weather (profile_region:colorado)" -s "201310180600" -e "201310200600"
# $ruby ./search_api.rb [-c CONFIG] [-r RULE] [-a ADDRESS] [-n NAME] [-u USERNAME] [-p PASSWORD] [-s START] [-e END] [-t TAG] [-o OUTBOX] [-z] [-l] [-d DURATION] [-m MAXRESULTS] [-b PUBLISHER]
#-------------------------------------------------------------------------------------------------------------------
#Define the command-line switches. Every handler simply stores its argument in a global ($config, $rule, ...)
#that the remainder of this script reads.
#We need either a config file AND a rule parameter (which can be a single rule passed in, or a rules file)
# OR
#100% parameters, with no config file:
# Mandatory: username, password, address/account, rule
# Options: start, end, tag, look, duration, maxResults (defaults to 100), publisher (defaults to Twitter)
option_parser = OptionParser.new do |o|
  #Passing in a config file.... Or you can set a bunch of parameters.
  o.on('-c CONFIG', '--config', "Configuration file (including path) that provides account and download settings.\nConfig files include username, password, account name and stream label/name.") { |config| $config = config }

  #Basic Authentication.
  o.on('-u USERNAME', '--user', 'User name for Basic Authentication. Same credentials used for console.gnip.com.') { |username| $username = username }
  o.on('-p PASSWORD', '--password', 'Password for Basic Authentication. Same credentials used for console.gnip.com.') { |password| $password = password }

  #Search URL, or the account name (plus -n label) it can be derived from.
  o.on('-a ADDRESS', '--address', 'Either Search API URL, or the account name which is used to derive URL.') { |address| $address = address }
  o.on('-n NAME', '--name', "Label/name used for Stream API. Required if account name is supplied on command-line,\nwhich together are used to derive URL.") { |name| $name = name }

  #Period of search.
  o.on('-s START', '--start_date', "UTC timestamp for beginning of Search period.\nSpecified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.") { |start_date| $start_date = start_date }
  o.on('-e END', '--end_date', "UTC timestamp for ending of Search period.\nSpecified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.") { |end_date| $end_date = end_date }

  #Search rule. This can be a single rule "\"this exact phrase\" OR keyword", or a rules file.
  #This switch used to be registered twice; OptionParser keeps only the most recent registration of a
  #switch, so the earlier one was dead code. Only the effective definition is kept, in its original position.
  o.on('-r RULE', '--rule', 'A single rule passed in on command-line, or a file containing multiple rules.') { |rule| $rule = rule }

  #Tag, optional.
  o.on('-t TAG', '--tag', 'Optional. Gets tacked onto payload if included. Alternatively, rules files can contain tags.') { |tag| $tag = tag }

  #File output.
  o.on('-o OUTBOX', '--outbox', 'Optional. Triggers the generation of files and where to write them.') { |outbox| $outbox = outbox }
  o.on('-z', '--zip', 'Optional. If writing files, compress the files with gzip.') { |zip| $zip = zip }

  #Counts-only mode ("look before you leap") and its bucket size.
  o.on('-l', '--look', '"Look before you leap..." Trigger the return of counts only.') { |look| $look = look }
  o.on('-d DURATION', '--duration', "The 'bucket size' for counts, minute, hour (default), or day") { |duration| $duration = duration }

  #Cap on the number of data results (stored exactly as typed, i.e. as a String).
  o.on('-m MAXRESULTS', '--max', 'Specify the maximum amount of data results. 10 to 500, defaults to 100.') { |max_results| $max_results = max_results }

  #Publisher defaults to Twitter.
  o.on('-b PUBLISHER', '--pub', 'Defaults to Twitter, which currently is the only Publisher supported with Search.') { |publisher| $publisher = publisher }

  #Help screen: print the generated usage text and stop.
  o.on('-h', '--help', 'Display this screen.') do
    puts o
    exit
  end
end

#Consume the recognized switches from ARGV, firing the handlers above.
option_parser.parse!
#Create the PtSearch client and start it off with an empty rules array.
oSearch = PtSearch.new
oSearch.rules.rules = []

#If a config file was named on the command-line (-c), hand it to the client to load its settings.
oSearch.get_system_config($config) unless $config.nil?
#Whatever the config file supplied is already on oSearch; now layer the command-line values on top.
#Command-line values win over configuration settings.
#Problems are collected in error_msgs rather than raised, so they can all be reported together.
error_msgs = []

#Authentication, user name ============================================================================================
oSearch.user_name = $username unless $username.nil?

if oSearch.user_name.nil? || oSearch.user_name == ""
  error_msgs << "User name is required. You can pass this in on command-line or specify in configuration file."
end

#Authentication, password =============================================================================================
#A command-line password is stored as-is when the client reports it as already encoded;
#otherwise the plain value is kept and a Base64-encoded copy is stored next to it.
unless $password.nil?
  if oSearch.password_encoded?($password)
    oSearch.password_encoded = $password
  else
    oSearch.password = $password
    oSearch.password_encoded = Base64.encode64(oSearch.password)
  end
end

#NOTE(review): only an empty-string password_encoded counts as "missing" here; a nil value would not
#trigger the error - confirm how PtSearch initializes password_encoded.
plain_password_missing = oSearch.password.nil? || oSearch.password == ""
if plain_password_missing && oSearch.password_encoded == ""
  error_msgs << "Password is required. You can pass this in on command-line or specify in configuration file."
end
#Account name and stream label (name) are both required; a full Search API URL can supply the pair ===================
unless $address.nil?
  if $address.include?("gnip.com")
    #A Search URL of the form https://search.gnip.com/accounts/jim/search/prod.json
    #Account name is the third path segment from the end; label is the file name minus its extension.
    url_segments = $address.split("/")
    oSearch.account_name = url_segments[-3]
    oSearch.label = url_segments[-1].split(".")[-2]
  else
    #Anything without "gnip.com" in it is taken to be a bare account name.
    oSearch.account_name = $address
  end
end

#An explicit -n label overrides whatever was derived from the URL.
oSearch.label = $name unless $name.nil?

#By now both values should be present; record an error for each one that is not.
if oSearch.account_name.nil? || oSearch.account_name == ""
  error_msgs << "Account name is required. You can pass this in on command-line or specify in configuration file."
end

if oSearch.label.nil? || oSearch.label == ""
  error_msgs << "Search label is required. You can pass this in on command-line or specify in configuration file."
end

oSearch.set_http
#At least one rule is required: either a rules file (recognized by a yaml/json extension) or a literal rule.
if $rule.nil?
  error_msgs << "Either a single rule or a rules files is required. "
else
  #Text after the last "." decides whether -r names a rules file.
  extension = $rule.split(".")[-1]

  case extension
  when "yaml", "json"
    oSearch.rules_file = $rule
    #NOTE(review): .json files are also loaded through loadRulesYAML, exactly as before - confirm this is
    #intended (or whether the rules object offers a dedicated JSON loader).
    oSearch.rules.loadRulesYAML(oSearch.rules_file)
  else
    #Not a rules file, so treat the whole argument as one rule value.
    oSearch.rules.rules << { "value" => $rule }
  end
end
#Everything else is optional or can be driven by defaults.
#Tag is completely optional. When supplied it is attached to the first rule only.
#Guard against an empty rules array (for example -t given without -r): indexing [0] yields nil there, and
#assigning nil["tag"] would raise NoMethodError before the errors collected in error_msgs are reported.
unless $tag.nil?
  first_rule = oSearch.rules.rules[0]
  first_rule["tag"] = $tag unless first_rule.nil?
end
#Look is optional.
#Duration is optional; when it is not given, $duration stays nil and is passed along untouched.
#Only "minute", "hour" or "day" are accepted - anything else is replaced by 'minute' with a warning.
valid_durations = ['minute', 'hour', 'day']
unless $duration.nil? || valid_durations.include?($duration)
  p "Warning: unrecognized duration setting, defaulting to 'minute'."
  $duration = 'minute'
end
#Search period.
#When omitted, start_date and end_date are left for the Search API to default (NOW - 30.days and NOW respectively).
# Accepted command-line formats:
# YYYYMMDDHHmm or ISO YYYY-MM-DD HH:MM.
# 14d = 14 days, 48h = 48 hours, 360m = 6 hours
# Dates may also come from the rules file, but the command-line takes precedence.
# Expected: start_date < end_date, and end_date <= NOW.
#The target is a PowerTrack timestamp in YYYYMMDDHHmm format; the conversion is delegated to
#oSearch.set_date_string, which is handed the raw command-line text.

#Start date, only when passed in.
oSearch.from_date = oSearch.set_date_string($start_date) unless $start_date.nil?

#End date, only when passed in.
oSearch.to_date = oSearch.set_date_string($end_date) unless $end_date.nil?
#Max results is optional, defaults to 100 by Search API. The value is forwarded exactly as received.
oSearch.max_results = $max_results unless $max_results.nil?

#An outbox switches storage to files; -z additionally turns on compression of those files.
unless $outbox.nil?
  oSearch.out_box = $outbox
  oSearch.storage = "files"
  oSearch.compress_files = true unless $zip.nil?
end
#Publisher defaults to 'Twitter' (handled in client).
#A publisher given on the command-line is compared case-insensitively; anything but twitter is an error.
unless $publisher.nil?
  requested_publisher = $publisher.downcase
  if requested_publisher == "twitter"
    oSearch.publisher = requested_publisher
  else
    error_msgs << "The Search API currently supports only Twitter, which is set as the default."
  end
end
#Check for configuration errors. If any were collected, list them all and stop.
unless error_msgs.empty?
  p "Errors in configuration: "
  error_msgs.each { |msg| p msg }
  p ""
  p "Please check configuration and try again... Exiting."
  #Exit with a non-zero status so shells and calling scripts can tell the run failed
  #(a bare exit reports success).
  exit 1
end
#Configuration is complete - issue one request per rule.
search_rules = oSearch.rules.rules

if $look == true
  #Counts only (-l): print each rule's counts as JSON.
  search_rules.each { |entry|
    query = entry["value"]
    p "Getting counts for rule: #{query}"
    counts = oSearch.get_counts(query, oSearch.from_date, oSearch.to_date, $duration)
    puts counts.to_json
  }
else
  #Data request: fetch activities for each rule, passing its tag (if any) along.
  interval = "minute"
  search_rules.each { |entry|
    query = entry["value"]
    p "Getting activities for rule: #{query}"
    oSearch.get_data(query, oSearch.from_date, oSearch.to_date, interval, entry["tag"])
  }
end

p "Exiting"
end