-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcdxpress.py
422 lines (345 loc) · 12.1 KB
/
cdxpress.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import time
startTime = time.time()
import argparse
import ijson
import json
import os
import os.path
import random
import re
import requests
import sys
import textwrap
import urllib.parse
from pathlib import Path
from requests.utils import quote
version = '0.2b'
#-------------------------------------#
# cdxpress by av1d #
#-------------------------------------#
# https://github.com/av1d/cdx-tools #
#-------------------------------------#
# This is the express version of cdx-tools and is probably fine for most people,
# though it is meant for quick pulls and lacks extensive features.
# Visit the repo above for tools with much more control.
def banner():
    """Build the banner text used as the argparse description.

    The two quick-start examples are appended only when the first
    command-line argument is ``-h`` or ``--help``.

    Returns:
        str: The boxed program name and version, optionally followed by
        the quick-start examples.
    """
    border = "+----------------------------+"
    title = "\n| cdxpress v" + version + " by av1d |\n"
    # Slicing yields an empty list instead of the IndexError that
    # sys.argv[1] raises when the script is started without arguments.
    if sys.argv[1:2] in (["-h"], ["--help"]):
        quickStart = (
            "\nQuick start 1: cdxpress.py --url example.org --scan=.jpg,.png,.js,.css"
            "\nQuick start 2: cdxpress.py --url example.org --scan=.ZIP,.TAR,.GZ --exclude=.exe,.jar --case-sensitive --from 1999 --to 2004 --out links.txt"
        )
    else:
        quickStart = ""
    return border + title + border + quickStart
def sep():
    """Return the dashed divider line appended to each option's help text."""
    divider = "------------\n"
    return divider
def setArgs():
    """Parse and validate the command-line arguments.

    Side effects:
        * Stores the parsed arguments in the module-level dict ``args``.
        * Stores the integer form of --to / --from in the module-level
          ``toDateInt`` / ``fromDateInt`` (each only when that option
          was supplied).
        * Prompts on stdin when the --outfile target already exists.

    Exits with status 1 when a date is not an integer, is outside 4-14
    digits, or when --to lies before --from.  Exits with status 0 when
    the user declines to append to an existing output file.
    """
    global args  # dict
    global toDateInt
    global fromDateInt

    parser = argparse.ArgumentParser(
        description=banner(),
        usage=(
            'use "cdxpress --help" for more information'
        ),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        '-u',
        '--url',
        metavar='URL',
        required=True,
        help=(
            "The URL to search.\n"
            + sep()
        ),
    )
    parser.add_argument(
        '-s',
        '--scan',
        type=str,
        metavar='STRINGS',
        required=True,
        help=(
            "Strings to scan for.\n" +
            "Example usage:\n" +
            "--scan=.exe,.JPG,.zip,\"/cgi-bin/x.cgi?\",\"space space\"\n" +
            "Items are comma-separated, no spaces.\n" +
            "Enclose strings with spaces and special characters in\n" +
            "single or double quotes.\n" +
            "Leave blank to return ALL files (example: --scan= ).\n"
            + sep()
        ),
    )
    parser.add_argument(
        '-x',
        '--exclude',
        type=str,
        metavar='NEGATIVE_STRINGS',
        required=False,
        help=(
            "Do not return results for URLs containing these words.\n" +
            "Example usage:\n" +
            "--exclude=www,.exe,cgi-bin\n" +
            "Items are comma-separated, no spaces.\n" +
            "Enclose strings with spaces and special characters in\n" +
            "single or double quotes.\n"
            + sep()
        ),
    )
    parser.add_argument(
        '-c',
        '--case-sensitive',
        action='store_true',
        required=False,
        help=(
            "Case sensitive filtering on search strings.\n" +
            "Default: insensitive.\n"
            + sep()
        ),
    )
    parser.add_argument(
        '-o',
        '--outfile',
        metavar='OUTPUT_FILE',
        required=False,
        help=(
            "Plain text output file.\n"
            + sep()
        ),
    )
    # The help text states 4-14 digits so that it agrees with the
    # validation performed below.
    parser.add_argument(
        '-f',
        '--from',
        metavar='FROM_DATE',
        required=False,
        help=(
            "Search FROM this date. 4-14 digits.\n"
            + "Example: --from 2004 or use a Wayback timestamp:\n"
            + "--from 20040601150932\n"
            + "Timestamp format: yyyyMMddhhmmss\n"
            + "Omit --to and --from for all dates.\n"
            + sep()
        ),
    )
    parser.add_argument(
        '-t',
        '--to',
        metavar='TO_DATE',
        required=False,
        help=(
            "Search up TO this date. 4-14 digits.\n"
            + "Example: --to 2004 or use a Wayback timestamp:\n"
            + "--to 20040601150932\n"
            + "Timestamp format: yyyyMMddhhmmss\n"
            + "Omit --to and --from for all dates.\n"
            + sep()
        ),
    )
    parser.add_argument(
        '-v',
        '--version',
        action='store_true',
        required=False,
        help=(
            "Print version information then exit.\n"
            + sep()
        ),
    )

    args = vars(parser.parse_args())

    # Convert first, so that a non-numeric date produces the friendly
    # message below rather than a ValueError traceback from a later check.
    try:
        if args['to'] is not None:
            toDateInt = int(args['to'])
        if args['from'] is not None:
            fromDateInt = int(args['from'])
    except ValueError:
        print("--- Error: Dates must be integers.\n")
        sys.exit(1)

    # check date format: at least a 4-digit year, at most a full
    # 14-digit yyyyMMddhhmmss timestamp
    for option in ('to', 'from'):
        if args[option] is None:
            continue
        if not 1000 <= int(args[option]) < 10 ** 14:
            print(
                "--- Error: date must be 4-14 digits.\n"
            )
            sys.exit(1)

    # Partial timestamps denote a range, so compare the earliest instant
    # of --from with the latest instant of --to.  Comparing the raw
    # integers would wrongly reject e.g. --from 200406 --to 2005.
    if args['to'] is not None and args['from'] is not None:
        earliest = int(str(fromDateInt).ljust(14, "0"))
        latest = int(str(toDateInt).ljust(14, "9"))
        if latest < earliest:
            print(
                "--- Error: --to date is less than --from date."
            )
            sys.exit(1)

    # check if output file already exists...
    outputFilename = args['outfile']
    if outputFilename is not None and os.path.isfile(outputFilename):
        fileExists = input(
            "File: " +
            str(outputFilename) +
            " exists. Append (y/n)? "
        )
        if fileExists.strip().lower() != "y":
            sys.exit(0)
def generateOutput(url_string, timestamp):
    """Print the Wayback Machine link for one capture and optionally log it.

    Args:
        url_string: Original URL of the capture.
        timestamp: Wayback timestamp of the capture.

    The link is printed to stdout.  When --outfile was given, the link
    is also appended to that file, one link per line.
    """
    outURL = (
        "https://web.archive.org/web/" +
        str(timestamp) +
        "/" +
        str(url_string)
    )
    if args['outfile'] is not None:
        # Pin the encoding: archived URLs may contain non-ASCII
        # characters that the platform default codec cannot write.
        with open(args['outfile'], 'a', encoding='utf-8') as f:
            f.write(outURL + "\n")
    print(outURL)
def fetchResponse(URL):
    """Query the CDX API and return its result rows as a list of dicts.

    Args:
        URL: Fully built CDX API request URL.  The response is parsed as
            a JSON list of rows whose first row holds the field names.

    Returns:
        list[dict]: One dict per result row, keyed by the field names
        taken from the first row of the response.

    Side effects:
        Stores the HTTP status, as a string, in the module-level
        ``status_code``.

    Exits the program on connection errors, on a non-200 status (exit
    status 1), and on an empty, unparsable or malformed response.
    """
    global status_code

    timeoutSEC = 60  # http timeout in seconds
    print("\nFetching: " + URL)
    print("Response timeout set to: " + str(timeoutSEC) + " seconds")

    headers = {"User-Agent": "cdxpress/" + version}

    # stream=True returns as soon as the headers arrive, so the status
    # can be reported before the (possibly slow) body download starts.
    try:
        response = requests.get(
            URL,
            headers=headers,
            timeout=timeoutSEC,
            stream=True
        )
    except requests.exceptions.Timeout:
        raise SystemExit("Connection timed out")
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

    status_code = str(response.status_code)
    if status_code != "200":  # exit unless 200
        print("Received HTTP status: " + status_code)
        response.close()
        sys.exit(1)  # a failed request is an error, so exit non-zero
    print("Received HTTP status: ", response.status_code, response.reason, " - connected to server.")

    print("\nDownloading the response may take a long time, do not stop the program...\n")
    # The body is only downloaded here, so network failures can still occur.
    try:
        body = response.text
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    finally:
        response.close()

    try:
        table = json.loads(body)  # parse API response
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print("\n--- Error: The response could not be parsed as JSON.")
        sys.exit(1)

    if not table:  # if list is empty...
        print("\n--- Error: Response file is empty. Likely the URL provided is invalid or is not archived.")
        sys.exit(1)

    if not isinstance(table, list) or not isinstance(table[0], list):
        print("\n--- Error: An unknown error has occurred. The response is missing the appropriate keys.\n")
        sys.exit(1)

    # The first row carries the field names; pair them with every data row.
    keys = table[0]
    return [dict(zip(keys, row)) for row in table[1:]]
def checkMatch(url_string, timestamp):
    """Output one capture if it matches a search string and is not excluded.

    Args:
        url_string: Original URL of the capture.
        timestamp: Wayback timestamp of the capture.

    Reads the module-level ``args``, ``options`` and ``neg_words``.  On a
    match, increments the module-level ``scanLINES`` and passes the URL
    (in its original case) to ``generateOutput``.  Each URL is output and
    counted at most once, even when several search strings match it.
    """
    global scanLINES  # int. counter for --scan
    global neg_match  # bool. track if a negative keyword was found

    originalString = url_string
    caseSensitive = args['case_sensitive']
    if not caseSensitive:
        url_string = url_string.lower()

    # The exclusion test does not depend on the search string, so run it
    # once per URL and always reset the shared flag for the next URL.
    if neg_words:  # if negative keywords were specified
        checkNegMatch(url_string)
    excluded = neg_match
    neg_match = False
    if excluded:
        return

    for searchStrings in options.values():
        for string in searchStrings:
            if not caseSensitive:
                string = string.lower()
            if string in url_string:
                scanLINES += 1
                generateOutput(originalString, timestamp)
                # Stop at the first hit so a URL matching several search
                # strings is not printed and counted repeatedly.
                return
def checkNegMatch(url_string):  # check for negative keywords, if specified
    """Flag the URL as excluded if it contains any --exclude word.

    Args:
        url_string: URL to test.

    Sets the module-level ``neg_match`` to True on a hit and leaves it
    unchanged otherwise.  The comparison ignores case unless
    --case-sensitive was given (read from the module-level ``args``).
    """
    global neg_match  # bool. track if a negative keyword was found

    caseSensitive = args['case_sensitive']
    haystack = url_string if caseSensitive else url_string.lower()
    for word in neg_words:
        if not word:
            # A blank entry (e.g. from a trailing comma) is a substring
            # of every URL and would exclude all results.
            continue
        needle = word if caseSensitive else word.lower()
        if needle in haystack:
            neg_match = True
            return
def main():
    """Run the tool: parse arguments, query the CDX API, print matches.

    Initialises the module-level search state (``scanLINES``,
    ``neg_words``, ``neg_match``, ``options``) read by ``checkMatch``,
    then prints a summary with the match count and execution time.
    """
    global scanLINES  # int. counts found items
    global neg_words  # list. contains negative search words
    global neg_match  # bool. track if a negative keyword was found
    global options  # dict. contains search queries

    # Handled before argparse because --url and --scan are required.
    # The slice is simply empty when no arguments were given, whereas
    # sys.argv[1] would raise IndexError.
    if sys.argv[1:2] in (["-v"], ["--version"]):
        print("cdxpress v" + version)
        sys.exit(0)

    setArgs()

    # Honour -v/--version when it is not the first argument as well.
    if args['version']:
        print("cdxpress v" + version)
        sys.exit(0)

    # if user-provided URL doesn't contain a scheme, add it so that
    # urlsplit can separate the host from the path
    target = str(args['url'])
    if "://" not in target:
        target = "http://" + target
    parts = urllib.parse.urlsplit(target)
    theHost = parts.netloc
    thePath = parts.path

    # A bare host searches the whole domain; a path searches by prefix.
    if thePath == '' or thePath == '/':
        matchType = "&matchType=domain"
    else:
        matchType = "&matchType=prefix"

    if thePath == '':  # fix an empty path, just for sanity
        thePath = "/"

    cdxURL = (
        "https://web.archive.org/cdx/search/cdx?"
        + "url="
        + theHost
        + thePath
        + matchType
        + "&collapse=urlkey&output=json&gzip=false&filter=statuscode:200&fl=timestamp,original"
    )

    if args['to'] is not None:
        cdxURL = cdxURL + "&to=" + str(args['to'])

    if args['from'] is not None:
        cdxURL = cdxURL + "&from=" + str(args['from'])

    data = fetchResponse(cdxURL)  # list of dictionaries from API response

    scanLINES = 0
    neg_match = False

    ## --exclude negative keywords. build list of negative search keywords
    neg_words = []
    if args['exclude'] is not None:
        neg_words = args['exclude'].split(',')  # split input string into list

    ## --scan search
    options = {}
    if args['scan'] is not None:
        options['scan'] = args['scan'].split(',')  # list of strings to scan for

    for entry in data:
        checkMatch(entry['original'], entry['timestamp'])

    ## RESULTS
    print("\nScan complete.")

    if args['case_sensitive']:
        msg = "Performed case -SENSITIVE- search."
    else:
        msg = "Performed case insensitive search."
    print(msg)

    executionTime = (time.time() - startTime)

    print(
        "Found: " +
        str(scanLINES) +
        " files." +
        "\nExecution time: " +
        str(executionTime) +
        " seconds"
    )
# Start the scan only when this file is executed as a script.
if __name__ == '__main__':
    main()