tshark_extractor.py
#!/usr/bin/python
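"""
Extract files transferred over HTTP, SMB, TFTP, and FTP from a packet capture
by driving tshark and reassembling the hex-encoded payloads it reports.

Written for Python 2 (cStringIO/StringIO); tshark must be available on the PATH.

Example invocation (the capture and filter names here are only illustrative):
    python tshark_extractor.py -i capture.pcap -o output/ -D "ip.addr == 10.0.0.5"
"""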
import binascii
import sys
import argparse
import gzip
import os
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
from subprocess import check_output


def parse_http_stream(matching_item):
    """
    Given a tshark HTTP field row, return a [file name, binary data] pair.
    """
    end_of_header = -1
    file_bytes = binascii.unhexlify(matching_item[1].replace(":", "").strip("\""))
    try:
        # Find the end of the response header. This should always be \r\n\r\n to satisfy the HTTP standard.
        end_of_header = file_bytes.index('\r\n\r\n') + 4
    except ValueError:
        # No complete header in this stream; skip it.
        return
    if 'Content-Encoding: gzip' in file_bytes[:end_of_header]:
        # Content-Encoding header indicates gzipped content. Try to uncompress.
        buf = StringIO(file_bytes[end_of_header:])
        f = gzip.GzipFile(fileobj=buf)
        file_bytes = f.read()
    else:
        # Not gzipped, just grab the response body.
        file_bytes = file_bytes[end_of_header:]
    # Base the file name on the TCP stream number so the stream can be found in Wireshark later.
    return ["http_stream_" + matching_item[2].strip("\""), file_bytes]


def parse_smb_stream(matching_item):
    """
    Given a tshark SMB field row, return a [file name, binary data] pair.
    """
    file_bytes = binascii.unhexlify(matching_item[4].replace(":", "").strip("\""))
    # The SMB file ID is easily extracted from tshark.
    # Use "smb_id_<file id>" as the name to avoid duplicates.
    return ["smb_id_" + matching_item[3].strip("\""), file_bytes]


def parse_tftp_stream(matching_item):
    """
    Given a tshark TFTP field row, return a [file name, binary data] pair.
    """
    file_bytes = binascii.unhexlify(matching_item[5].replace("\"", "").replace(":", ""))
    # Name the output file after field [6] so related packets accumulate under one name.
    file_name = "tftp_stream_" + matching_item[6].strip("\"")
    return [file_name, file_bytes]


def extract_files(outdir, infile, displayfilter):
    """
    Extract files from the capture and write them to the specified output directory.
    """
    # Extract all the stream numbers matching the display filter, along with the
    # reassembled data. The stream number goes into the file name so the stream
    # can be found again in Wireshark later if necessary.
    # Returned columns:
    #   Used to determine protocol:
    #     [0]: _ws.col.Protocol
    #   Used by HTTP:
    #     [1]: tcp.reassembled.data
    #   Used by HTTP and FTP:
    #     [2]: tcp.stream
    #   Used by SMB:
    #     [3]: smb.fid
    #     [4]: smb.file_data
    #   Used by TFTP:
    #     [5]: data
    #     [6]: tftp.source_file
    #     [7]: tftp.destination_file
    #     [8]: udp.srcport
    #     [9]: udp.dstport
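    # tshark output options used below: "-E quote=d" wraps each field in double
    # quotes (hence the strip("\"") calls throughout), "-E occurrence=a" prints
    # all occurrences of a field, and "-E separator=|" separates fields with "|".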
    base_filter = "(http.content_length > 0 || (smb.file_data && smb.remaining==0) || ftp-data || tftp.opcode==3)"
    if displayfilter == '':
        full_filter = base_filter
    else:
        full_filter = displayfilter + " && " + base_filter
    hex_stream_data_list = check_output(
        ["tshark", "-r", infile, "-Y", full_filter, "-T", "fields",
         "-e", "_ws.col.Protocol", "-e", "tcp.reassembled.data", "-e", "tcp.stream",
         "-e", "smb.fid", "-e", "smb.file_data", "-e", "data",
         "-e", "tftp.source_file", "-e", "tftp.destination_file",
         "-e", "udp.srcport", "-e", "udp.dstport",
         "-E", "quote=d", "-E", "occurrence=a", "-E", "separator=|"]).split()
    ftp_data_streams = []
    reassembled_streams = []
    # Tshark sometimes returns stream numbers with no data, so only keep the
    # rows that carry hex-encoded data and convert them back to binary.
    for matching_item in hex_stream_data_list:
        x_item = matching_item.split("|")
        x_protocol = x_item[0].strip("\"")
        # Pick a parsing method based on the protocol as reported by tshark.
        if x_protocol == 'HTTP' or x_protocol == 'HTTP/XML':
            # Use the HTTP parsing method.
            parsed_stream = parse_http_stream(x_item)
            # parse_http_stream traps partial streams and returns None for them.
            if parsed_stream is not None:
                # We have a valid stream. Search the previous streams for files
                # already extracted from this TCP stream.
                search_index = [x for x, y in enumerate(reassembled_streams) if parsed_stream[0] in y[0]]
                if len(search_index) > 0:
                    # A match was found, so modify the file name to avoid overwriting the others.
                    parsed_stream[0] = parsed_stream[0] + "_" + str(len(search_index))
                # Add the file to the list of extracted files.
                reassembled_streams.append(parsed_stream)
        elif x_protocol == 'SMB':
            # Use the SMB parsing method.
            parsed_stream = parse_smb_stream(x_item)
            # Search the previous streams for a matching file name.
            search_index = [x for x, y in enumerate(reassembled_streams) if y[0] == parsed_stream[0]]
            if len(search_index) > 0:
                # The file name already exists; append the raw bytes to the existing file.
                reassembled_streams[search_index[0]][1] = reassembled_streams[search_index[0]][1] + parsed_stream[1]
            else:
                # No packets have been parsed out for this file yet; start a new reassembled stream.
                reassembled_streams.append(parsed_stream)
        elif x_protocol == 'TFTP':
            # Use the TFTP parsing method.
            parsed_stream = parse_tftp_stream(x_item)
            # Search the previous streams for a matching file name.
            search_index = [x for x, y in enumerate(reassembled_streams) if y[0] == parsed_stream[0]]
            if len(search_index) > 0:
                # The file name already exists; append the raw bytes to the existing file.
                reassembled_streams[search_index[0]][1] = reassembled_streams[search_index[0]][1] + parsed_stream[1]
            else:
                # No packets have been parsed out for this file yet; start a new reassembled stream.
                reassembled_streams.append(parsed_stream)
        elif x_protocol == 'FTP-DATA':
            # FTP streams are handled separately below; just record the TCP stream number.
            ftp_data_streams.append(x_item[2].strip("\""))
        elif x_protocol != '':
            # This shouldn't be possible; display a warning.
            print("WARNING: untrapped protocol: ---" + x_protocol + "---\n")
    # Write all reassembled streams out to files.
    for reassembled_item in reassembled_streams:
        fh = open(os.path.join(outdir, reassembled_item[0]), 'wb')
        fh.write(reassembled_item[1])
        fh.close()
    # Handle FTP streams: for each recorded stream, rerun tshark to extract the
    # raw hex data for that TCP stream.
    for stream_number in ftp_data_streams:
        hex_stream_list = check_output(["tshark", "-q", "-n", "-r", infile, "-z", "follow,tcp,raw," + stream_number]).split("\n")
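        # The "follow" output wraps the hex payload in banner lines at the top
        # (separator and stream summary) and a closing separator at the bottom;
        # the slicing below keeps only the hex payload lines.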
        list_length = len(hex_stream_list)
        # Strip the excess output from the tshark extraction.
        hex_stream_text = ''.join(hex_stream_list[6:list_length - 2])
        # Convert the hex back to raw bytes.
        file_bytes = binascii.unhexlify(hex_stream_text)
        # Write the extracted FTP file, named after its TCP stream number.
        fh = open(os.path.join(outdir, 'ftp_stream_' + stream_number), 'wb')
        fh.write(file_bytes)
        fh.close()


def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--outdir', default='output/')
    parser.add_argument('-i', '--infile')
    parser.add_argument('-D', '--displayfilter', default='')
    args = parser.parse_args(args)
    if not args.infile:
        parser.error('Missing input file argument.')
    # Create the output directory if it does not already exist.
    try:
        os.makedirs(args.outdir)
    except OSError:
        if not os.path.isdir(args.outdir):
            raise
    extract_files(args.outdir, args.infile, args.displayfilter)


if __name__ == "__main__":
    main(sys.argv[1:])