-
Notifications
You must be signed in to change notification settings - Fork 0
/
json_parser.py
304 lines (229 loc) · 11.2 KB
/
json_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# JSON parser for Python
# References:
## Lexer/parser understanding: https://dev.to/codingwithadam/introduction-to-lexers-parsers-and-interpreters-with-chevrotain-5c7b
import sys
import re
# create token class to store json regular expression patterns
class token:
    """Pair a token name with the regular expression that recognises it.

    Both values are stored as plain instance attributes, so ``vars(instance)``
    yields a ``{'name': ..., 'pattern': ...}`` dictionary.
    """

    def __init__(self, name, pattern):
        # name is assigned first so it stays the first entry of vars(self)
        self.name, self.pattern = name, pattern
# Regular-expression patterns for every JSON token the lexer recognises.
## an opening bracket is a { or [ at the very start of the input
opening_bracket = token('opening_bracket', r'^{|^\[')
## a key is the text between the double quotes at the very start of the input, or between the double quotes that follow a comma, as long as no } appears after that comma before the next {
key = token('key', r'(?<=^").+?(?=")|(?<=,(?![^{]*\})").+?(?=")')
## zero-width match directly after a colon, as long as no } appears after that colon before the next {
colon = token('colon', r'(?<=(:(?![^{]*\})))')
## a value starts right after a colon and runs lazily up to the comma that introduces the next quoted key, or up to the end of the input
value = token('value', r'(?<=:).+?(?=,(?![^{]*\})".+?(?=")|$)')
## zero-width match directly after a comma, as long as no } or ] appears after that comma before the next { or [
separator = token('separator', r'(?<=(,(?![^{\[]*[\}\]])))')
## a closing bracket at the very end of the input
## NOTE(review): the second alternative matches the literal text "//]" rather than a lone "]"; presumably '\\]$' was intended - confirm before changing, because it alters which inputs get a closing_bracket token
closing_bracket = token('closing_bracket', '}$|//]$')
# every token as a {'name': ..., 'pattern': ...} dictionary, in the order the lexer scans them
tokens = [vars(tok) for tok in (opening_bracket, colon, separator, key, value, closing_bracket)]
# checks if the file name ends with a valid json file extension (.json); returns 0 if it does, otherwise 1
def valid_json_file(arg):
    """Return 0 when *arg* names a ``.json`` file, otherwise 1.

    The name must be longer than the five-character extension itself, so a
    bare ``'.json'`` is rejected.
    """
    has_stem = len(arg) > 5
    return 0 if has_stem and arg[-5:] == '.json' else 1
# removes all whitespace that is outside of double-quoted text from the json to evaluate
def remove_whitespace(string):
    """Return *string* with whitespace outside double-quoted text removed.

    A run of whitespace is dropped only when the remainder of the string
    contains an even number of double quotes, i.e. when the run is not
    inside an open quoted section.
    """
    outside_quotes = re.compile('\\s+(?=(?:[^"]*"[^"]*")*[^"]*$)')
    return outside_quotes.sub('', string)
# reads json file and checks if file is valid json file
def read_json(arg):
    """Read the JSON file at path *arg* and return its whitespace-stripped text.

    The path is first checked with ``valid_json_file``. When that check
    returns 1 the process exits via ``sys.exit`` with an error message;
    otherwise a confirmation line is printed and the file is read.

    Returns:
        The file content after ``remove_whitespace`` has been applied.
    """
    # return code for valid json file extension
    valid_json_file_return_code = valid_json_file(arg)
    if valid_json_file_return_code == 1:
        sys.exit("Error: File input is not a valid JSON file. Return code: " + str(valid_json_file_return_code))
    print("Valid JSON file. Return code: " + str(valid_json_file_return_code))
    # the context manager closes the file even when reading or stripping raises
    with open(arg, 'r') as json_file:
        return remove_whitespace(json_file.read())
# for lexer to interpret type of value token
# optional minus sign, ASCII digits, optional fraction, optional exponent
_NUMERIC_PATTERN = re.compile(r'-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?')


def check_type(string):
    """Classify the raw text of a JSON value for the lexer.

    Returns one of ``'numeric'``, ``'boolean'``, ``'object'``, ``'list'``,
    ``'none'`` or ``'string'``. The checks run in that order and the first
    one that matches wins; anything that matches none of them is a string.
    """
    # integers, negatives, decimals and exponents all count as numeric;
    # str.isdigit() alone misses "-1" and "1.5" and accepts characters such
    # as superscript digits that int()/float() cannot convert
    if _NUMERIC_PATTERN.fullmatch(string):
        return 'numeric'
    lowered = string.lower()
    # true/false are compared case-insensitively
    if lowered in ('true', 'false'):
        return 'boolean'
    if len(string) >= 2:
        # {...} is a nested object, [...] is a list
        if string[0] == '{' and string[-1] == '}':
            return 'object'
        if string[0] == '[' and string[-1] == ']':
            return 'list'
    # null maps to the python None type
    if lowered == 'null':
        return 'none'
    # nothing else matched, so treat the text as a string
    return 'string'
# create lexer class instantiated with tokens and string json output
## tokenize function creates a list with dictionary for each token containing value of token (string), token type, token pattern, value type (if token is value, otherwise blank), token start in string and token end in string (important for ordering)
class lexer:
    """Split a JSON string into positioned tokens.

    Attributes are set from the constructor arguments:
        tokens: list of ``{'name': ..., 'pattern': ...}`` dictionaries
            describing the token types to look for.
        json_input: the JSON text to tokenize.
    """

    def __init__(self, tokens, json_input):
        self.tokens = tokens
        self.json_input = json_input

    def tokenize(self):
        """Return every token found in ``self.json_input``, ordered by position.

        Each token is a dictionary with the keys ``string_value`` (matched
        text), ``token_type``, ``token_pattern``, ``value_type`` (result of
        ``check_type`` for ``value`` tokens, ``''`` for all others),
        ``position_start`` and ``position_end``.

        Bracket tokens are matched against the full input. All other tokens
        are matched against the input with the bracket patterns removed, and
        their positions are shifted by one to account for the removed
        opening bracket.
        """
        bracket_names = ('opening_bracket', 'closing_bracket')
        # use the tokens this instance was given rather than a module global
        bracket_pattern = '|'.join(
            tok['pattern'] for tok in self.tokens if tok['name'] in bracket_names
        )
        # remove brackets for evaluation so that nested json objects and lists
        # can be interpreted; identical for every token, so computed once
        eval_string = re.sub(bracket_pattern, '', self.json_input)

        tokenized_output = []
        for tok in self.tokens:
            is_bracket = tok['name'] in bracket_names
            searched_text = self.json_input if is_bracket else eval_string
            # add one to start and end to adjust for the opening bracket that
            # is not part of eval_string
            offset = 0 if is_bracket else 1
            for item in re.finditer(tok['pattern'], searched_text):
                start, end = item.span()
                text = item.group()
                tokenized_output.append({
                    'string_value': text,
                    'token_type': tok['name'],
                    'token_pattern': tok['pattern'],
                    # only value tokens carry a type; resetting it here keeps
                    # a previous value's type from leaking onto later tokens
                    'value_type': check_type(text) if tok['name'] == 'value' else '',
                    'position_start': start + offset,
                    'position_end': end + offset,
                })
        # list.sort is stable, so tokens that share a start position (a
        # zero-width colon and the value that follows it) keep scan order
        tokenized_output.sort(key=lambda entry: entry['position_start'])
        # tokenize returns a list of every token in the json string
        return tokenized_output
# checks if the order of opening bracket, closing bracket, key, colon, value, separator is in a valid JSON format. Otherwise a parse error will occur when parser is instantiated and used
def parse_error(tokenized_output):
    """Return 0 when the token order looks like valid JSON, otherwise 1.

    The token list is accepted (0) as soon as one of these holds:
      * the first token is an opening bracket and the last a closing bracket;
      * some key is preceded by an opening bracket or separator and
        followed by a colon;
      * some value is preceded by a colon and followed by a closing bracket
        or separator.
    An empty list, or a list where none of the above holds, yields 1.

    NOTE(review): a single satisfied condition is enough to accept, so this
    does not verify every token. Tightening it would change which inputs are
    accepted, so it should be done together with the lexer patterns.
    """
    count = len(tokenized_output)
    if count == 0:
        return 1

    def type_at(index):
        # neighbours outside the list count as "no token" instead of raising
        # IndexError or wrapping around to the other end of the list
        if 0 <= index < count:
            return tokenized_output[index]['token_type']
        return None

    # the bracket check does not depend on the loop position, so do it once
    if type_at(0) == 'opening_bracket' and type_at(count - 1) == 'closing_bracket':
        return 0

    for i in range(count):
        current = type_at(i)
        if current == 'key':
            if type_at(i - 1) in ('opening_bracket', 'separator') and type_at(i + 1) == 'colon':
                return 0
        elif current == 'value':
            if type_at(i - 1) == 'colon' and type_at(i + 1) in ('closing_bracket', 'separator'):
                return 0
    return 1
# parse string to actual value type
def parse_string(value, value_type):
# if value is string, put double quotes around it
if value_type == 'string':
output = re.sub('^"|"$','', value)
# if value is numeric, try integer, if value error occurs, try float
elif value_type == 'numeric':
try:
output = int(value)
except ValueError:
output = float(value)
# if value is true or false, turn to boolean
elif value_type == 'boolean':
if value.lower() == 'true':
output = True
else:
output = False
# if value is object (nested JSON/dictionary)
elif value_type =='object':
if len(value) > 2:
tokenized_string = lexer(tokens, remove_whitespace(value)).tokenize()
dictionary = parser(tokenized_string).value_parse()
# if object is empty, return empty dictionary object
else:
dictionary = {}
output = dictionary
# if value is list
elif value_type == 'list':
# remove brackets
string = value.strip("[]")
# split string on comma
split_string = string.split(',')
# create list with actual value types
new_list = []
# if list has items
if len(split_string) > 0:
# check type and use function recursively for list
for item in split_string:
value = item
value_type = check_type(item)
new_list.append(parse_string(value, value_type))
output = new_list
# remaining type is None (null)
else:
output = None
return output
# create parser class instantiated on tokenized output from lexer
class parser:
    """Turn the token list produced by the lexer into a python dictionary."""

    def __init__(self, tokenized_output):
        self.tokenized_output = tokenized_output

    def value_parse(self):
        """Validate the token order and return the parsed dictionary.

        ``parse_error`` is consulted first: a result of 1 ends the process
        through ``sys.exit``; otherwise a confirmation line is printed.
        A token list of exactly two entries yields an empty dictionary.
        Otherwise every ``value`` token gets a ``'value'`` entry holding its
        converted python object, and keys are paired with values in order of
        appearance.
        """
        status = parse_error(self.tokenized_output)
        if status == 1:
            sys.exit("Parse error: incorrect JSON format structure. Return code: " + str(status))
        print("Valid JSON object. Return code: "+ str(status))

        # exactly two tokens: nothing to map
        if len(self.tokenized_output) == 2:
            return {}

        found_keys = []
        found_values = []
        for entry in self.tokenized_output:
            kind = entry['token_type']
            if kind == 'value':
                # store the converted object on the token itself as well
                entry['value'] = parse_string(entry['string_value'], entry['value_type'])
                found_values.append(entry['value'])
            elif kind == 'key':
                found_keys.append(entry['string_value'])
        # zip maps the n-th key to the n-th value
        return dict(zip(found_keys, found_values))
class JSON:
    """Parse JSON given either as a ``.json`` file path or as JSON text.

    On construction the input is read (from file or directly), tokenized
    with ``lexer`` and parsed with ``parser``; the results are stored on the
    instance and exposed through ``tokens()`` and ``parse_json()``.
    """

    def __init__(self, json_input):
        self.json_input = json_input
        # text that starts with { or [ is inline JSON even when it contains a
        # dot (e.g. a decimal number or a URL); only other dotted input is
        # treated as a file path
        looks_inline = json_input.lstrip().startswith(('{', '['))
        if "." in json_input and not looks_inline:
            self.json_read = read_json(self.json_input)
        # else json_input is directly stored in a python string, remove whitespace
        else:
            self.json_read = remove_whitespace(self.json_input)
        self.tokenized_string = lexer(tokens, self.json_read).tokenize()
        self.parsed_json = parser(self.tokenized_string).value_parse()

    def tokens(self):
        """Return the list of tokens produced by the lexer."""
        return self.tokenized_string

    def parse_json(self):
        """Return the dictionary produced by the parser."""
        return self.parsed_json
# if script will be called directly, then use command line argument for json file
if __name__ == "__main__":
    # exit with a usage hint instead of an IndexError when no argument is given
    if len(sys.argv) < 2:
        sys.exit("Usage: python json_parser.py <file.json>")
    # the first command line argument is handed to JSON as its input
    json_object = JSON(sys.argv[1])
    # prints tokens
    print(json_object.tokens())
    # print parsed JSON
    print(json_object.parse_json())