This repository has been archived by the owner on Jan 26, 2023. It is now read-only.
forked from remram44/pybabel-godot
-
Notifications
You must be signed in to change notification settings - Fork 1
/
json_extractor.py
143 lines (116 loc) · 4.84 KB
/
json_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from collections import deque
from babel.messages.jslexer import tokenize, unquote_string
# This helper is copied to all necessary files because once again python imports are being a pain
def reopen_normal_read(file_obj, encoding):
    """Open the file behind *file_obj* a second time, as text, for reading.

    The path is taken from ``file_obj.name``; *file_obj* itself is neither
    read nor closed.  The caller owns the returned handle and should close it
    (for example by using it in a ``with`` statement).
    """
    path = file_obj.name
    text_handle = open(path, mode="r", encoding=encoding)
    return text_handle
# Object key that switches the extractor into "gettext object" mode.
JSON_GETTEXT_KEYWORD = "type"
# NOTE(review): the remaining names are not referenced in the visible part of
# this module; presumably they describe the layout of explicit gettext
# objects -- confirm before removing.
JSON_GETTEXT_VALUE = "gettext_string"
JSON_GETTEXT_KEY_CONTENT = "content"
JSON_GETTEXT_KEY_ALT_CONTENT = "alt_content"
JSON_GETTEXT_KEY_FUNCNAME = "funcname"
class JsonExtractor(object):
    """Collect translatable strings from JSON text with a token-driven state machine.

    The text is tokenized with ``tokenize`` and every string value whose key
    (compared case-insensitively) is listed in ``EXTRACTED_KEYS`` is recorded
    together with its line number.

    Attributes:
        state: one of ``'start'``, ``'key'``, ``'value'`` or ``'end'``.
        data: the JSON text to scan.
        token_to_add: string token waiting to be recorded, or ``None``.
        gettext_mode: ``True`` once the current object contained the
            ``JSON_GETTEXT_KEYWORD`` key; the pending token is then only
            recorded when the object closes.
        current_key: unquoted key of the pair being read, or ``None``.
        in_array: whether the innermost open container is an array.
        nested_in_array: stack with the ``in_array`` value of every enclosing
            container, restored when the container closes.
        results: list of result dicts built by ``add_result``.
        token_params: extra values merged into the next recorded result.
    """

    # TODO: auto-detecting items to extract through the keywords passed to
    # extract_json would be very nice
    EXTRACTED_KEYS = frozenset(
        ("groupname", "displayname", "name", "message", "messages"))

    def __init__(self, data):
        """Prepare an extractor for the JSON text *data*."""
        self.state = 'start'
        self.data = data
        self.token_to_add = None
        # Not read anywhere in this class; kept so existing users of the
        # attribute keep working.
        self.is_value = False
        self.gettext_mode = False
        self.current_key = None
        self.in_array = False
        self.nested_in_array = []
        self.results = []
        self.token_params = {}

    def _leave_container(self):
        """Return the ``in_array`` flag of the container being returned to.

        An unbalanced closing bracket leaves the stack empty; in that case we
        fall back to "not in an array" instead of raising ``IndexError``.
        """
        if self.nested_in_array:
            return self.nested_in_array.pop()
        return False

    def start_object(self):
        """Handle ``{``: start reading keys of a new object."""
        self.gettext_mode = False
        self.state = 'key'
        # Things would be incorrect if an object is contained in an array, so
        # the enclosing state is kept on a stack and restored in end_object.
        self.nested_in_array.append(self.in_array)
        self.in_array = False

    def with_separator(self, token):
        """Handle ``:``: the next token is the value of the current key."""
        self.state = 'value'

    def start_array(self):
        """Handle ``[``: remember the enclosing container and enter an array."""
        self.nested_in_array.append(self.in_array)
        self.in_array = True

    def end_array(self):
        """Handle ``]``: restore the enclosing container and finish the element.

        Restoring (rather than clearing) ``in_array`` keeps the current key
        alive when the closed array was itself an element of another array.
        """
        self.in_array = self._leave_container()
        self.end_pair()

    def end_pair(self, add_gettext_object=False):
        """Finish a ``key: value`` pair or an array element.

        A pending token is recorded unless the object is in gettext mode, in
        which case it is only recorded when *add_gettext_object* is true
        (i.e. when the object closes).  Inside an array the current key and
        state are kept so that the following elements belong to the same key.
        """
        if self.token_to_add is not None:
            if not self.gettext_mode or add_gettext_object:
                self.add_result(self.token_to_add)
        if not self.in_array:
            self.current_key = None
            self.state = 'key'

    def end_object(self):
        """Handle ``}``: flush the pending token and return to the parent."""
        self.end_pair(add_gettext_object=True)
        self.gettext_mode = False
        self.state = 'end'
        self.in_array = self._leave_container()

    def add_result(self, token):
        """Append a result dict for *token* and clear the pending state.

        The dict holds ``line_number`` and ``content``; entries of
        ``token_params`` are merged in, with an ``'alt_token'`` entry expanded
        into ``alt_content`` and ``alt_line_number``.
        """
        result = dict(
            line_number=token.lineno,
            content=unquote_string(token.value),
        )
        for key, param in self.token_params.items():
            if key == 'alt_token':
                result['alt_content'] = unquote_string(param.value)
                result['alt_line_number'] = param.lineno
            else:
                result[key] = unquote_string(param)
        self.results.append(result)
        self.token_to_add = None
        self.token_params = {}

    def get_lines_data(self):
        """Scan ``self.data`` and return the list of extracted results.

        Each entry is a dict as built by ``add_result``.  The same string may
        appear several times; every occurrence gets its own entry.

        :rtype: list
        """
        for token in tokenize(self.data):
            if token.type == 'operator':
                if token.value == '{':
                    self.start_object()
                elif token.value == '[':
                    self.start_array()
                elif token.value == ':':
                    self.with_separator(token)
                elif token.value == '}':
                    self.end_object()
                elif token.value == ']':
                    self.end_array()
                elif token.value == ',':
                    self.end_pair()
            elif token.type == 'string':
                if self.state == 'key':
                    self.current_key = unquote_string(token.value)
                    if self.current_key == JSON_GETTEXT_KEYWORD:
                        self.gettext_mode = True
                elif (self.current_key is not None
                        and self.current_key.lower() in self.EXTRACTED_KEYS):
                    # current_key is None for strings that are not the value
                    # of any key (e.g. a top-level array); those are skipped.
                    self.token_to_add = token
        return self.results
def extract_json(file_obj, keywords, comment_tags, options):
    """Babel extraction method yielding translatable strings from a JSON file.

    The file named by ``file_obj.name`` is re-read as text, using
    ``options['encoding']`` (``'utf-8'`` when absent), and scanned with
    ``JsonExtractor``.  *keywords* and *comment_tags* are accepted for
    interface compatibility but not consulted.

    Yields ``(line_number, funcname, messages, comments)`` tuples: *funcname*
    defaults to ``'gettext'``, *messages* is a tuple holding the string (plus
    its alternative form for ``'ngettext'`` entries) and *comments* is always
    an empty list.
    """
    encoding = options.get('encoding', 'utf-8')
    with reopen_normal_read(file_obj, encoding) as handle:
        text = handle.read()

    for entry in JsonExtractor(text).get_lines_data():
        if entry.get('funcname') == 'ngettext':
            messages = (entry['content'], entry['alt_content'])
        else:
            messages = (entry['content'],)
        yield entry['line_number'], entry.get('funcname', 'gettext'), messages, []