Skip to content

Commit 9245e4f

Browse files
committed
parser rewrite for correctness
Summary: - added tests - added wrapper around the python tokenizer to manage rewinding and retokenizing in the middle of the stream - switched to a modified version of python's tokenize module that's a little more forgiving about EOF and unexpected dedents - wrote a dumb HTML parser from the spec, with additional support for shoving python tokens inside attributes - watch for '{' as we're feeding tokens to the HTML parser; when we see one, recurse, recurse! Note that this change is BACKWARDS INCOMPATIBLE. Primarily, we no longer support \ as an escape character. There are other changes to edge cases involving whitespace. Test Plan: Greatly expanded test suite. Compared with existing behaviour on the dropbox codebase. There are some differences, but I'm pretty sure they are all features, not bugs. ack --python "#\s*coding\s*[=:]\s*pyxl\s*$" -l > ~/pyxl_files.txt # check out old pyxl for file in `cat ~/pyxl_files.txt` ; do echo $file ; genfile=$file.old_pyxl ; parse_file.py $file > $genfile ; ~/normalize_pyxl_output.sh < $genfile > $genfile.norm ; done # check out new pyxl for file in `cat ~/pyxl_files.txt` ; do echo $file ; genfile=$file.new_pyxl ; parse_file.py $file > $genfile ; ~/normalize_pyxl_output.sh < $genfile > $genfile.norm ; done for file in `cat ~/pyxl_files.txt` ; do diff -u -b $file.old_pyxl.norm $file.new_pyxl.norm ; done > ~/pyxl.diff for file in `cat ~/pyxl_files.txt` ; do [[ `cat $file | wc -l` == `cat $file.new_pyxl | wc -l` ]] || (echo NOT OK ; wc -l $file{,.new_pyxl,.old_pyxl}) ; done normalize_pyxl_output.sh contains: sed -e '1,2{/coding.*pyxl/d}' | # strip coding line (decoded already) ~/pythontidy.py | sed -e '/^$/d' # strip empty lines Reviewers: prha
1 parent f22faa7 commit 9245e4f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1446
-206
lines changed

pyxl/codec/html_tokenizer.py

+416
Large diffs are not rendered by default.

pyxl/codec/parser.py

+209-188
Large diffs are not rendered by default.

pyxl/codec/pytokenize.py

+468
Large diffs are not rendered by default.

pyxl/codec/register.py

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import codecs, cStringIO, encodings
55
import sys
6+
import traceback
67
from encodings import utf_8
78
from pyxl.codec.tokenizer import pyxl_tokenize, pyxl_untokenize
89

@@ -11,6 +12,7 @@ def pyxl_transform(stream):
1112
output = pyxl_untokenize(pyxl_tokenize(stream.readline))
1213
except Exception, ex:
1314
print ex
15+
traceback.print_exc()
1416
raise
1517

1618
return output.rstrip()

pyxl/codec/tokenizer.py

+157-17
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,101 @@
11
#!/usr/bin/env python
22

3-
import tokenize
4-
from HTMLParser import HTMLParseError
3+
import pytokenize as tokenize
4+
import re
5+
from StringIO import StringIO
56
from pyxl.codec.parser import PyxlParser
7+
from pytokenize import Untokenizer
68

79
class PyxlParseError(Exception): pass
810

11+
def get_end_pos(start_pos, tvalue):
12+
row, col = start_pos
13+
for c in tvalue:
14+
if c == '\n':
15+
col = 0
16+
row += 1
17+
else:
18+
col += 1
19+
return (row, col)
20+
21+
class RewindableTokenStream(object):
22+
"""
23+
A token stream, with the ability to rewind and restart tokenization while maintaining correct
24+
token position information.
25+
26+
Invariants:
27+
- zero_row and zero_col are the correct values to adjust the line and possibly column of the
28+
tokens being produced by _tokens.
29+
- Tokens in unshift_buffer have locations with absolute position (relative to the beginning
30+
of the file, not relative to where we last restarted tokenization).
31+
"""
32+
33+
def __init__(self, readline):
34+
self.orig_readline = readline
35+
self.unshift_buffer = []
36+
self.rewound_buffer = None
37+
self._tokens = tokenize.generate_tokens(self._readline)
38+
self.zero_row, self.zero_col = (0, 0)
39+
self.stop_readline = False
40+
41+
def _dumpstate(self):
42+
print "tokenizer state:"
43+
print " zero:", (self.zero_row, self.zero_col)
44+
print " rewound_buffer:", self.rewound_buffer
45+
print " unshift_buffer:", self.unshift_buffer
46+
47+
def _readline(self):
48+
if self.stop_readline:
49+
return ""
50+
if self.rewound_buffer:
51+
line = self.rewound_buffer.readline()
52+
if line:
53+
return line
54+
else:
55+
self.rewound_buffer = None # fallthrough to orig_readline
56+
return self.orig_readline()
57+
58+
def _flush(self):
59+
self.stop_readline = True
60+
tokens = list(tok for tok in self)
61+
self.stop_readline = False
62+
return tokens
63+
64+
def _adjust_position(self, pos):
65+
row, col = pos
66+
if row == 0:
67+
col += self.zero_col
68+
row += self.zero_row
69+
return (row, col)
70+
71+
def rewind_and_retokenize(self, rewind_token):
72+
"""Rewind the given token (which is expected to be the last token read from this stream, or
73+
the end of such token); then restart tokenization."""
74+
ttype, tvalue, (row, col), tend, tline = rewind_token
75+
tokens = [rewind_token] + self._flush()
76+
self.zero_row, self.zero_col = (row - 1, col - 1)
77+
self.rewound_buffer = StringIO(Untokenizer().untokenize(tokens))
78+
self.unshift_buffer = []
79+
self._tokens = tokenize.generate_tokens(self._readline)
80+
81+
def next(self):
82+
if self.unshift_buffer:
83+
token = self.unshift_buffer.pop(0)
84+
else:
85+
ttype, tvalue, tstart, tend, tline = self._tokens.next()
86+
tstart = self._adjust_position(tstart)
87+
tend = self._adjust_position(tend)
88+
token = (ttype, tvalue, tstart, tend, tline)
89+
return token
90+
91+
def __iter__(self):
92+
return self
93+
94+
def unshift(self, token):
95+
"""Rewind the given token, without retokenizing. It will be the next token read from the
96+
stream."""
97+
self.unshift_buffer[:0] = [token]
98+
999
def pyxl_untokenize(tokens):
10100
parts = []
11101
prev_row = 1
@@ -33,10 +123,14 @@ def pyxl_untokenize(tokens):
33123
return ''.join(parts)
34124

35125
def pyxl_tokenize(readline):
126+
return transform_tokens(RewindableTokenStream(readline))
127+
128+
def transform_tokens(tokens):
36129
last_nw_token = None
37130
prev_token = None
38131

39-
tokens = tokenize.generate_tokens(readline)
132+
curly_depth = 0
133+
40134
while 1:
41135
try:
42136
token = tokens.next()
@@ -45,14 +139,25 @@ def pyxl_tokenize(readline):
45139

46140
ttype, tvalue, tstart, tend, tline = token
47141

48-
if (ttype == tokenize.OP and tvalue == '<' and last_nw_token and
49-
((last_nw_token[0] == tokenize.OP and last_nw_token[1] == '=') or
142+
if ttype == tokenize.OP and tvalue == '{':
143+
curly_depth += 1
144+
if ttype == tokenize.OP and tvalue == '}':
145+
curly_depth -= 1
146+
if curly_depth < 0:
147+
tokens.unshift(token)
148+
return
149+
150+
if (ttype == tokenize.OP and tvalue == '<' and
151+
(last_nw_token == None or # if we have *just* entered python mode e.g
152+
(last_nw_token[0] == tokenize.OP and last_nw_token[1] == '=') or
50153
(last_nw_token[0] == tokenize.OP and last_nw_token[1] == '(') or
51154
(last_nw_token[0] == tokenize.OP and last_nw_token[1] == '[') or
52155
(last_nw_token[0] == tokenize.OP and last_nw_token[1] == '{') or
53156
(last_nw_token[0] == tokenize.OP and last_nw_token[1] == ',') or
54157
(last_nw_token[0] == tokenize.OP and last_nw_token[1] == ':') or
55158
(last_nw_token[0] == tokenize.NAME and last_nw_token[1] == 'print') or
159+
(last_nw_token[0] == tokenize.NAME and last_nw_token[1] == 'else') or
160+
(last_nw_token[0] == tokenize.NAME and last_nw_token[1] == 'yield') or
56161
(last_nw_token[0] == tokenize.NAME and last_nw_token[1] == 'return'))):
57162
token = get_pyxl_token(token, tokens)
58163

@@ -97,26 +202,61 @@ def pyxl_tokenize(readline):
97202

98203
def get_pyxl_token(start_token, tokens):
99204
ttype, tvalue, tstart, tend, tline = start_token
100-
pyxl_parser = PyxlParser(tstart[0], tstart[1], tline)
205+
pyxl_parser = PyxlParser(tstart[0], tstart[1])
101206
pyxl_parser.feed(start_token)
102207

103208
for token in tokens:
104209
ttype, tvalue, tstart, tend, tline = token
105210

106-
try:
107-
pyxl_parser.feed(token)
108-
except HTMLParseError, html_ex:
109-
msg = 'HTMLParseError: %s (line:%d: %s)' % (html_ex.msg, tstart[0], tline.strip())
110-
raise PyxlParseError(msg)
111-
except AssertionError, assert_ex:
112-
msg = '%s (line:%d: %s)' % (assert_ex, tstart[0], tline.strip())
113-
raise PyxlParseError(msg)
211+
if tvalue and tvalue[0] == '{':
212+
if pyxl_parser.python_mode_allowed():
213+
mid, right = tvalue[0], tvalue[1:]
214+
division = get_end_pos(tstart, mid)
215+
pyxl_parser.feed_position_only((ttype, mid, tstart, division, tline))
216+
tokens.rewind_and_retokenize((ttype, right, division, tend, tline))
217+
python_tokens = list(transform_tokens(tokens))
218+
219+
close_curly = tokens.next()
220+
ttype, tvalue, tstart, tend, tline = close_curly
221+
close_curly_sub = (ttype, '', tend, tend, tline)
222+
223+
pyxl_parser.feed_python(python_tokens + [close_curly_sub])
224+
continue
225+
# else fallthrough to pyxl_parser.feed(token)
226+
elif tvalue and ttype == tokenize.COMMENT:
227+
if not pyxl_parser.python_comment_allowed():
228+
tvalue, rest = tvalue[0], tvalue[1:]
229+
division = get_end_pos(tstart, tvalue)
230+
tokens.unshift((tokenize.ERRORTOKEN, rest, division, tend, tline))
231+
token = ttype, tvalue, tstart, division, tline
232+
# fallthrough to pyxl_parser.feed(token)
233+
else:
234+
pyxl_parser.feed_comment(token)
235+
continue
236+
elif tvalue and tvalue[0] == '#':
237+
# let the python tokenizer grab the whole comment token
238+
tokens.rewind_and_retokenize(token)
239+
continue
240+
else:
241+
sp = re.split('([#{])', tvalue, maxsplit=1)
242+
if len(sp) > 1:
243+
tvalue, mid, right = sp
244+
division = get_end_pos(tstart, tvalue)
245+
tokens.unshift((ttype, mid+right, division, tend, tline))
246+
token = ttype, tvalue, tstart, division, tline
247+
# fallthrough to pyxl_parser.feed(token)
248+
249+
pyxl_parser.feed(token)
114250

115251
if pyxl_parser.done(): break
116252

117253
if not pyxl_parser.done():
118-
lines = ['<%s> at (line:%d: %s)' % (tag, row, line.strip())
119-
for tag, row, line in pyxl_parser.openTags]
254+
lines = ['<%s> at (line:%d)' % (tag_info['tag'], tag_info['row'])
255+
for tag_info in pyxl_parser.open_tags]
120256
raise PyxlParseError('Unclosed Tags: %s' % ', '.join(lines))
121257

122-
return pyxl_parser.getToken()
258+
remainder = pyxl_parser.get_remainder()
259+
if remainder:
260+
tokens.rewind_and_retokenize(remainder)
261+
262+
return pyxl_parser.get_token()

pyxl/scripts/parse_file.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
from pyxl.codec.tokenizer import pyxl_tokenize, pyxl_untokenize
55

66
f = open(sys.argv[1], 'r')
7-
print pyxl_untokenize(pyxl_tokenize(f.readline))
7+
print pyxl_untokenize(pyxl_tokenize(f.readline)),

tests/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
import pyxl.codec.register

tests/test_attr_name_case.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<div cLaSs="foo"></div>) == '<div class="foo"></div>'

pyxl_tests.py tests/test_basic.py

File renamed without changes.

tests/test_curlies_in_attrs_1.py

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
# kannan thinks this should be different
5+
assert str(<frag><img src="{'foo'}" /></frag>) == """<img src="foo" />"""

tests/test_curlies_in_attrs_2.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag><img src="barbaz{'foo'}" /></frag>) == """<img src="barbazfoo" />"""

tests/test_curlies_in_strings_1.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag> '{'foobar'}' </frag>) == """ 'foobar' """

tests/test_curlies_in_strings_2.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag> "{' "foobar'} </frag>) == ''' " &quot;foobar '''

tests/test_curlies_in_strings_3.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag> "{' "foobar" '}" </frag>) == ''' " &quot;foobar&quot; " '''

tests/test_curlies_in_strings_4.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>"</frag>) + '{}' == '''"{}'''

tests/test_eof_1.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>'''</frag>) == """'''"""

tests/test_html_comments_1.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag><!-- comment here --></frag>) == ""

tests/test_html_comments_2.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag><!-- comment-here --></frag>) == ""

tests/test_nested_curlies.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>{'{text}'}</frag>) == """{text}"""

tests/test_python_comments_1.py

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>Im cool # lol
5+
</frag>) == """Im cool """

tests/test_python_comments_2.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<div style="background-color: #1f75cc;"></div>) == """<div style="background-color: #1f75cc;"></div>"""

tests/test_python_comments_3.py

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<div #style="display: none;"
5+
></div>) == "<div></div>"

tests/test_tags_in_curlies_1.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>{'<br />'}</frag>) == """&lt;br /&gt;"""

tests/test_tags_in_curlies_10.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>{<br /> if False else <div></div>}</frag>) == '''<div></div>'''

tests/test_tags_in_curlies_2.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>{'<img src="foo" />'}</frag>) == """&lt;img src=&quot;foo&quot; /&gt;"""

tests/test_tags_in_curlies_3.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>{'<div> foobar </div>'}</frag>) == """&lt;div&gt; foobar &lt;/div&gt;"""

tests/test_tags_in_curlies_4.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>{'<div class="foo"> foobar </div>'}</frag>) == """&lt;div class=&quot;foo&quot;&gt; foobar &lt;/div&gt;"""

tests/test_tags_in_curlies_5.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag> {'<img src="{cond}" />'} </frag>) == """ &lt;img src=&quot;{cond}&quot; /&gt; """

tests/test_tags_in_curlies_6.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag> {' "<br /> '} </frag>) == ''' &quot;&lt;br /&gt; '''

tests/test_tags_in_curlies_7.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag> {' "<br />" '} </frag>) == ''' &quot;&lt;br /&gt;&quot; '''

tests/test_tags_in_curlies_8.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>{<br />}</frag>) == '''<br />'''

tests/test_tags_in_curlies_9.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<frag>{<br /> if True else <div></div>}</frag>) == '''<br />'''

tests/test_whitespace_1.py

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<div class="{'blah'}">
5+
blah <a href="%(url)s">blah</a> blah.
6+
</div>) == """<div class="blah">blah <a href="%(url)s">blah</a> blah.</div>"""

tests/test_whitespace_10.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
def test():
4+
assert str(<div class="{'foo'} {'bar'}"></div>) == '<div class="foo bar"></div>'

tests/test_whitespace_11.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# coding: pyxl
2+
from pyxl import html
3+
4+
def test():
5+
# Presence of parentheses around html should not affect contents of tags. (In old pyxl,
6+
# this led to differences in whitespace handling.)
7+
assert str(get_frag1()) == str(get_frag2())
8+
9+
def get_frag1():
10+
return <frag>
11+
{'foo'}
12+
</frag>
13+
14+
def get_frag2():
15+
return (<frag>
16+
{'foo'}
17+
</frag>)

0 commit comments

Comments
 (0)