parser rewrite for correctness

jboning · jboning · commit 9245e4f41b11 · 2014-03-28T17:26:53.000-07:00
Summary:
 - added tests
 - added wrapper around the python tokenizer to manage rewinding and retokenizing in the middle of the stream
 - switched to a modified version of python's tokenize module that's a little more forgiving about EOF and unexpected dedents
 - wrote a dumb HTML parser from the spec, with additional support for shoving python tokens inside attributes
 - watch for '{' as we're feeding tokens to the HTML parser; when we see one, recurse, recurse!

Note that this change is BACKWARDS INCOMPATIBLE. Primarily, we no longer
support \ as an escape character. There are other changes to edge cases
involving whitespace.

Test Plan:
Greatly expanded test suite

Compared with existing behaviour on the dropbox codebase. There are some differences, but I'm pretty sure they are all features, not bugs.

      ack --python "#\s*coding\s*[=:]\s*pyxl\s*$" -l > ~/pyxl_files.txt
      # check out old pyxl
      for file in `cat ~/pyxl_files.txt` ; do echo $file ; genfile=$file.old_pyxl ; parse_file.py $file > $genfile ; ~/normalize_pyxl_output.sh < $genfile > $genfile.norm ; done
      # check out new pyxl
      for file in `cat ~/pyxl_files.txt` ; do echo $file ; genfile=$file.new_pyxl ; parse_file.py $file > $genfile ; ~/normalize_pyxl_output.sh < $genfile > $genfile.norm ; done

      for file in `cat ~/pyxl_files.txt` ; do diff -u -b $file.old_pyxl.norm $file.new_pyxl.norm ; done > ~/pyxl.diff

      for file in `cat ~/pyxl_files.txt` ; do [[ `cat $file | wc -l` == `cat $file.new_pyxl | wc -l` ]] || (echo NOT OK ; wc -l $file{,.new_pyxl,.old_pyxl}) ; done

normalize_pyxl_output.sh contains:

      sed -e '1,2{/coding.*pyxl/d}' |          # strip coding line (decoded already)
      ~/pythontidy.py |
      sed -e '/^$/d'                           # strip empty lines

Reviewers: prha
diff --git a/pyxl/codec/html_tokenizer.py b/pyxl/codec/html_tokenizer.py
diff --git a/pyxl/codec/parser.py b/pyxl/codec/parser.py
diff --git a/pyxl/codec/pytokenize.py b/pyxl/codec/pytokenize.py
diff --git a/pyxl/codec/register.py b/pyxl/codec/register.py
@@ -3,6 +3,7 @@
 
 import codecs, cStringIO, encodings
 import sys
+import traceback
 from encodings import utf_8
 from pyxl.codec.tokenizer import pyxl_tokenize, pyxl_untokenize
 
@@ -11,6 +12,7 @@ def pyxl_transform(stream):
         output = pyxl_untokenize(pyxl_tokenize(stream.readline))
     except Exception, ex:
         print ex
+        traceback.print_exc()
         raise
 
     return output.rstrip()
diff --git a/pyxl/codec/tokenizer.py b/pyxl/codec/tokenizer.py
@@ -1,11 +1,101 @@
 #!/usr/bin/env python
 
-import tokenize
-from HTMLParser import HTMLParseError
+import pytokenize as tokenize
+import re
+from StringIO import StringIO
 from pyxl.codec.parser import PyxlParser
+from pytokenize import Untokenizer
 
 class PyxlParseError(Exception): pass
 
+def get_end_pos(start_pos, tvalue):
+    row, col = start_pos
+    for c in tvalue:
+        if c == '\n':
+            col = 0
+            row += 1
+        else:
+            col += 1
+    return (row, col)
+
+class RewindableTokenStream(object):
+    """
+    A token stream, with the ability to rewind and restart tokenization while maintaining correct
+    token position information.
+
+    Invariants:
+        - zero_row and zero_col are the correct values to adjust the line and possibly column of the
+        tokens being produced by _tokens.
+        - Tokens in unshift_buffer have locations with absolute position (relative to the beginning
+          of the file, not relative to where we last restarted tokenization).
+    """
+
+    def __init__(self, readline):
+        self.orig_readline = readline
+        self.unshift_buffer = []
+        self.rewound_buffer = None
+        self._tokens = tokenize.generate_tokens(self._readline)
+        self.zero_row, self.zero_col = (0, 0)
+        self.stop_readline = False
+
+    def _dumpstate(self):
+        print "tokenizer state:"
+        print "  zero:", (self.zero_row, self.zero_col)
+        print "  rewound_buffer:", self.rewound_buffer
+        print "  unshift_buffer:", self.unshift_buffer
+
+    def _readline(self):
+        if self.stop_readline:
+            return ""
+        if self.rewound_buffer:
+            line = self.rewound_buffer.readline()
+            if line:
+                return line
+            else:
+                self.rewound_buffer = None  # fallthrough to orig_readline
+        return self.orig_readline()
+
+    def _flush(self):
+        self.stop_readline = True
+        tokens = list(tok for tok in self)
+        self.stop_readline = False
+        return tokens
+
+    def _adjust_position(self, pos):
+        row, col = pos
+        if row == 0:
+            col += self.zero_col
+        row += self.zero_row
+        return (row, col)
+
+    def rewind_and_retokenize(self, rewind_token):
+        """Rewind the given token (which is expected to be the last token read from this stream, or
+        the end of such token); then restart tokenization."""
+        ttype, tvalue, (row, col), tend, tline = rewind_token
+        tokens = [rewind_token] + self._flush()
+        self.zero_row, self.zero_col = (row - 1, col - 1)
+        self.rewound_buffer = StringIO(Untokenizer().untokenize(tokens))
+        self.unshift_buffer = []
+        self._tokens = tokenize.generate_tokens(self._readline)
+
+    def next(self):
+        if self.unshift_buffer:
+            token = self.unshift_buffer.pop(0)
+        else:
+            ttype, tvalue, tstart, tend, tline = self._tokens.next()
+            tstart = self._adjust_position(tstart)
+            tend = self._adjust_position(tend)
+            token = (ttype, tvalue, tstart, tend, tline)
+        return token
+
+    def __iter__(self):
+        return self
+
+    def unshift(self, token):
+        """Rewind the given token, without retokenizing. It will be the next token read from the
+        stream."""
+        self.unshift_buffer[:0] = [token]
+
 def pyxl_untokenize(tokens):
     parts = []
     prev_row = 1
@@ -33,10 +123,14 @@ def pyxl_untokenize(tokens):
     return ''.join(parts)
 
 def pyxl_tokenize(readline):
+    return transform_tokens(RewindableTokenStream(readline))
+
+def transform_tokens(tokens):
     last_nw_token = None
     prev_token = None
 
-    tokens = tokenize.generate_tokens(readline)
+    curly_depth = 0
+
     while 1:
         try:
             token = tokens.next()
@@ -45,14 +139,25 @@ def pyxl_tokenize(readline):
 
         ttype, tvalue, tstart, tend, tline = token
 
-        if (ttype == tokenize.OP and tvalue == '<' and last_nw_token and
-            ((last_nw_token[0] == tokenize.OP and last_nw_token[1] == '=') or
+        if ttype == tokenize.OP and tvalue == '{':
+            curly_depth += 1
+        if ttype == tokenize.OP and tvalue == '}':
+            curly_depth -= 1
+            if curly_depth < 0:
+                tokens.unshift(token)
+                return
+
+        if (ttype == tokenize.OP and tvalue == '<' and
+            (last_nw_token == None or # if we have *just* entered python mode e.g
+             (last_nw_token[0] == tokenize.OP and last_nw_token[1] == '=') or
              (last_nw_token[0] == tokenize.OP and last_nw_token[1] == '(') or
              (last_nw_token[0] == tokenize.OP and last_nw_token[1] == '[') or
              (last_nw_token[0] == tokenize.OP and last_nw_token[1] == '{') or
              (last_nw_token[0] == tokenize.OP and last_nw_token[1] == ',') or
              (last_nw_token[0] == tokenize.OP and last_nw_token[1] == ':') or
              (last_nw_token[0] == tokenize.NAME and last_nw_token[1] == 'print') or
+             (last_nw_token[0] == tokenize.NAME and last_nw_token[1] == 'else') or
+             (last_nw_token[0] == tokenize.NAME and last_nw_token[1] == 'yield') or
              (last_nw_token[0] == tokenize.NAME and last_nw_token[1] == 'return'))):
             token = get_pyxl_token(token, tokens)
 
@@ -97,26 +202,61 @@ def pyxl_tokenize(readline):
 
 def get_pyxl_token(start_token, tokens):
     ttype, tvalue, tstart, tend, tline = start_token
-    pyxl_parser = PyxlParser(tstart[0], tstart[1], tline)
+    pyxl_parser = PyxlParser(tstart[0], tstart[1])
     pyxl_parser.feed(start_token)
 
     for token in tokens:
         ttype, tvalue, tstart, tend, tline = token
 
-        try:
-            pyxl_parser.feed(token)
-        except HTMLParseError, html_ex:
-            msg = 'HTMLParseError: %s (line:%d: %s)' % (html_ex.msg, tstart[0], tline.strip())
-            raise PyxlParseError(msg)
-        except AssertionError, assert_ex:
-            msg = '%s (line:%d: %s)' % (assert_ex, tstart[0], tline.strip())
-            raise PyxlParseError(msg)
+        if tvalue and tvalue[0] == '{':
+            if pyxl_parser.python_mode_allowed():
+                mid, right = tvalue[0], tvalue[1:]
+                division = get_end_pos(tstart, mid)
+                pyxl_parser.feed_position_only((ttype, mid, tstart, division, tline))
+                tokens.rewind_and_retokenize((ttype, right, division, tend, tline))
+                python_tokens = list(transform_tokens(tokens))
+
+                close_curly = tokens.next()
+                ttype, tvalue, tstart, tend, tline = close_curly
+                close_curly_sub = (ttype, '', tend, tend, tline)
+
+                pyxl_parser.feed_python(python_tokens + [close_curly_sub])
+                continue
+            # else fallthrough to pyxl_parser.feed(token)
+        elif tvalue and ttype == tokenize.COMMENT:
+            if not pyxl_parser.python_comment_allowed():
+                tvalue, rest = tvalue[0], tvalue[1:]
+                division = get_end_pos(tstart, tvalue)
+                tokens.unshift((tokenize.ERRORTOKEN, rest, division, tend, tline))
+                token = ttype, tvalue, tstart, division, tline
+                # fallthrough to pyxl_parser.feed(token)
+            else:
+                pyxl_parser.feed_comment(token)
+                continue
+        elif tvalue and tvalue[0] == '#':
+            # let the python tokenizer grab the whole comment token
+            tokens.rewind_and_retokenize(token)
+            continue
+        else:
+            sp = re.split('([#{])', tvalue, maxsplit=1)
+            if len(sp) > 1:
+                tvalue, mid, right = sp
+                division = get_end_pos(tstart, tvalue)
+                tokens.unshift((ttype, mid+right, division, tend, tline))
+                token = ttype, tvalue, tstart, division, tline
+                # fallthrough to pyxl_parser.feed(token)
+
+        pyxl_parser.feed(token)
 
         if pyxl_parser.done(): break
 
     if not pyxl_parser.done():
-        lines = ['<%s> at (line:%d: %s)' % (tag, row, line.strip())
-                 for tag, row, line in pyxl_parser.openTags]
+        lines = ['<%s> at (line:%d)' % (tag_info['tag'], tag_info['row'])
+                 for tag_info in pyxl_parser.open_tags]
         raise PyxlParseError('Unclosed Tags: %s' % ', '.join(lines))
 
-    return pyxl_parser.getToken()
+    remainder = pyxl_parser.get_remainder()
+    if remainder:
+        tokens.rewind_and_retokenize(remainder)
+
+    return pyxl_parser.get_token()
diff --git a/pyxl/scripts/parse_file.py b/pyxl/scripts/parse_file.py
@@ -4,4 +4,4 @@
 from pyxl.codec.tokenizer import pyxl_tokenize, pyxl_untokenize
 
 f = open(sys.argv[1], 'r')
-print pyxl_untokenize(pyxl_tokenize(f.readline))
+print pyxl_untokenize(pyxl_tokenize(f.readline)),
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -0,0 +1 @@
+import pyxl.codec.register
diff --git a/tests/test_attr_name_case.py b/tests/test_attr_name_case.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<div cLaSs="foo"></div>) == '<div class="foo"></div>'
diff --git a/tests/test_basic.py b/tests/test_basic.py
diff --git a/tests/test_curlies_in_attrs_1.py b/tests/test_curlies_in_attrs_1.py
@@ -0,0 +1,5 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    # kannan thinks this should be different
+    assert str(<frag><img src="{'foo'}" /></frag>) == """<img src="foo" />"""
diff --git a/tests/test_curlies_in_attrs_2.py b/tests/test_curlies_in_attrs_2.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag><img src="barbaz{'foo'}" /></frag>) == """<img src="barbazfoo" />"""
diff --git a/tests/test_curlies_in_strings_1.py b/tests/test_curlies_in_strings_1.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag> '{'foobar'}' </frag>) == """ 'foobar' """
diff --git a/tests/test_curlies_in_strings_2.py b/tests/test_curlies_in_strings_2.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag> "{' "foobar'} </frag>) == ''' " &quot;foobar '''
diff --git a/tests/test_curlies_in_strings_3.py b/tests/test_curlies_in_strings_3.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag> "{' "foobar" '}" </frag>) == ''' " &quot;foobar&quot; " '''
diff --git a/tests/test_curlies_in_strings_4.py b/tests/test_curlies_in_strings_4.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>"</frag>) + '{}' == '''"{}'''
diff --git a/tests/test_eof_1.py b/tests/test_eof_1.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>'''</frag>) == """'''"""
diff --git a/tests/test_html_comments_1.py b/tests/test_html_comments_1.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag><!-- comment here --></frag>) == ""
diff --git a/tests/test_html_comments_2.py b/tests/test_html_comments_2.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag><!-- comment-here --></frag>) == ""
diff --git a/tests/test_nested_curlies.py b/tests/test_nested_curlies.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>{'{text}'}</frag>) == """{text}"""
diff --git a/tests/test_python_comments_1.py b/tests/test_python_comments_1.py
@@ -0,0 +1,5 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>Im cool # lol
+</frag>) == """Im cool """
diff --git a/tests/test_python_comments_2.py b/tests/test_python_comments_2.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<div style="background-color: #1f75cc;"></div>) == """<div style="background-color: #1f75cc;"></div>"""
diff --git a/tests/test_python_comments_3.py b/tests/test_python_comments_3.py
@@ -0,0 +1,5 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<div #style="display: none;"
+               ></div>) == "<div></div>"
diff --git a/tests/test_tags_in_curlies_1.py b/tests/test_tags_in_curlies_1.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>{'<br />'}</frag>) == """&lt;br /&gt;"""
diff --git a/tests/test_tags_in_curlies_10.py b/tests/test_tags_in_curlies_10.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>{<br /> if False else <div></div>}</frag>) == '''<div></div>'''
diff --git a/tests/test_tags_in_curlies_2.py b/tests/test_tags_in_curlies_2.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>{'<img src="foo" />'}</frag>) == """&lt;img src=&quot;foo&quot; /&gt;"""
diff --git a/tests/test_tags_in_curlies_3.py b/tests/test_tags_in_curlies_3.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>{'<div> foobar </div>'}</frag>) == """&lt;div&gt; foobar &lt;/div&gt;"""
diff --git a/tests/test_tags_in_curlies_4.py b/tests/test_tags_in_curlies_4.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>{'<div class="foo"> foobar </div>'}</frag>) == """&lt;div class=&quot;foo&quot;&gt; foobar &lt;/div&gt;"""
diff --git a/tests/test_tags_in_curlies_5.py b/tests/test_tags_in_curlies_5.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag> {'<img src="{cond}" />'} </frag>) == """ &lt;img src=&quot;{cond}&quot; /&gt; """
diff --git a/tests/test_tags_in_curlies_6.py b/tests/test_tags_in_curlies_6.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag> {' "<br /> '} </frag>) == '''  &quot;&lt;br /&gt;  '''
diff --git a/tests/test_tags_in_curlies_7.py b/tests/test_tags_in_curlies_7.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag> {' "<br />" '} </frag>) == '''  &quot;&lt;br /&gt;&quot;  '''
diff --git a/tests/test_tags_in_curlies_8.py b/tests/test_tags_in_curlies_8.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>{<br />}</frag>) == '''<br />'''
diff --git a/tests/test_tags_in_curlies_9.py b/tests/test_tags_in_curlies_9.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<frag>{<br /> if True else <div></div>}</frag>) == '''<br />'''
diff --git a/tests/test_whitespace_1.py b/tests/test_whitespace_1.py
@@ -0,0 +1,6 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<div class="{'blah'}">
+                   blah <a href="%(url)s">blah</a> blah.
+               </div>) == """<div class="blah">blah <a href="%(url)s">blah</a> blah.</div>"""
diff --git a/tests/test_whitespace_10.py b/tests/test_whitespace_10.py
@@ -0,0 +1,4 @@
+# coding: pyxl
+from pyxl import html
+def test():
+    assert str(<div class="{'foo'} {'bar'}"></div>) == '<div class="foo bar"></div>'
diff --git a/tests/test_whitespace_11.py b/tests/test_whitespace_11.py
@@ -0,0 +1,17 @@
+# coding: pyxl
+from pyxl import html
+
+def test():
+    # Presence of parentheses around html should not affect contents of tags. (In old pyxl,
+    # this led to differences in whitespace handling.)
+    assert str(get_frag1()) == str(get_frag2())
+
+def get_frag1():
+    return <frag>
+        {'foo'}
+    </frag>
+
+def get_frag2():
+    return (<frag>
+        {'foo'}
+    </frag>)
diff --git a/tests/test_whitespace_12.py b/tests/test_whitespace_12.py
diff --git a/tests/test_whitespace_2.py b/tests/test_whitespace_2.py
diff --git a/tests/test_whitespace_3.py b/tests/test_whitespace_3.py
diff --git a/tests/test_whitespace_4.py b/tests/test_whitespace_4.py
diff --git a/tests/test_whitespace_5.py b/tests/test_whitespace_5.py
diff --git a/tests/test_whitespace_6.py b/tests/test_whitespace_6.py
diff --git a/tests/test_whitespace_7.py b/tests/test_whitespace_7.py
diff --git a/tests/test_whitespace_8.py b/tests/test_whitespace_8.py
diff --git a/tests/test_whitespace_9.py b/tests/test_whitespace_9.py