From 73baa3c9ae0c91a9ad3c731f0f6d4b91e0ad06af Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Tue, 24 Mar 2020 17:29:51 +0000 Subject: [PATCH] Fix `layout.parse_multivector` to not produce nonsense Now: * Supports unusual basis names * Gives error messages instead of producing zeros ```python >>> layout.parse_multivector("e3 + whoops") Traceback (most recent call last): File "", line 1 e3 + whoops ^ SyntaxError: Unexpected unrecognized token after sign ``` Before: ``` >>> layout.parse_multivector("2e+1^e3") (20.0^e3) >>> layout.parse_multivector("2.e+1^e3") (1.0^e3) # wat >>> layout.parse_multivector("2.e+1") 0 # wat >>> layout.parse_multivector("e1") 0 # wat ``` --- clifford/_layout.py | 39 +---------- clifford/_parser.py | 126 +++++++++++++++++++++++++++++++++++ clifford/test/test_parser.py | 71 ++++++++++++++++++++ docs/changelog.rst | 4 ++ 4 files changed, 204 insertions(+), 36 deletions(-) create mode 100644 clifford/_parser.py create mode 100644 clifford/test/test_parser.py diff --git a/clifford/_layout.py b/clifford/_layout.py index c8fa5ffd..7b0ce45c 100644 --- a/clifford/_layout.py +++ b/clifford/_layout.py @@ -1,4 +1,3 @@ -import re import functools from typing import List, Tuple, Optional, Dict, Container import warnings @@ -39,13 +38,6 @@ def __get__(self, obj, cls): return val -# The blade finding regex for parsing strings of mvs -_blade_pattern = re.compile(r""" - ((^|\s)-?\s?\d+(\.\d+)?)\s| - ((^|\+|-)\s?(\d+((e(\+|-))|\.)?(\d+)?)\^e\d+(\s|$)) -""", re.VERBOSE) - - @_numba_utils.njit def canonical_reordering_sign(bitmap_a, bitmap_b, metric): """ @@ -492,34 +484,9 @@ def __eq__(self, other): def parse_multivector(self, mv_string: str) -> MultiVector: """ Parses a multivector string into a MultiVector object """ - # Get the names of the canonical blades - blade_name_index_map = {name: index for index, name in enumerate(self.names)} - - # Clean up the input string a bit - cleaned_string = re.sub('[()]', '', mv_string) - - # Create a multivector - mv_out = MultiVector(self) - - # Apply the regex - for m in _blade_pattern.finditer(cleaned_string): - # Clean up the search result - cleaned_match = m.group(0) - - # Split on the '^' - stuff = cleaned_match.split('^') - - if len(stuff) == 2: - # Extract the value of the blade and the index of the blade - blade_val = float("".join(stuff[0].split())) - blade_index = blade_name_index_map[stuff[1].strip()] - mv_out[blade_index] = blade_val - elif len(stuff) == 1: - # Extract the value of the scalar - blade_val = float("".join(stuff[0].split())) - blade_index = 0 - mv_out[blade_index] = blade_val - return mv_out + # guarded import in case the parse become heavier weight + from ._parser import parse_multivector + return parse_multivector(self, mv_string) def _genTables(self): "Generate the multiplication tables." diff --git a/clifford/_parser.py b/clifford/_parser.py new file mode 100644 index 00000000..9064075b --- /dev/null +++ b/clifford/_parser.py @@ -0,0 +1,126 @@ +""" Functions for string parsing """ + +from typing import Tuple +import re +import warnings + +from ._multivector import MultiVector +from ._layout import Layout + +# from the python docs, https://docs.python.org/3/library/re.html#simulating-scanf +_unsigned_float_pattern = r"(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?" + + +def _match_line_offset(m: 're.Match') -> Tuple[int, int, str]: + """ Convert a match position into a lineno, offset, and line """ + pos = m.span()[0] + lines = m.string.split('\n') + for line_i, line in enumerate(lines, 1): + new_pos = pos - len(line) - 1 + if new_pos < 0: + return line_i, pos + 1, line + pos = new_pos + assert False # pragma: no cover + + +def _parse_error(m: 're.Match', msg: str) -> SyntaxError: + """ Produce a syntax error indicating the given match was invalid """ + return SyntaxError(msg, (None, *_match_line_offset(m))) + + +def _tokenize(layout: Layout, mv_string: str): + # Get the names of the canonical blades + blade_name_index_map = {name: index for index, name in enumerate(layout.names)} + + tokenizer = re.Scanner([( + r'\s+', + lambda s, t: ('space', s.match, None) + ), ( + r'\(', + lambda s, t: ('(', s.match, None) + ), ( + r'\)', + lambda s, t: (')', s.match, None) + ), ( + r'[+-]', + lambda s, t: ('sign', s.match, 1 if t == '+' else -1) + ), ( + _unsigned_float_pattern, + lambda s, t: ('coeff', s.match, float(t)) + ), ( + r'\^', + lambda s, t: ('wedge', s.match, None) + ), ( + r'\b(?:{})\b'.format('|'.join( + re.escape(name) + for name in layout.names + if name + )), + lambda s, t: ('blade', s.match, blade_name_index_map[t]) + ), ( + r'.', + lambda s, t: ('unrecognized', s.match, None) + )]) + + tokens, rest = tokenizer.scan(mv_string) + assert not rest # our unrecognized match should handle this + return tokens + [ + ('end', re.compile(r'$').match(mv_string, len(mv_string)), None) + ] + + +def parse_multivector(layout: Layout, mv_string: str) -> MultiVector: + # Create a multivector + mv_out = MultiVector(layout) + + # parser state + sign = None + coeff = None + last_t = None + + # TODO: this could do with a recursive parser to handle parens + for t, m, data in _tokenize(layout, mv_string): + if t == 'space': + continue # don't update the tokenizer + elif t in '()': + # Not implemented, old behavior was to just strip these - do the same + warnings.warn( + "Parentheses are not parsed, behavior may be surprising", + stacklevel=3) + continue + elif t == 'sign' and last_t is None: + sign = data + elif t == 'sign' and last_t == 'blade': + sign = data + elif t == 'sign' and last_t == 'sign': + sign *= data + elif t == 'sign' and last_t == 'coeff': + # last coeff was a scalar + mv_out[()] += coeff + sign = data + + elif t == 'coeff' and last_t == 'sign': + coeff = sign * data + elif t == 'coeff' and last_t is None: + coeff = data + + elif t == 'blade' and last_t == 'wedge': + mv_out[data] += coeff + elif t == 'blade' and last_t == 'sign': + mv_out[data] += sign + elif t == 'blade' and last_t is None: + mv_out[data] += 1 + + elif t == 'wedge' and last_t == 'coeff': + pass + + elif t == 'end' and last_t == 'coeff': + mv_out[()] += coeff + elif t == 'end' and last_t == 'blade': + pass + + else: + raise _parse_error(m, "Unexpected {} token after {}".format(t, last_t)) + last_t = t + + return mv_out diff --git a/clifford/test/test_parser.py b/clifford/test/test_parser.py new file mode 100644 index 00000000..54e541b0 --- /dev/null +++ b/clifford/test/test_parser.py @@ -0,0 +1,71 @@ +import pytest + +from clifford import Layout + + +@pytest.fixture(scope='module') +def g3(): + return Layout([1, 1, 1]) + + +class TestParser: + def test_valid(self, g3): + e1, e2, e3 = g3.basis_vectors_lst + assert g3.parse_multivector("e1") == e1 + assert g3.parse_multivector("-e1") == -e1 + assert g3.parse_multivector("--e1") == e1 + assert g3.parse_multivector("1.0e2 ^ e1") == 1.0e2 * e1 + assert g3.parse_multivector("1.0E2 ^ e1") == 1.0E2 * e1 + assert g3.parse_multivector("-1 ^ e1") == -e1 + assert g3.parse_multivector("1") == 1 + assert g3.parse_multivector("1 - e12") == 1 - (e1^e2) + assert g3.parse_multivector("e12 + 1") == (e1^e2) + 1 + assert g3.parse_multivector("1 +\n 2") == 3 + + def test_parentheses(self, g3): + """ parsing ignores these for now """ + e1, e2, e3 = g3.basis_vectors_lst + with pytest.warns(UserWarning): + assert g3.parse_multivector("(1^e1) + (1^e2)") == e1 + e2 + + with pytest.warns(UserWarning): + assert g3.parse_multivector("1^(e1 + 1)^e2") == e1 + e2 + + @pytest.mark.parametrize(["s", "err_pos"], [( + "+", + " ^", + ), ( + "", + "^", + ), ( + "e2 + e1^", + " ^", + ), ( + "e1+\n+", + " \n ^", + ), ( + "1 e1", + " ^", + ), ( + "1 + e1 e2", + " ^", + )]) + def test_invalid(self, g3, s, err_pos): + """ Test that when invalid strings are passed, the invalid character is shown """ + with pytest.raises(SyntaxError) as exc_info: + g3.parse_multivector(s) + exc = exc_info.value + + # remove trailing spaces on the expected marker + expected_pos = "\n".join(l.rstrip() for l in err_pos.split("\n")) + actual_pos = (exc.lineno - 1) * "\n" + (exc.offset - 1) * " " + '^' + assert expected_pos == actual_pos + + assert '\n' not in exc.text + + def test_unusual_names(self): + g = Layout([1, 1], names=['', 'x', 'y', 'I']) + x, y = g.basis_vectors_lst + I = x ^ y + assert g.parse_multivector('2 + I') == 2 + I + assert g.parse_multivector('2^x + 3^y') == 2*x + 3*y diff --git a/docs/changelog.rst b/docs/changelog.rst index 7da11288..198bb3bd 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -26,6 +26,10 @@ Bugs fixed as ``mv[e1]``. * :func:`~clifford.operator.ip` (the inner product) no longer performs the outer product. + * :func:`Layout.parse_multivector` now throws :exc:`SyntaxError` on invalid + input, rather than silenltly producing nonsense. + * :func:`Layout.parse_multivector` supports basis vector names which do not + start with e. Compatibility notes -------------------