diff --git a/examples/advanced/scan_wikitext.py b/examples/advanced/scan_wikitext.py new file mode 100644 index 000000000..c94e6d7bb --- /dev/null +++ b/examples/advanced/scan_wikitext.py @@ -0,0 +1,42 @@ +""" +Showcases how to use `Lark.scan` to select a pattern from a larger text without having to parse all of it. + +Uses `requests` to fetch the current wikitext from `Python (Programming Language)` and uses a simple grammar +to extract all wikitext templates used in the page. + +""" + +from collections import Counter +from pprint import pprint + +import lark +import requests + +page_name = "Python_(programming_language)" +url = f"https://en.wikipedia.org/wiki/{page_name}?action=raw" + +wikitext = requests.get(url).text + +grammar = r""" +template: "{{" TEXT ("|" argument)* "}}" +text: (TEXT|template)+ +argument: /\w+(?==)/ "=" text -> named_argument + | text -> numbered_argument + +TEXT: / (?:[^{}|] + | \{(?!\{) + | \}(?!\}) + )+/x +""" +parser = lark.Lark(grammar, parser='lalr', start='template') +used_templates = Counter() +inner_templates = 0 +for (start, end), res in parser.scan(wikitext): + for temp in res.find_data('template'): + if temp != res: + inner_templates += 1 + used_templates[temp.children[0].value] += 1 + +pprint(used_templates) +print("Total templates used:", used_templates.total()) +print("Number of templates nested inside others:", inner_templates) diff --git a/lark/__init__.py b/lark/__init__.py index a13c7b3be..e819f99de 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -9,7 +9,7 @@ UnexpectedToken, ) from .lark import Lark -from .lexer import Token +from .lexer import Token, TextSlice from .tree import ParseTree, Tree from .utils import logger from .visitors import Discard, Transformer, Transformer_NonRecursive, Visitor, v_args @@ -27,6 +27,7 @@ "UnexpectedToken", "Lark", "Token", + "TextSlice", "ParseTree", "Tree", "logger", diff --git a/lark/lark.py b/lark/lark.py index 6d34aa620..85634c7d1 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -16,7 +16,7 @@ from typing import Literal else: from typing_extensions import Literal - from .parser_frontends import ParsingFrontend + from .parser_frontends import ParsingFrontend, ScanMatch from .exceptions import ConfigurationError, assert_config, UnexpectedInput from .utils import Serialize, SerializeMemoizer, FS, isascii, logger @@ -24,7 +24,7 @@ from .tree import Tree from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType -from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token +from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token, TextSlice from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend from .grammar import Rule @@ -600,8 +600,7 @@ def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_pat def __repr__(self): return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer) - - def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]: + def lex(self, text: Union[str, 'TextSlice'], dont_ignore: bool = False) -> Iterator[Token]: """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. 
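The example above exercises the new `Lark.scan` API end to end. Here is a smaller, self-contained sketch of the same call pattern (the grammar is illustrative, not taken from the diff; the expected output assumes the half-open ranges used by the tests added later in this diff):

```python
from lark import Lark

# scan() only works with the LALR parser; matches are yielded as
# ScanMatch((start, end), tree) with a half-open [start, end) range.
parser = Lark(r"""
    pair: "(" WORD ")"
    WORD: /\w+/
""", parser='lalr', start='pair')

text = "a (b) c (d) e"
for (start, end), tree in parser.scan(text):
    print((start, end), text[start:end], tree.children[0])
# Expected:
# (2, 5) (b) b
# (8, 11) (d) d
```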
@@ -613,6 +612,7 @@ def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
             lexer = self._build_lexer(dont_ignore)
         else:
             lexer = self.lexer
+        text = TextSlice.from_text(text)
         lexer_thread = LexerThread.from_text(lexer, text)
         stream = lexer_thread.lex(None)
         if self.options.postlex:
@@ -623,21 +623,25 @@ def get_terminal(self, name: str) -> TerminalDef:
         """Get information about a terminal"""
         return self._terminals_dict[name]
 
-    def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
+    def parse_interactive(self, text: Optional[str] = None, start: Optional[str] = None,
+                          *, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> 'InteractiveParser':
         """Start an interactive parsing session.
 
         Parameters:
             text (str, optional): Text to be parsed. Required for ``resume_parse()``.
             start (str, optional): Start symbol
+            start_pos (int, optional): Position at which the parser starts. Defaults to 0.
+            end_pos (int, optional): Position at which the parser stops. Defaults to len(text).
 
         Returns:
             A new InteractiveParser instance.
 
         See Also: ``Lark.parse()``
         """
-        return self.parser.parse_interactive(text, start=start)
+        return self.parser.parse_interactive(text, start=start, start_pos=start_pos, end_pos=end_pos)
 
-    def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
+    def parse(self, text: Union[str, 'TextSlice'], start: Optional[str] = None,
+              on_error: 'Optional[Callable[[UnexpectedInput], bool]]' = None) -> 'ParseTree':
         """Parse the given text, according to the options provided.
 
         Parameters:
@@ -645,6 +649,13 @@ def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callab
             start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
             on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
                 LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
+            start_pos (int, optional): Position at which the parser starts, given via a ``TextSlice``. Defaults to 0.
+            end_pos (int, optional): Position at which the parser stops, given via a ``TextSlice``. Defaults to len(text).
+                Neither of these works with lexer='dynamic'/'dynamic_complete'.
+                Their behavior mirrors that of the corresponding parameters in the standard library re module,
+                which most notably means that lookbehinds in a regex may look behind start_pos, but lookaheads
+                won't look past end_pos. See [re.search](https://docs.python.org/3/library/re.html#re.Pattern.search)
+                for more information.
 
         Returns:
             If a transformer is supplied to ``__init__``, returns whatever is the
@@ -657,5 +668,27 @@ def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callab
         """
         return self.parser.parse(text, start=start, on_error=on_error)
 
+    def scan(self, text: Union[str, TextSlice], start: Optional[str] = None) -> Iterable['ScanMatch']:
+        """
+        Scans the input text for non-overlapping matches of this grammar.
+
+        Only works with parser='lalr'. Works best if the first terminal(s)
+        that can be matched by the grammar are unique in the text and always indicate the start of a match.
+
+        A found match will never start or end with an ignored terminal.
+
+        Does not raise any exceptions except for invalid arguments/configurations.
+
+        Parameters:
+            text (str or TextSlice): Text to be scanned. A ``TextSlice`` restricts scanning to a range of the text.
+            start (str, optional): Start symbol
+
+        Returns:
+            An iterable of ``ScanMatch`` instances, each with two attributes: ``range``, a tuple with the
+            start and end indices of the match, and ``tree``, the parsed ``Tree`` object.
+
+        See Also: ``Lark.parse()``
+        """
+        return self.parser.scan(text, start=start)
+
 ###}
diff --git a/lark/lexer.py b/lark/lexer.py
index 9061d6001..b8333276a 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -1,11 +1,12 @@
 # Lexer Implementation
-
+import sys
 from abc import abstractmethod, ABC
 import re
 from contextlib import suppress
+from dataclasses import dataclass
 from typing import (
     TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
-    ClassVar, TYPE_CHECKING, overload
+    ClassVar, TYPE_CHECKING, overload, Tuple, AnyStr, Generic, Union
 )
 from types import ModuleType
 import warnings
@@ -137,6 +138,33 @@ def user_repr(self) -> str:
         else:
             return self.name
 
+
+@dataclass(frozen=True)
+class TextSlice(Generic[AnyStr]):
+    text: AnyStr
+    start: int
+    end: int
+
+    def __post_init__(self):
+        if self.start < 0:
+            object.__setattr__(self, 'start', self.start + len(self.text))
+        if self.end < 0:
+            object.__setattr__(self, 'end', self.end + len(self.text))
+
+    @classmethod
+    def from_text(cls, text: Union[AnyStr, 'TextSlice[AnyStr]']) -> 'TextSlice[AnyStr]':
+        if isinstance(text, TextSlice):
+            return text
+        else:
+            return cls(text, 0, len(text))
+
+    def is_complete_text(self):
+        return self.start == 0 and self.end == len(self.text)
+
+    def start_from(self, pos: int):
+        return TextSlice(self.text, pos, self.end)
+
+
 _T = TypeVar('_T', bound="Token")
 
 class Token(str):
@@ -254,13 +282,17 @@ def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') ->
         return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)
 
     def __reduce__(self):
-        return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))
+        return (self.__class__, (self.type, self.value,
+                                 self.start_pos, self.line, self.column,
+                                 self.end_line, self.end_column, self.end_pos))
 
     def __repr__(self):
         return 'Token(%r, %r)' % (self.type, self.value)
 
     def __deepcopy__(self, memo):
-        return Token(self.type, self.value, self.start_pos, self.line, self.column)
+        return Token(self.type, self.value,
+                     self.start_pos, self.line, self.column,
+                     self.end_line, self.end_column, self.end_pos)
 
     def __eq__(self, other):
         if isinstance(other, Token) and self.type != other.type:
@@ -282,14 +314,15 @@ def __init__(self, newline_char):
         self.line = 1
         self.column = 1
         self.line_start_pos = 0
-
+    def __repr__(self):
+        return f"<LineCounter char_pos={self.char_pos} line={self.line} column={self.column}>"
     def __eq__(self, other):
         if not isinstance(other, LineCounter):
             return NotImplemented
 
         return self.char_pos == other.char_pos and self.newline_char == other.newline_char
 
-    def feed(self, token: Token, test_newline=True):
+    def feed(self, token: str, test_newline=True):
         """Consume a token and calculate the new line & column.
 
         As an optional optimization, set test_newline=False if token doesn't contain a newline.
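A quick sketch of how the `TextSlice` introduced above is meant to be used, based on its definition and on the tests added further down in this diff (the grammar here is illustrative):

```python
from lark import Lark, TextSlice

# Negative bounds are normalized in __post_init__, mirroring Python slicing.
s = TextSlice("xx hello world xx", 2, -2)
assert (s.start, s.end) == (2, 15)
assert TextSlice.from_text("abc") == TextSlice("abc", 0, 3)

parser = Lark(r"""
    start: WORD+
    WORD: /\w+/
    %ignore /\s+/
""", parser='lalr')

# Only the characters inside the slice are lexed, but line/column/start_pos
# information still refers to positions in the full underlying string.
tree = parser.parse(TextSlice("xx hello world xx", 2, -2))
```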
@@ -303,15 +336,24 @@ def feed(self, token: Token, test_newline=True): self.char_pos += len(token) self.column = self.char_pos - self.line_start_pos + 1 + def feed_substring(self, text: str, start_pos: int, end_pos: int): + newlines = text.count(self.newline_char, start_pos, end_pos) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + text.rindex(self.newline_char, start_pos, end_pos) + 1 + + self.char_pos += end_pos - start_pos + self.column = self.char_pos - self.line_start_pos + 1 + class UnlessCallback: def __init__(self, scanner): self.scanner = scanner def __call__(self, t): - res = self.scanner.match(t.value, 0) - if res: - _value, t.type = res + res = self.scanner.fullmatch(t.value) + if res is not None: + t.type = res return t @@ -347,19 +389,18 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) + callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, use_bytes=use_bytes)) new_terminals = [t for t in terminals if t not in embedded_strs] return new_terminals, callback class Scanner: - def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): + def __init__(self, terminals, g_regex_flags, re_, use_bytes): self.terminals = terminals self.g_regex_flags = g_regex_flags self.re_ = re_ self.use_bytes = use_bytes - self.match_whole = match_whole self.allowed_types = {t.name for t in self.terminals} @@ -369,10 +410,9 @@ def _build_mres(self, terminals, max_size): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. 
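An aside on the `UnlessCallback` change above (illustration only, not part of the diff): switching from the old `match_whole`/`'$'`-suffix approach to `re.fullmatch` avoids a subtle mismatch, because `'$'` also matches just before a trailing newline:

```python
import re

print(bool(re.match(r"abc$", "abc\n")))     # True  -- '$' accepts a trailing newline
print(bool(re.fullmatch(r"abc", "abc\n")))  # False -- fullmatch requires the whole string
print(bool(re.match(r"abc$", "abcx")))      # False -- both approaches agree here
```

It also removes the need to append `'$'` to every generated terminal pattern when building the combined regex.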
- postfix = '$' if self.match_whole else '' mres = [] while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp()) for t in terminals[:max_size]) if self.use_bytes: pattern = pattern.encode('latin-1') try: @@ -384,12 +424,31 @@ def _build_mres(self, terminals, max_size): terminals = terminals[max_size:] return mres - def match(self, text, pos): + def fullmatch(self, text: str): for mre in self._mres: - m = mre.match(text, pos) + m = mre.fullmatch(text) + if m: + return m.lastgroup + + + def match(self, text: TextSlice, pos: int): + assert pos >= text.start + for mre in self._mres: + m = mre.match(text.text, pos, text.end) if m: return m.group(0), m.lastgroup + def search(self, text: TextSlice, pos: int): + results = list(filter(None, [ + mre.search(text.text, pos, text.end) + for mre in self._mres + ])) + if not results: + return None + + best = min(results, key=lambda m: m.start()) + return (best.group(0), best.lastgroup), best.start() + def _regexp_has_newline(r: str): r"""Expressions that may indicate newlines in a regexp: @@ -407,22 +466,26 @@ class LexerState: (Lexer objects are only instantiated per grammar, not per text) """ - __slots__ = 'text', 'line_ctr', 'last_token' + __slots__ = 'text', 'line_ctr', 'end_pos', 'last_token' - text: str + text: TextSlice line_ctr: LineCounter last_token: Optional[Token] - def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None): - self.text = text + def __init__(self, text: Optional[TextSlice], line_ctr: Optional[LineCounter] = None, last_token: Optional[Token] = None): + self.text = text # type: ignore[assignment] self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n') + if text is not None and text.start != 0: + self.line_ctr.feed_substring(text.text, 0, text.start) self.last_token = last_token def __eq__(self, other): if not isinstance(other, LexerState): return NotImplemented - return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token + return (self.text == other.text and + self.line_ctr == other.line_ctr and + self.end_pos == other.end_pos) def __copy__(self): return type(self)(self.text, copy(self.line_ctr), self.last_token) @@ -437,7 +500,7 @@ def __init__(self, lexer: 'Lexer', lexer_state: LexerState): self.state = lexer_state @classmethod - def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread': + def from_text(cls, lexer: 'Lexer', text: TextSlice) -> 'LexerThread': return cls(lexer, LexerState(text)) def lex(self, parser_state): @@ -459,7 +522,10 @@ class Lexer(ABC): """ @abstractmethod def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]: - return NotImplemented + raise NotImplementedError + + def search_start(self, text: TextSlice, start_state, pos: int) -> Optional[Token]: + raise TypeError("This lexer can not be used for searching in text") def make_lexer_state(self, text): "Deprecated" @@ -563,7 +629,7 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None: self.use_bytes = conf.use_bytes self.terminals_by_name = conf.terminals_by_name - self._scanner = None + self._scanner: Optional[Scanner] = None def _build_scanner(self): terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) @@ -579,23 +645,24 @@ def _build_scanner(self): self._scanner = Scanner(terminals, self.g_regex_flags, self.re, 
self.use_bytes) @property - def scanner(self): + def scanner(self) -> Scanner: if self._scanner is None: self._build_scanner() + assert self._scanner is not None return self._scanner - def match(self, text, pos): + def match(self, text: TextSlice, pos: int) -> Optional[Tuple[str, str]]: return self.scanner.match(text, pos) def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: line_ctr = lex_state.line_ctr - while line_ctr.char_pos < len(lex_state.text): + while line_ctr.char_pos < lex_state.text.end: res = self.match(lex_state.text, line_ctr.char_pos) if not res: allowed = self.scanner.allowed_types - self.ignore_types if not allowed: allowed = {""} - raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, + raise UnexpectedCharacters(lex_state.text.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], state=parser_state, terminals_by_name=self.terminals_by_name) @@ -621,6 +688,19 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token: # EOF raise EOFError(self) + def search_start(self, text: TextSlice, start_state, pos: int) -> Optional[Token]: + while True: + res = self.scanner.search(text, pos) + if not res: + return None + (value, type_), actual_pos = res + if type_ in self.ignore_types: + pos = actual_pos + len(value) + continue + t = Token(type_, value, actual_pos, end_pos=actual_pos + len(value)) + return t + + class ContextualLexer(Lexer): lexers: Dict[int, AbstractBasicLexer] @@ -675,4 +755,8 @@ def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[ except UnexpectedCharacters: raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. 
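The `match`/`next_token` changes above hand the slice bounds to `re` (as pos/endpos) instead of slicing the string first. A small illustration (not from the diff) of why that distinction matters for boundary assertions such as `\b`, mirroring the `FRAG_END` case exercised in `test_subset_parse` below:

```python
import re

WORD = re.compile(r"\b\w+\b")
text = "xabc def "

# Slicing creates a new string, so a word boundary appears before "abc":
print(WORD.findall(text[1:]))                       # ['abc', 'def']

# Matching with pos keeps the real left context; "x" is a word character,
# so there is no boundary before "abc" and only "def" matches:
print([m.group() for m in WORD.finditer(text, 1)])  # ['def']
```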
+ def search_start(self, text: TextSlice, start_state, pos: int) -> Optional[Token]: + return self.lexers[start_state].search_start(text, start_state, pos) + + ###} diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 186058a6b..20c53ab65 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,8 +1,8 @@ -from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING +from typing import Any, Callable, Dict, Optional, Collection, Union, TYPE_CHECKING, NamedTuple, Iterable, Tuple -from .exceptions import ConfigurationError, GrammarError, assert_config +from .exceptions import ConfigurationError, GrammarError, assert_config, UnexpectedInput from .utils import get_regexp_width, Serialize -from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer +from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer, TextSlice from .parsers import earley, xearley, cyk from .parsers.lalr_parser import LALR_Parser from .tree import Tree @@ -14,17 +14,41 @@ ###{standalone + +class ScanMatch(NamedTuple): + range: Tuple[int, int] + tree: Tree + + def _wrap_lexer(lexer_class): - future_interface = getattr(lexer_class, '__future_interface__', False) - if future_interface: + future_interface = getattr(lexer_class, '__future_interface__', 0) + if future_interface == 2: return lexer_class - else: + elif future_interface == 1: + class CustomLexerWrapper(Lexer): + def __init__(self, lexer_conf): + self.lexer = lexer_class(lexer_conf) + + def lex(self, lexer_state, parser_state): + if not lexer_state.text.is_complete_text(): + raise TypeError("Interface=1 Custom Lexer don't support TextSlice") + lexer_state.text = lexer_state.text.text + return self.lexer.lex(lexer_state, parser_state) + return CustomLexerWrapper + elif future_interface == 0: class CustomLexerWrapper(Lexer): def __init__(self, lexer_conf): self.lexer = lexer_class(lexer_conf) + def lex(self, lexer_state, parser_state): - return self.lexer.lex(lexer_state.text) + if not lexer_state.text.is_complete_text(): + raise TypeError("Interface=0 Custom Lexer don't support TextSlice") + return self.lexer.lex(lexer_state.text.text) return CustomLexerWrapper + else: + raise ValueError(f"Unknown __future_interface__ value {future_interface}, integer 0-2 expected") + + def _deserialize_parsing_frontend(data, memo, lexer_conf, callbacks, options): @@ -93,19 +117,28 @@ def _verify_start(self, start=None): raise ConfigurationError("Unknown start rule %s. 
Must be one of %r" % (start, self.parser_conf.start))
         return start
 
-    def _make_lexer_thread(self, text: str) -> Union[str, LexerThread]:
+    def _make_lexer_thread(self, text: Union[str, TextSlice]) -> Union[str, LexerThread]:
         cls = (self.options and self.options._plugins.get('LexerThread')) or LexerThread
-        return text if self.skip_lexer else cls.from_text(self.lexer, text)
-
-    def parse(self, text: str, start=None, on_error=None):
+        if self.skip_lexer:
+            if isinstance(text, TextSlice):
+                if not text.is_complete_text():
+                    raise TypeError("lexer='dynamic' does not support TextSlice")
+                return text.text
+            return text
+        text = TextSlice.from_text(text)
+        return cls.from_text(self.lexer, text)
+
+    def parse(self, text: Union[str, TextSlice], start=None, on_error=None):
         chosen_start = self._verify_start(start)
         kw = {} if on_error is None else {'on_error': on_error}
         stream = self._make_lexer_thread(text)
         return self.parser.parse(stream, chosen_start, **kw)
 
-    def parse_interactive(self, text: Optional[str]=None, start=None):
+    def parse_interactive(self, text: Union[None, str, TextSlice]=None, start=None):
         # TODO BREAK - Change text from Optional[str] to text: str = ''.
         #  Would break behavior of exhaust_lexer(), which currently raises TypeError, and after the change would just return []
+        #  When this is done, also adjust the code in `LexerState.__init__` since it currently works around being
+        #  passed `None`
         chosen_start = self._verify_start(start)
         if self.parser_conf.parser_type != 'lalr':
             raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
@@ -113,6 +146,54 @@
         return self.parser.parse_interactive(stream, chosen_start)
 
+    def scan(self, text: Union[str, TextSlice], start: Optional[str]=None) -> Iterable[ScanMatch]:
+        """
+        In contrast to the other methods here, this one does the actual work. See `Lark.scan`
+        for a description of what this function is for.
+        """
+        if self.options.parser != 'lalr':
+            raise ValueError("scan requires parser='lalr'")
+        start_states = self.parser._parse_table.start_states
+        chosen_start = self._verify_start(start)
+        start_state = start_states[chosen_start]
+        text: TextSlice = TextSlice.from_text(text)  # type: ignore[no-redef]
+        pos = text.start
+        while True:
+            # Find the next candidate location
+            found = self.lexer.search_start(text, start_state, pos)
+            # No more valid candidates
+            if found is None:
+                break
+            assert found.end_pos <= text.end
+            # Collect the potential end points found for this parse
+            # We need to keep track of multiple options in case there are false `$END`s in the `ip.choices()`
+            # We don't want to check early since this can be expensive.
+            valid_end = []
+            ip = self.parse_interactive(text.start_from(found.start_pos), start=chosen_start)
+            tokens = ip.lexer_thread.lex(ip.parser_state)
+            while True:
+                try:
+                    token = next(tokens)
+                    ip.feed_token(token)
+                except (UnexpectedInput, StopIteration):
+                    # Either we couldn't parse the characters or the resulting token wasn't valid.
+ # Either way, stop + break + if '$END' in ip.choices(): + valid_end.append((token, ip.copy())) + # Check through all potential ending points and see if passing in `$END` actually works + for (last, pot) in valid_end[::-1]: + try: + res = pot.feed_eof(last) + except UnexpectedInput: + continue + else: + yield ScanMatch((found.start_pos, last.end_pos), res) + pos = last.end_pos + break + else: + pos = found.start_pos + 1 + def _validate_frontend_args(parser, lexer) -> None: assert_config(parser, ('lalr', 'earley', 'cyk')) if not isinstance(lexer, type): # not custom lexer? @@ -132,7 +213,7 @@ def _get_lexer_callbacks(transformer, terminals): result[terminal.name] = callback return result -class PostLexConnector: +class PostLexConnector(Lexer): def __init__(self, lexer, postlexer): self.lexer = lexer self.postlexer = postlexer diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 6ae2a04fd..728753c44 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -55,7 +55,7 @@ def parse(self, lexer, start, on_error=None): if isinstance(e, UnexpectedCharacters): # If user didn't change the character position, then we should if p == s.line_ctr.char_pos: - s.line_ctr.feed(s.text[p:p+1]) + s.line_ctr.feed(s.text.text[p:p+1]) try: return e.interactive_parser.resume_parse() diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index 1fc27d565..955696398 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -44,7 +44,7 @@ """ -nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic') +# nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic') def _get_rulename(name): name = {'_': '_ws_maybe', '__': '_ws'}.get(name, name) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 9940ccbf5..78a548d79 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -30,8 +30,9 @@ from typing import ( TypeVar, Generic, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any, Union, Iterable, IO, TYPE_CHECKING, overload, Sequence, - Pattern as REPattern, ClassVar, Set, Mapping + Pattern as REPattern, ClassVar, Set, Mapping, NamedTuple, AnyStr ) +from dataclasses import dataclass ###} import sys diff --git a/pyproject.toml b/pyproject.toml index 8e40e13dc..624806067 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ version = {attr = "lark.__version__"} [tool.mypy] files = "lark" -python_version = "3.6" +python_version = "3.8" show_error_codes = true enable_error_code = ["ignore-without-code"] exclude = [ diff --git a/tests/__main__.py b/tests/__main__.py index c5298a770..bd53fe8d6 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -14,6 +14,7 @@ from .test_lexer import TestLexer from .test_python_grammar import TestPythonParser from .test_tree_templates import * # We define __all__ to list which TestSuites to run +from .test_scan import TestScan try: from .test_nearley.test_nearley import TestNearley diff --git a/tests/test_lexer.py b/tests/test_lexer.py index 0996c8973..cf9dcc48e 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -1,6 +1,7 @@ from unittest import TestCase, main -from lark import Lark, Tree +from lark import Lark, Tree, TextSlice + class TestLexer(TestCase): def setUp(self): @@ -18,6 +19,18 @@ def test_basic(self): res = list(p.lex("abc cba dd", dont_ignore=True)) assert res == list('abc cba dd') + def test_subset_lex(self): + p = Lark(""" + start: "a" "b" "c" "d" + %ignore " " + """) + + res = 
list(p.lex(TextSlice("xxxabc cba ddxx", 3, -2))) + assert res == list('abccbadd') + + res = list(p.lex(TextSlice("aaaabc cba dddd", 3, -2))) + assert res == list('abccbadd') + if __name__ == '__main__': main() diff --git a/tests/test_parser.py b/tests/test_parser.py index 1ab705b04..5e79c38fe 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -30,7 +30,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree from lark.visitors import Transformer, Transformer_InPlace, v_args, Transformer_InPlaceRecursive -from lark.lexer import Lexer, BasicLexer +from lark.lexer import Lexer, BasicLexer, TextSlice from lark.indenter import Indenter __all__ = ['TestParsers'] @@ -912,6 +912,7 @@ def test_cycles_with_child_filter(self): self.assertEqual(tree, Tree('a', [Tree('x', [Tree('b', [])])])) + _NAME = "TestFullEarley" + LEXER.capitalize() _TestFullEarley.__name__ = _NAME globals()[_NAME] = _TestFullEarley @@ -2584,8 +2585,32 @@ def test_strict(self): """ self.assertRaises(GrammarError, _Lark, grammar, strict=True) + @unittest.skipIf(LEXER in ('dynamic', 'dynamic_complete', 'custom_old'), + "start_pos and end_pos not compatible with old style custom/dynamic lexer ") + def test_subset_parse(self): + grammar = r""" + start: (WORD|FRAG_END|FRAG_START)+ + WORD: /\b\w+\b/ # match full word + FRAG_END: /\B\w+/ # end of a word, i.e. start is not at a word boundary + FRAG_START: /\w+\B/ # start of a word, i.e. end is not at a word boundary + %ignore /\s+/ + """ + + parser = _Lark(grammar) + self.assertEqual(parser.parse(TextSlice(" abc def ", 1, -1)), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + self.assertEqual(parser.parse(TextSlice(" abc def ", 1-9, -1+9)), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + self.assertEqual(parser.parse(TextSlice("xabc def ", 1, -1)), + Tree('start', [Token('FRAG_END', 'abc'), Token('WORD', 'def')])) + + # We match the behavior of python's re module here: It doesn't look ahead beyond `end_pos`, + # despite looking behind before `start_pos` + self.assertEqual(parser.parse(TextSlice(" abc defx", 1, -1)), + Tree('start', [Token('WORD', 'abc'), Token('WORD', 'def')])) + - _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() + _NAME = "TestParser" + PARSER.capitalize() + LEXER.capitalize() _TestParser.__name__ = _NAME _TestParser.__qualname__ = "tests.test_parser." 
+ _NAME globals()[_NAME] = _TestParser diff --git a/tests/test_scan.py b/tests/test_scan.py new file mode 100644 index 000000000..53ccc37a8 --- /dev/null +++ b/tests/test_scan.py @@ -0,0 +1,107 @@ +import unittest + +from lark import Lark, Tree, TextSlice + + +class TestScan(unittest.TestCase): + def test_scan(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore / +/ + WORD: /\w+/ + """, parser='lalr', start="expr") + + text = "|() | (a) | ((//)) | (c ((d))) |" + finds = list(parser.scan(text)) + self.assertEqual(finds, [((1, 3), Tree('expr', [])), + ((6, 9), Tree('expr', ['a'])), + ((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])), + ]) + + def test_scan_basic_lexer(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore / +/ + WORD: /\w+/ + """, parser='lalr', start="expr", lexer='basic') + + text = "|() | (a) | ((//)) | (c ((d))) |" + finds = list(parser.scan(text)) + self.assertEqual(finds, [((1, 3), Tree('expr', [])), + ((6, 9), Tree('expr', ['a'])), + ((21, 30), Tree('expr', ['c', Tree('expr', [Tree('expr', ['d'])])])), + ]) + + def test_scan_meta(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore /\s+/ + WORD: /\w+/ + """, parser='lalr', start="expr", propagate_positions=True) + + text = " (a)\n(b)\n (\n)" + finds = list(parser.scan(text)) + self.assertEqual(finds, [((1, 4), Tree('expr', ['a'])), + ((5, 8), Tree('expr', ['b'])), + ((10, 13), Tree('expr', []))]) + + self.assertEqual(1, finds[0][1].meta.start_pos) + self.assertEqual(4, finds[0][1].meta.end_pos) + self.assertEqual(1, finds[0][1].meta.line) + self.assertEqual(1, finds[0][1].meta.end_line) + self.assertEqual(2, finds[0][1].meta.column) + self.assertEqual(5, finds[0][1].meta.end_column) + + self.assertEqual(5, finds[1][1].meta.start_pos) + self.assertEqual(8, finds[1][1].meta.end_pos) + self.assertEqual(2, finds[1][1].meta.line) + self.assertEqual(2, finds[1][1].meta.end_line) + self.assertEqual(1, finds[1][1].meta.column) + self.assertEqual(4, finds[1][1].meta.end_column) + + self.assertEqual(10, finds[2][1].meta.start_pos) + self.assertEqual(13, finds[2][1].meta.end_pos) + self.assertEqual(3, finds[2][1].meta.line) + self.assertEqual(4, finds[2][1].meta.end_line) + self.assertEqual(2, finds[2][1].meta.column) + self.assertEqual(2, finds[2][1].meta.end_column) + + def test_scan_backtrack(self): + """ Tests that the scan function properly backtracks if it finds partial, but incorrect parses""" + + parser = Lark(r""" + start: expr+ + expr: "(" (WORD|expr)* ")" + %ignore /\s+/ + WORD: /\w+/ + """, parser='lalr', start="start") + + text = "(a)(b) || (c)(d(e) || (f)" + finds = list(parser.scan(text)) + self.assertEqual(finds, [ + ((0, 6), Tree('start', [Tree('expr', ['a']), Tree('expr', ['b'])])), + ((10, 13), Tree('start', [Tree('expr', ['c'])])), + ((15, 18), Tree('start', [Tree('expr', ['e'])])), + ((22, 25), Tree('start', [Tree('expr', ['f'])])), + ]) + + def test_scan_subset(self): + parser = Lark(r""" + expr: "(" (WORD|expr)* ")" + %ignore /\s+/ + WORD: /\w+/ + """, parser='lalr', start="expr", propagate_positions=True) + + text = "()\n()(a)\n(b)\n (\n) | \n(\n)" + finds = list(parser.scan(TextSlice(text, 5, -1))) + self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])), + ((9, 12), Tree('expr', ['b'])), + ((14, 17), Tree('expr', []))]) + self.assertEqual(2, finds[0][1].meta.line) + + text = "()\n()(a)\n(b)\n (\n) | \n(\n)" + finds = list(parser.scan(TextSlice(text, 5-len(text), -1+len(text)))) + self.assertEqual(finds, [((5, 8), Tree('expr', ['a'])), + ((9, 
12), Tree('expr', ['b'])), + ((14, 17), Tree('expr', []))]) + self.assertEqual(2, finds[0][1].meta.line) diff --git a/tests/test_tree_forest_transformer.py b/tests/test_tree_forest_transformer.py index e96007355..f7ac2276a 100644 --- a/tests/test_tree_forest_transformer.py +++ b/tests/test_tree_forest_transformer.py @@ -16,9 +16,9 @@ class TestTreeForestTransformer(unittest.TestCase): !bc: "B"? "C"? !cd: "C"? "D" """ - - parser = Lark(grammar, parser='earley', ambiguity='forest') - forest = parser.parse("ABCD") + def setUp(self): + self.parser = Lark(self.grammar, parser='earley', ambiguity='forest') + self.forest = self.parser.parse("ABCD") def test_identity_resolve_ambiguity(self): l = Lark(self.grammar, parser='earley', ambiguity='resolve') diff --git a/tests/test_tree_templates.py b/tests/test_tree_templates.py index ae3c3e079..dd452eacd 100644 --- a/tests/test_tree_templates.py +++ b/tests/test_tree_templates.py @@ -35,7 +35,8 @@ class TestTreeTemplatesConf(unittest.TestCase): - parser = Lark(SOME_TEMPLATING_GRAMMAR) + def setUp(self): + self.parser = Lark(SOME_TEMPLATING_GRAMMAR) def test_conf_test_var__not_var(self): conf = TemplateConf(self.parser.parse) @@ -95,8 +96,9 @@ def test_template_match__only_tree(self): class TestTreeTemplatesTemplate(unittest.TestCase): - parser = Lark(SOME_TEMPLATING_GRAMMAR) - conf = TemplateConf(parser.parse) + def setUp(self): + self.parser = Lark(SOME_TEMPLATING_GRAMMAR) + self.conf = TemplateConf(self.parser.parse) def test_template_match__same_tree_no_template__empty_dictionary(self): template = Template(SOME_NON_TEMPLATE_TREE, conf=self.conf) @@ -193,8 +195,9 @@ def test_template_apply_vars__matching_vars__template_replaced(self): class TestTreeTemplatesTemplateTranslator(unittest.TestCase): - parser = Lark(SOME_TEMPLATING_GRAMMAR) - conf = TemplateConf(parser.parse) + def setUp(self): + self.parser = Lark(SOME_TEMPLATING_GRAMMAR) + self.conf = TemplateConf(self.parser.parse) def test_translate__empty_translations__same_tree(self): # no translations to match, so doesn't replace anything & can't error