Add scanning #1429

Draft: wants to merge 8 commits into master
Changes from 5 commits
32 changes: 24 additions & 8 deletions lark/lark.py
@@ -16,7 +16,7 @@
from typing import Literal
else:
from typing_extensions import Literal
from .parser_frontends import ParsingFrontend
from .parser_frontends import ParsingFrontend, ScanMatch

from .exceptions import ConfigurationError, assert_config, UnexpectedInput
from .utils import Serialize, SerializeMemoizer, FS, isascii, logger
@@ -600,8 +600,8 @@ def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_pat
def __repr__(self):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)


def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
def lex(self, text: str, dont_ignore: bool = False, *, start_pos: Optional[int] = None,
end_pos: Optional[int] = None) -> Iterator[Token]:
"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='basic'

When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
@@ -613,7 +613,7 @@ def lex(self, text: str, dont_ignore: bool=False) -> Iterator[Token]:
lexer = self._build_lexer(dont_ignore)
else:
lexer = self.lexer
lexer_thread = LexerThread.from_text(lexer, text)
lexer_thread = LexerThread.from_text(lexer, text, start_pos=start_pos, end_pos=end_pos)
stream = lexer_thread.lex(None)
if self.options.postlex:
return self.options.postlex.process(stream)
@@ -623,7 +623,8 @@ def get_terminal(self, name: str) -> TerminalDef:
"""Get information about a terminal"""
return self._terminals_dict[name]

def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
def parse_interactive(self, text: Optional[str] = None, start: Optional[str] = None,
*, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> 'InteractiveParser':
"""Start an interactive parsing session.

Parameters:
@@ -635,9 +636,11 @@

See Also: ``Lark.parse()``
"""
return self.parser.parse_interactive(text, start=start)
return self.parser.parse_interactive(text, start=start, start_pos=start_pos, end_pos=end_pos)

def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> 'ParseTree':
def parse(self, text: str, start: Optional[str] = None,
on_error: 'Optional[Callable[[UnexpectedInput], bool]]' = None,
*, start_pos: Optional[int] = None, end_pos: Optional[int] = None) -> 'ParseTree':
"""Parse the given text, according to the options provided.

Parameters:
@@ -655,7 +658,20 @@
For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``.

"""
return self.parser.parse(text, start=start, on_error=on_error)
return self.parser.parse(text, start=start, on_error=on_error, start_pos=start_pos, end_pos=end_pos)

def scan(self, text: str, start: Optional[str] = None, *, start_pos: Optional[int] = None,
end_pos: Optional[int] = None) -> Iterable['ScanMatch']:
"""
Scans the input text for non-overlapping matches of the rule specified by 'start' and
yields the start and end positions as well as the resulting tree.

Only works with parser='lalr' and lexer='contextual'. Works best if the first terminal(s)
that can be matched by the grammar are unique in the text and always indicate the start of a match.

Does not raise exceptions, except for invalid arguments or configurations.

"""
return self.parser.scan(text, start=start, start_pos=start_pos, end_pos=end_pos)

###}
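
A minimal usage sketch of the API added here, based on the docstrings above; the grammar, the input text, and the assumption that a ScanMatch carries positions plus the parsed tree are illustrative, not taken from the PR:

from lark import Lark

# Hypothetical grammar: find bracketed key=value pairs embedded in noise.
parser = Lark(r'''
pair: "[" NAME "=" NUMBER "]"
NAME: /[a-z]+/
NUMBER: /\d+/
%ignore " "
''', parser='lalr', lexer='contextual', start='pair')

text = "noise [x=1] noise [y=22] end"

# scan() yields non-overlapping matches of the 'pair' rule, each with its
# start/end position in the text and the resulting tree (assumed ScanMatch shape).
for match in parser.scan(text):
    print(match)

# start_pos/end_pos restrict parsing to a slice of the original text while
# line/column numbers stay relative to the full string.
tree = parser.parse(text, start_pos=6, end_pos=11)   # parses only "[x=1]"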
104 changes: 84 additions & 20 deletions lark/lexer.py
@@ -1,11 +1,11 @@
# Lexer Implementation

import sys
from abc import abstractmethod, ABC
import re
from contextlib import suppress
from typing import (
TypeVar, Type, Dict, Iterator, Collection, Callable, Optional, FrozenSet, Any,
ClassVar, TYPE_CHECKING, overload
ClassVar, TYPE_CHECKING, overload, Tuple
)
from types import ModuleType
import warnings
@@ -254,13 +254,17 @@ def new_borrow_pos(cls: Type[_T], type_: str, value: Any, borrow_t: 'Token') ->
return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos)

def __reduce__(self):
return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column))
return (self.__class__, (self.type, self.value,
self.start_pos, self.line, self.column,
self.end_line, self.end_column, self.end_pos))

def __repr__(self):
return 'Token(%r, %r)' % (self.type, self.value)

def __deepcopy__(self, memo):
return Token(self.type, self.value, self.start_pos, self.line, self.column)
return Token(self.type, self.value,
self.start_pos, self.line, self.column,
self.end_line, self.end_column, self.end_pos)

def __eq__(self, other):
if isinstance(other, Token) and self.type != other.type:
@@ -289,7 +293,7 @@ def __eq__(self, other):

return self.char_pos == other.char_pos and self.newline_char == other.newline_char

def feed(self, token: Token, test_newline=True):
def feed(self, token: str, test_newline=True):
"""Consume a token and calculate the new line & column.

As an optional optimization, set test_newline=False if token doesn't contain a newline.
@@ -303,6 +307,15 @@ def feed(self, token: Token, test_newline=True):
self.char_pos += len(token)
self.column = self.char_pos - self.line_start_pos + 1

def feed_substring(self, text: str, start_pos: int, end_pos: int):
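"""Advance the counter as if text[start_pos:end_pos] had been consumed,
updating line, column and char_pos in one step (used to fast-forward past
the prefix when lexing starts at start_pos > 0)."""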
newlines = text.count(self.newline_char, start_pos, end_pos)
if newlines:
self.line += newlines
self.line_start_pos = self.char_pos + text.rindex(self.newline_char, start_pos, end_pos) + 1

self.char_pos += end_pos - start_pos
self.column = self.char_pos - self.line_start_pos + 1


class UnlessCallback:
def __init__(self, scanner):
@@ -384,12 +397,25 @@ def _build_mres(self, terminals, max_size):
terminals = terminals[max_size:]
return mres

def match(self, text, pos):
def match(self, text, pos, *, end_pos=sys.maxsize):
for mre in self._mres:
m = mre.match(text, pos)
m = mre.match(text, pos, end_pos)
if m:
return m.group(0), m.lastgroup

def search(self, text, start_pos, end_pos):
Review comment from a project member:
I propose a different way to write this function:

    def search(self, text, start_pos, end_pos):
        results = list(filter(None, [
            mre.search(text, start_pos, end_pos)
            for mre in self._mres
        ]))
        if not results:
            return None

        best = min(results, key=lambda m: m.start())
        return (best.group(0), best.lastgroup), best.start()
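The function body as currently written in the PR: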

best = None
for mre in self._mres:
mre: re.Pattern
m = mre.search(text, start_pos, end_pos)
if m:
if best is None or m.start() < best.start():
best = m
if best is None:
return best
else:
return (best.group(0), best.lastgroup), best.start()


def _regexp_has_newline(r: str):
r"""Expressions that may indicate newlines in a regexp:
@@ -407,25 +433,41 @@ class LexerState:
(Lexer objects are only instantiated per grammar, not per text)
"""

__slots__ = 'text', 'line_ctr', 'last_token'
__slots__ = 'text', 'line_ctr', 'end_pos', 'last_token'

text: str
line_ctr: LineCounter
end_pos: int
last_token: Optional[Token]

def __init__(self, text: str, line_ctr: Optional[LineCounter]=None, last_token: Optional[Token]=None):
self.text = text
def __init__(self, text: Optional[str], line_ctr: Optional[LineCounter] = None, last_token: Optional[Token] = None,
*, start_pos: Optional[int] = None, end_pos: Optional[int] = None):
self.text = text # type: ignore[assignment]
self.line_ctr = line_ctr or LineCounter(b'\n' if isinstance(text, bytes) else '\n')
self.last_token = last_token
# If we are not given a text (i.e. via `parse_interactive`), `start_pos` and `end_pos` are ignored
if text is None:
self.end_pos = sys.maxsize
return
self.end_pos = end_pos if end_pos is not None else len(self.text)
if start_pos is not None:
if start_pos < 0:
start_pos += len(text)
self.line_ctr.feed_substring(text, 0, start_pos)
if self.end_pos < 0:
self.end_pos += len(text)

def __eq__(self, other):
if not isinstance(other, LexerState):
return NotImplemented

return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token
return (self.text is other.text and
self.line_ctr == other.line_ctr and
self.end_pos == other.end_pos and
self.last_token == other.last_token)

def __copy__(self):
return type(self)(self.text, copy(self.line_ctr), self.last_token)
return type(self)(self.text, copy(self.line_ctr), self.last_token, end_pos=self.end_pos)


class LexerThread:
@@ -437,8 +479,9 @@ def __init__(self, lexer: 'Lexer', lexer_state: LexerState):
self.state = lexer_state

@classmethod
def from_text(cls, lexer: 'Lexer', text: str) -> 'LexerThread':
return cls(lexer, LexerState(text))
def from_text(cls, lexer: 'Lexer', text: str, *, start_pos: Optional[int] = None,
end_pos: Optional[int] = None) -> 'LexerThread':
return cls(lexer, LexerState(text, start_pos=start_pos, end_pos=end_pos))

def lex(self, parser_state):
return self.lexer.lex(self.state, parser_state)
@@ -461,6 +504,9 @@ class Lexer(ABC):
def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
return NotImplemented

def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]:
raise TypeError("This lexer cannot be used for searching in text")

def make_lexer_state(self, text):
"Deprecated"
return LexerState(text)
@@ -563,7 +609,7 @@ def __init__(self, conf: 'LexerConf', comparator=None) -> None:
self.use_bytes = conf.use_bytes
self.terminals_by_name = conf.terminals_by_name

self._scanner = None
self._scanner: Optional[Scanner] = None

def _build_scanner(self):
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
@@ -579,18 +625,19 @@ def _build_scanner(self):
self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

@property
def scanner(self):
def scanner(self) -> Scanner:
if self._scanner is None:
self._build_scanner()
assert self._scanner is not None
return self._scanner

def match(self, text, pos):
return self.scanner.match(text, pos)
def match(self, text, pos, *, end_pos):
return self.scanner.match(text, pos, end_pos=end_pos)

def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
line_ctr = lex_state.line_ctr
while line_ctr.char_pos < len(lex_state.text):
res = self.match(lex_state.text, line_ctr.char_pos)
while line_ctr.char_pos < lex_state.end_pos:
res = self.match(lex_state.text, line_ctr.char_pos, end_pos=lex_state.end_pos)
if not res:
allowed = self.scanner.allowed_types - self.ignore_types
if not allowed:
@@ -621,6 +668,19 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
# EOF
raise EOFError(self)

def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]:
while True:
res = self.scanner.search(text, start_pos, end_pos)
if not res:
return None
(value, type_), actual_pos = res
if type_ in self.ignore_types:
start_pos = actual_pos + len(value)
continue
t = Token(type_, value, actual_pos, end_pos=actual_pos + len(value))
return t



class ContextualLexer(Lexer):
lexers: Dict[int, AbstractBasicLexer]
@@ -675,4 +735,8 @@ def lex(self, lexer_state: LexerState, parser_state: 'ParserState') -> Iterator[
except UnexpectedCharacters:
raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.

def search_start(self, text: str, start_state, start_pos: int, end_pos: int) -> Optional[Token]:
return self.lexers[start_state].search_start(text, start_state, start_pos, end_pos)


###}
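
For reference, a minimal standalone sketch of the "earliest match wins" search that Scanner.search (and the variant proposed in the review comment above) performs across its compiled pattern groups; the pattern names and input below are illustrative only:

import re

def earliest_match(patterns, text, start, end):
    # Try every compiled pattern and keep the match that starts first,
    # mirroring Scanner.search(): Pattern.search(text, pos, endpos).
    best = None
    for pat in patterns:
        m = pat.search(text, start, end)
        if m and (best is None or m.start() < best.start()):
            best = m
    if best is None:
        return None
    return (best.group(0), best.lastgroup), best.start()

pats = [re.compile(r'(?P<NUMBER>\d+)'), re.compile(r'(?P<NAME>[a-z]+)')]
print(earliest_match(pats, '   foo 42', 0, 9))   # (('foo', 'NAME'), 3)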