Skip to content

Commit

Permalink
ContextualLexer now uses self.basic_lexer for easy extensibility (issue
Browse files Browse the repository at this point in the history
  • Loading branch information
erezsh committed Sep 23, 2023
1 parent c2e44ea commit 4db7629
Showing 1 changed file with 25 additions and 11 deletions.
36 changes: 25 additions & 11 deletions lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,24 @@ def _check_regex_collisions(terminal_to_regexp: Dict[TerminalDef, str], comparat
return


class BasicLexer(Lexer):
class AbstractBasicLexer(Lexer):
    """Base interface for single-state lexers.

    Subclasses implement ``next_token`` to produce one token at a time;
    the shared ``lex`` driver turns that into a token stream.
    """

    # Terminal name -> its definition; populated by concrete subclasses.
    terminals_by_name: Dict[str, TerminalDef]

    @abstractmethod
    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        ...

    @abstractmethod
    def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        ...

    def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
        """Yield tokens from *state* until ``next_token`` raises ``EOFError``."""
        try:
            while True:
                yield self.next_token(state, parser_state)
        except EOFError:
            # End of input: finish the generator normally.
            return


class BasicLexer(AbstractBasicLexer):
terminals: Collection[TerminalDef]
ignore_types: FrozenSet[str]
newline_types: FrozenSet[str]
Expand Down Expand Up @@ -569,11 +586,6 @@ def scanner(self):
def match(self, text, pos):
    # Try to match a terminal in *text* starting at *pos* via the
    # underlying scanner (presumably returns a match result or None
    # on failure — scanner definition not visible here; confirm).
    return self.scanner.match(text, pos)

def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
    # Repeatedly pull tokens from next_token; it signals end of input
    # by raising EOFError, which suppress() converts into normal
    # generator exhaustion.
    with suppress(EOFError):
        while True:
            yield self.next_token(state, parser_state)

def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
line_ctr = lex_state.line_ctr
while line_ctr.char_pos < len(lex_state.text):
Expand Down Expand Up @@ -611,8 +623,10 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:

class ContextualLexer(Lexer):

lexers: Dict[str, BasicLexer]
root_lexer: BasicLexer
lexers: Dict[str, AbstractBasicLexer]
root_lexer: AbstractBasicLexer

basic_lexer: Type[AbstractBasicLexer] = BasicLexer

def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always_accept: Collection[str]=()) -> None:
terminals = list(conf.terminals)
Expand All @@ -625,7 +639,7 @@ def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always
comparator = interegular.Comparator.from_regexes({t: t.pattern.to_regexp() for t in terminals})
else:
comparator = None
lexer_by_tokens: Dict[FrozenSet[str], BasicLexer] = {}
lexer_by_tokens: Dict[FrozenSet[str], AbstractBasicLexer] = {}
self.lexers = {}
for state, accepts in states.items():
key = frozenset(accepts)
Expand All @@ -635,14 +649,14 @@ def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always
accepts = set(accepts) | set(conf.ignore) | set(always_accept)
lexer_conf = copy(trad_conf)
lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
lexer = BasicLexer(lexer_conf, comparator)
lexer = self.basic_lexer(lexer_conf, comparator)
lexer_by_tokens[key] = lexer

self.lexers[state] = lexer

assert trad_conf.terminals is terminals
trad_conf.skip_validation = True # We don't need to verify all terminals again
self.root_lexer = BasicLexer(trad_conf, comparator)
self.root_lexer = self.basic_lexer(trad_conf, comparator)

def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
try:
Expand Down

0 comments on commit 4db7629

Please sign in to comment.