Earley now uses OrderedSet for better output stability
erezsh committed Sep 22, 2023
1 parent 19ec0d5 commit bed05b0
Showing 7 changed files with 79 additions and 37 deletions.
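
The commit threads a new ordered_sets option (default: True) from LarkOptions through the parser frontends into the Earley and XEarley parsers. As a rough usage sketch, not part of the commit itself (the grammar below is purely illustrative):

from lark import Lark

grammar = r'''
    start: WORD+
    WORD: /\w+/
    %ignore " "
'''

# With ordered_sets=True (the default after this commit), Earley builds its
# item sets and SPPF symbol nodes from OrderedSet, so results come out in a
# stable, reproducible order (the option's docs note ~10% slower than plain sets).
stable = Lark(grammar, parser="earley", ordered_sets=True)

# ordered_sets=False keeps the old plain-set behaviour for the speed gain.
fast = Lark(grammar, parser="earley", ordered_sets=False)
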
6 changes: 5 additions & 1 deletion lark/lark.py
@@ -62,7 +62,7 @@ class LarkOptions(Serialize):
regex: bool
g_regex_flags: int
keep_all_tokens: bool
tree_class: Any
tree_class: Callable
parser: _ParserArgType
lexer: _LexerArgType
ambiguity: 'Literal["auto", "resolve", "explicit", "forest"]'
@@ -73,6 +73,7 @@ class LarkOptions(Serialize):
edit_terminals: Optional[Callable[[TerminalDef], TerminalDef]]
import_paths: 'List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]'
source_path: Optional[str]
ordered_sets: bool

OPTIONS_DOC = """
**=== General Options ===**
@@ -141,6 +142,8 @@ class LarkOptions(Serialize):
Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
use_bytes
Accept an input of type ``bytes`` instead of ``str``.
ordered_sets
Should Earley use ordered-sets to achieve stable output (~10%% slower than regular sets. Default: True)
edit_terminals
A callback for editing the terminals before parse.
import_paths
@@ -179,6 +182,7 @@ class LarkOptions(Serialize):
'edit_terminals': None,
'g_regex_flags': 0,
'use_bytes': False,
'ordered_sets': True,
'import_paths': [],
'source_path': None,
'_plugins': {},
4 changes: 2 additions & 2 deletions lark/load_grammar.py
@@ -11,7 +11,7 @@
from contextlib import suppress
from typing import List, Tuple, Union, Callable, Dict, Optional, Sequence

from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors, OrderedSet
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -781,7 +781,7 @@ def compile(self, start, terminals_to_keep):
assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)

# Remove duplicates
compiled_rules = list(set(compiled_rules))
compiled_rules = list(OrderedSet(compiled_rules))

# Filter out unused rules
while True:
5 changes: 3 additions & 2 deletions lark/parser_frontends.py
@@ -1,7 +1,7 @@
from typing import Any, Callable, Dict, Optional, Collection

from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize
from .utils import get_regexp_width, Serialize, OrderedSet
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
from .parsers import earley, xearley, cyk
@@ -208,7 +208,8 @@ def create_earley_parser(lexer_conf: LexerConf, parser_conf: ParserConf, options
else:
f = create_earley_parser__basic

return f(lexer_conf, parser_conf, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)
return f(lexer_conf, parser_conf, resolve_ambiguity=resolve_ambiguity,
debug=debug, tree_class=tree_class, ordered_sets=options.ordered_sets, **extra)



40 changes: 21 additions & 19 deletions lark/parsers/earley.py
@@ -9,34 +9,36 @@
is explained here: https://lark-parser.readthedocs.io/en/latest/_static/sppf/sppf.html
"""

import typing

from typing import TYPE_CHECKING, Callable
from collections import deque

from ..lexer import Token
from ..tree import Tree
from ..exceptions import UnexpectedEOF, UnexpectedToken
from ..utils import logger
from ..utils import logger, OrderedSet
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_forest import ForestSumVisitor, SymbolNode, TokenNode, ForestToParseTree
from .earley_forest import ForestSumVisitor, SymbolNode, StableSymbolNode, TokenNode, ForestToParseTree

if typing.TYPE_CHECKING:
if TYPE_CHECKING:
from ..common import LexerConf, ParserConf

class Parser:
lexer_conf: 'LexerConf'
parser_conf: 'ParserConf'
debug: bool

def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher, resolve_ambiguity=True, debug=False, tree_class=Tree):
def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
resolve_ambiguity: bool=True, debug: bool=False, tree_class: type=Tree, ordered_sets: bool=True):
analysis = GrammarAnalyzer(parser_conf)
self.lexer_conf = lexer_conf
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.debug = debug
self.tree_class = tree_class
self.Tree = tree_class
self.Set = OrderedSet if ordered_sets else set
self.SymbolNode = StableSymbolNode if ordered_sets else SymbolNode

self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
@@ -94,7 +96,7 @@ def predict_and_complete(self, i, to_scan, columns, transitives):
if item.is_complete: ### (item.s == string)
if item.node is None:
label = (item.s, item.start, i)
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
item.node.add_family(item.s, item.rule, item.start, None, None)

# create_leo_transitives(item.rule.origin, item.start)
@@ -109,7 +111,7 @@ def predict_and_complete(self, i, to_scan, columns, transitives):

new_item = Item(transitive.rule, transitive.ptr, transitive.start)
label = (root_transitive.s, root_transitive.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_path(root_transitive, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
@@ -133,7 +135,7 @@ def predict_and_complete(self, i, to_scan, columns, transitives):
for originator in originators:
new_item = originator.advance()
label = (new_item.s, originator.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
@@ -154,7 +156,7 @@ def predict_and_complete(self, i, to_scan, columns, transitives):
if item.expect in held_completions:
new_item = item.advance()
label = (new_item.s, item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)

@@ -190,13 +192,13 @@ def scan(i, token, to_scan):
Earley predictor, based on the previously completed tokens.
This ensures that at each phase of the parse we have a custom
lexer context, allowing for more complex ambiguities."""
next_to_scan = set()
next_set = set()
next_to_scan = self.Set()
next_set = self.Set()
columns.append(next_set)
transitives.append({})
node_cache = {}

for item in set(to_scan):
for item in self.Set(to_scan):
if match(item.expect, token):
new_item = item.advance()
label = (new_item.s, new_item.start, i)
@@ -209,7 +211,7 @@ def scan(i, token, to_scan):
# ForestSumVisitor after the basic lexer has already
# "used up" the terminal priorities
token_node = TokenNode(token, term, priority=0)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)

if new_item.expect in self.TERMINALS:
@@ -260,8 +262,8 @@ def parse(self, lexer, start):
assert start, start
start_symbol = NonTerminal(start)

columns = [set()]
to_scan = set() # The scan buffer. 'Q' in E.Scott's paper.
columns = [self.Set()]
to_scan = self.Set() # The scan buffer. 'Q' in E.Scott's paper.

## Predict for the start_symbol.
# Add predicted items to the first Earley set (for the predictor) if they
@@ -296,9 +298,9 @@ def parse(self, lexer, start):
if len(solutions) > 1:
assert False, 'Earley should not generate multiple start symbol items!'

if self.tree_class is not None:
if self.Tree is not None:
# Perform our SPPF -> AST conversion
transformer = ForestToParseTree(self.tree_class, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity)
return transformer.transform(solutions[0])

# return the root of the SPPF
13 changes: 9 additions & 4 deletions lark/parsers/earley_forest.py
@@ -15,7 +15,7 @@

from ..parse_tree_builder import AmbiguousIntermediateExpander
from ..visitors import Discard
from ..utils import logger
from ..utils import logger, OrderedSet
from ..tree import Tree

class ForestNode:
@@ -44,13 +44,14 @@ class SymbolNode(ForestNode):
is_intermediate: True if this node is an intermediate node.
priority: The priority of the node's symbol.
"""
Set = set # Overridden by StableSymbolNode
__slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
def __init__(self, s, start, end):
self.s = s
self.start = start
self.end = end
self._children = set()
self.paths = set()
self._children = self.Set()
self.paths = self.Set()
self.paths_loaded = False

### We use inf here as it can be safely negated without resorting to conditionals,
@@ -68,7 +69,7 @@ def add_path(self, transitive, node):
def load_paths(self):
for transitive, node in self.paths:
if transitive.next_titem is not None:
vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end)
vn = type(self)(transitive.next_titem.s, transitive.next_titem.start, self.end)
vn.add_path(transitive.next_titem, node)
self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
else:
@@ -110,6 +111,10 @@ def __repr__(self):
symbol = self.s.name
return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority)

class StableSymbolNode(SymbolNode):
"A version of SymbolNode that uses OrderedSet for output stability"
Set = OrderedSet

class PackedNode(ForestNode):
"""
A Packed Node represents a single derivation in a symbol node.
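
The StableSymbolNode added above works by overriding a single class attribute: SymbolNode builds its containers through self.Set() and clones itself through type(self)(...) in load_paths, so the ordered variant needs no methods of its own. A stripped-down sketch of that pattern (Node and StableNode are illustrative names, not classes from this commit):

from lark.utils import OrderedSet  # added by this commit

class Node:
    Set = set                        # class-level default, as in SymbolNode

    def __init__(self):
        self._children = self.Set()  # resolved against the concrete class

class StableNode(Node):
    Set = OrderedSet                 # overriding the attribute is the whole subclass

assert isinstance(Node()._children, set)
assert isinstance(StableNode()._children, OrderedSet)
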
22 changes: 14 additions & 8 deletions lark/parsers/xearley.py
@@ -1,4 +1,4 @@
"""This module implements an experimental Earley parser with a dynamic lexer
"""This module implements an Earley parser with a dynamic lexer
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497
@@ -14,19 +14,25 @@
Earley's power in parsing any CFG.
"""

from typing import TYPE_CHECKING, Callable
from collections import defaultdict

from ..tree import Tree
from ..exceptions import UnexpectedCharacters
from ..lexer import Token
from ..grammar import Terminal
from .earley import Parser as BaseParser
from .earley_forest import SymbolNode, TokenNode
from .earley_forest import TokenNode

if TYPE_CHECKING:
from ..common import LexerConf, ParserConf

class Parser(BaseParser):
def __init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity=True, complete_lex = False, debug=False, tree_class=Tree):
BaseParser.__init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity, debug, tree_class)
def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matcher: Callable,
resolve_ambiguity: bool=True, complete_lex: bool=False, debug: bool=False,
tree_class: type=Tree, ordered_sets: bool=True):
BaseParser.__init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity,
debug, tree_class, ordered_sets)
self.ignore = [Terminal(t) for t in lexer_conf.ignore]
self.complete_lex = complete_lex

@@ -49,7 +55,7 @@ def scan(i, to_scan):
# they complete, we push all tokens into a buffer (delayed_matches), to
# be held possibly for a later parse step when we reach the point in the
# input stream at which they complete.
for item in set(to_scan):
for item in self.Set(to_scan):
m = match(item.expect, stream, i)
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
@@ -81,8 +87,8 @@ def scan(i, to_scan):
# If we're ignoring up to the end of the file, # carry over the start symbol if it already completed.
delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

next_to_scan = set()
next_set = set()
next_to_scan = self.Set()
next_set = self.Set()
columns.append(next_set)
transitives.append({})

Expand All @@ -100,7 +106,7 @@ def scan(i, to_scan):
new_item = item.advance()
label = (new_item.s, new_item.start, i)
token_node = TokenNode(token, terminals[token.type])
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, self.SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token_node)
else:
new_item = item
26 changes: 25 additions & 1 deletion lark/utils.py
@@ -2,7 +2,7 @@
import os
from itertools import product
from collections import deque
from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable
from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable, Generic

###{standalone
import sys, re
@@ -328,3 +328,27 @@ def small_factors(n: int, max_factor: int) -> List[Tuple[int, int]]:
if a + b <= max_factor:
return small_factors(r, max_factor) + [(a, b)]
assert False, "Failed to factorize %s" % n


class OrderedSet(Generic[T]):
"""A minimal OrderedSet implementation, using a dictionary.
(relies on the dictionary being ordered)
"""
def __init__(self, items: Iterable[T] =()):
self.d = dict.fromkeys(items)

def __contains__(self, item: T) -> bool:
return item in self.d

def add(self, item: T):
self.d[item] = None

def __iter__(self) -> Iterator[T]:
return iter(self.d)

def remove(self, item: T):
del self.d[item]

def __bool__(self):
return bool(self.d)
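
A quick usage sketch of the new container (not part of the commit): OrderedSet supports just the operations the parsers need (membership, add, remove, iteration and truthiness), and iteration always follows insertion order, which is what makes the Earley output reproducible; a plain set's iteration order can vary between runs, e.g. under hash randomization.

from lark.utils import OrderedSet

rules = OrderedSet(["rule_b", "rule_a"])
rules.add("rule_c")
rules.add("rule_a")        # re-adding an existing item does not move it

assert "rule_a" in rules
assert list(rules) == ["rule_b", "rule_a", "rule_c"]

rules.remove("rule_b")
assert list(rules) == ["rule_a", "rule_c"]
assert bool(rules)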
