Enable parsing of input seeds with syntax errors
Previously, only seed files that could be fully recognized by the
grammar were transformed into trees by grammarinator-parse. This
restriction excluded partially recognizable seeds, which can be
valuable for fuzzing.
With this change, `grammarinator-parse` now supports transforming
seeds containing syntax errors.
renatahodovan committed Nov 26, 2024
1 parent f5de911 commit 2b5a5d1
Showing 6 changed files with 38 additions and 13 deletions.
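
For context, the change hinges on attaching an error listener to the ANTLR lexer so that input the lexer cannot recognize is still emitted as tokens (with an invalid token type) instead of being silently dropped, which lets the parser build a tree around the erroneous parts of a seed. Below is a minimal, self-contained sketch of that idea using the antlr4 Python runtime; it mirrors the ExtendedErrorListener introduced in the diff, but MyLexer, the listener name, and the sample input are placeholders/assumptions for illustration, not part of this commit.

# Sketch only: mirrors the lexer-level error listener added in this commit.
# `MyLexer` stands in for any ANTLR4-generated Python lexer class.
from antlr4 import CommonTokenStream, InputStream, Token
from antlr4.error.ErrorListener import ErrorListener


class LenientLexerErrorListener(ErrorListener):
    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
        # Consume the offending character and emit it as a regular token on the
        # default channel instead of discarding it.
        recognizer.inputStream.consume()
        recognizer.type = Token.INVALID_TYPE
        recognizer.channel = Token.DEFAULT_CHANNEL
        recognizer.emit()
        recognizer.type = Token.MIN_USER_TOKEN_TYPE


lexer = MyLexer(InputStream('pass | | pass'))  # illustrative seed with a syntax error
lexer.addErrorListener(LenientLexerErrorListener())
tokens = CommonTokenStream(lexer)
tokens.fill()  # unrecognized characters now appear in the token stream instead of being skipped
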
45 changes: 32 additions & 13 deletions grammarinator/tool/parser.py
@@ -34,6 +34,19 @@ def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
 error.ErrorListener.ConsoleErrorListener.INSTANCE = ConsoleListener()
 
 
+class ExtendedErrorListener(error.ErrorListener.ErrorListener):
+    """
+    Custom error listener for the ANTLR lexer ensuring to insert the
+    unrecognized tokens into the tree as well.
+    """
+    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
+        recognizer.inputStream.consume()
+        recognizer.type = Token.INVALID_TYPE
+        recognizer.channel = Token.DEFAULT_CHANNEL
+        recognizer.emit()
+        recognizer.type = Token.MIN_USER_TOKEN_TYPE
+
+
 class ParserTool:
     """
     Tool to parse existing sources and create a tree pool from them. These
@@ -185,8 +198,9 @@ def _antlr_to_grammarinator_tree(self, antlr_node, parser, visited=None):
                 depth = max(depth, child_depth + 1)
         else:
             assert isinstance(antlr_node, TerminalNode), f'An ANTLR node must either be a ParserRuleContext or a TerminalNode but {antlr_node.__class__.__name__} was found.'
-            name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) >= antlr_node.symbol.type else '<INVALID>', antlr_node.symbol.text
+            name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) > antlr_node.symbol.type else '<INVALID>', antlr_node.symbol.text
             assert name, f'{name} is None or empty'
+
             if antlr_node.symbol.type == Token.EOF:
                 return None, 0, []
 
@@ -310,7 +324,9 @@ def _match_seq(grammar_vertices, tree_node_pos):
             # They MUST match, since ANTLR has already parsed them
             # During matching, quantifier and alternation structures are identified
             rule_children, rule_tree_node_pos = _match_seq(self._graph.vertices[rule.name].out_neighbours + [None], 0)
-            assert rule_children is not None, f'Failed to match {rule.name} tree node to the related grammar rule at {rule_tree_node_pos}.'
+            if rule_children is None:
+                logger.warning('Failed to match %s tree node to the related grammar rule at %d.', rule.name, rule_tree_node_pos)
+                return
 
             # Detach all children from the tree node so that they can be reattached
             # in a structured way afterwards
@@ -368,21 +384,24 @@ def _reattach_children(rule, children):
     # Create an ANTLR tree from the input stream and convert it to Grammarinator tree.
     def _create_tree(self, input_stream, fn):
         try:
-            parser = self._parser_cls(CommonTokenStream(self._lexer_cls(input_stream)))
+            lexer = self._lexer_cls(input_stream)
+            lexer.addErrorListener(ExtendedErrorListener())
+            parser = self._parser_cls(CommonTokenStream(lexer))
             parse_tree_root = getattr(parser, self._rule)()
-            if not parser._syntaxErrors:
-                root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser)
-                if depth > self._max_depth:
-                    logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth)
-                    return None
+            if parser._syntaxErrors:
+                logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn)
+
+            root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser)
+            if depth > self._max_depth:
+                logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth)
+                return None
 
-                self._adjust_tree_to_generator(rules)
-                for transformer in self._transformers:
-                    root = transformer(root)
+            self._adjust_tree_to_generator(rules)
+            for transformer in self._transformers:
+                root = transformer(root)
 
-                return root
+            return root
 
-            logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn)
         except Exception as e:
             logger.warning('Exception while parsing %s.', fn, exc_info=e)
         return None
1 change: 1 addition & 0 deletions tests/parser/exp6.grtj
@@ -0,0 +1 @@
+{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 1, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}, {"t": "q", "i": 1, "b": 1, "e": -1, "c": [{"t": "qd", "c": [{"t": "l", "n": "<INVALID>", "s": " | ", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "|", "z": [1, 1], "i": false}, {"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}]}]}]}]}]}]}
1 change: 1 addition & 0 deletions tests/parser/exp7.grtj
@@ -0,0 +1 @@
+{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 0, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "l", "n": "<INVALID>", "s": "*", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}]}]}]}
1 change: 1 addition & 0 deletions tests/parser/inp6.txt
@@ -0,0 +1 @@
+pass | | pass
1 change: 1 addition & 0 deletions tests/parser/inp7.txt
@@ -0,0 +1 @@
+* pass
2 changes: 2 additions & 0 deletions tests/test_parser.py
@@ -24,6 +24,8 @@
     (os.path.join(parser_dir, 'inp3.txt'), os.path.join(parser_dir, 'exp3.grtj')),
     (os.path.join(parser_dir, 'inp4.txt'), os.path.join(parser_dir, 'exp4.grtj')),
     (os.path.join(parser_dir, 'inp5.txt'), os.path.join(parser_dir, 'exp5.grtj')),
+    (os.path.join(parser_dir, 'inp6.txt'), os.path.join(parser_dir, 'exp6.grtj')),
+    (os.path.join(parser_dir, 'inp7.txt'), os.path.join(parser_dir, 'exp7.grtj')),
 ])
 def test_parser(inp, expected, tmpdir):
     with open(inp, 'r') as f:
