From 2b5a5d1cd8e61d551a7b310b1feb4f84465f3b83 Mon Sep 17 00:00:00 2001 From: Renata Hodovan Date: Sun, 24 Nov 2024 23:00:27 +0100 Subject: [PATCH] Enable parsing of input seeds with syntax errors Previously, only seed files that could be fully recognized by the grammar were transformed into trees by grammarinator-parse. This restriction excluded partially recognizable seeds, which can be valuable for fuzzing. With this change, `grammarinator-parse` now supports transforming seeds containing syntax errors. --- grammarinator/tool/parser.py | 45 +++++++++++++++++++++++++----------- tests/parser/exp6.grtj | 1 + tests/parser/exp7.grtj | 1 + tests/parser/inp6.txt | 1 + tests/parser/inp7.txt | 1 + tests/test_parser.py | 2 ++ 6 files changed, 38 insertions(+), 13 deletions(-) create mode 100644 tests/parser/exp6.grtj create mode 100644 tests/parser/exp7.grtj create mode 100644 tests/parser/inp6.txt create mode 100644 tests/parser/inp7.txt diff --git a/grammarinator/tool/parser.py b/grammarinator/tool/parser.py index 2be9cae..d3940cc 100644 --- a/grammarinator/tool/parser.py +++ b/grammarinator/tool/parser.py @@ -34,6 +34,19 @@ def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e): error.ErrorListener.ConsoleErrorListener.INSTANCE = ConsoleListener() +class ExtendedErrorListener(error.ErrorListener.ErrorListener): + """ + Custom error listener for the ANTLR lexer ensuring to insert the + unrecognized tokens into the tree as well. + """ + def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e): + recognizer.inputStream.consume() + recognizer.type = Token.INVALID_TYPE + recognizer.channel = Token.DEFAULT_CHANNEL + recognizer.emit() + recognizer.type = Token.MIN_USER_TOKEN_TYPE + + class ParserTool: """ Tool to parse existing sources and create a tree pool from them. These @@ -185,8 +198,9 @@ def _antlr_to_grammarinator_tree(self, antlr_node, parser, visited=None): depth = max(depth, child_depth + 1) else: assert isinstance(antlr_node, TerminalNode), f'An ANTLR node must either be a ParserRuleContext or a TerminalNode but {antlr_node.__class__.__name__} was found.' - name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) >= antlr_node.symbol.type else '', antlr_node.symbol.text + name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) > antlr_node.symbol.type else '', antlr_node.symbol.text assert name, f'{name} is None or empty' + if antlr_node.symbol.type == Token.EOF: return None, 0, [] @@ -310,7 +324,9 @@ def _match_seq(grammar_vertices, tree_node_pos): # They MUST match, since ANTLR has already parsed them # During matching, quantifier and alternation structures are identified rule_children, rule_tree_node_pos = _match_seq(self._graph.vertices[rule.name].out_neighbours + [None], 0) - assert rule_children is not None, f'Failed to match {rule.name} tree node to the related grammar rule at {rule_tree_node_pos}.' + if rule_children is None: + logger.warning('Failed to match %s tree node to the related grammar rule at %d.', rule.name, rule_tree_node_pos) + return # Detach all children from the tree node so that they can be reattached # in a structured way afterwards @@ -368,21 +384,24 @@ def _reattach_children(rule, children): # Create an ANTLR tree from the input stream and convert it to Grammarinator tree. def _create_tree(self, input_stream, fn): try: - parser = self._parser_cls(CommonTokenStream(self._lexer_cls(input_stream))) + lexer = self._lexer_cls(input_stream) + lexer.addErrorListener(ExtendedErrorListener()) + parser = self._parser_cls(CommonTokenStream(lexer)) parse_tree_root = getattr(parser, self._rule)() - if not parser._syntaxErrors: - root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser) - if depth > self._max_depth: - logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth) - return None + if parser._syntaxErrors: + logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn) + + root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser) + if depth > self._max_depth: + logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth) + return None - self._adjust_tree_to_generator(rules) - for transformer in self._transformers: - root = transformer(root) + self._adjust_tree_to_generator(rules) + for transformer in self._transformers: + root = transformer(root) - return root + return root - logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn) except Exception as e: logger.warning('Exception while parsing %s.', fn, exc_info=e) return None diff --git a/tests/parser/exp6.grtj b/tests/parser/exp6.grtj new file mode 100644 index 0000000..5e44c0a --- /dev/null +++ b/tests/parser/exp6.grtj @@ -0,0 +1 @@ +{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 1, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "p", "n": "element", "c": [{"t": "l", "n": "", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}, {"t": "q", "i": 1, "b": 1, "e": -1, "c": [{"t": "qd", "c": [{"t": "l", "n": "", "s": " | ", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "", "s": "|", "z": [1, 1], "i": false}, {"t": "l", "n": "", "s": "pass", "z": [1, 1], "i": false}]}]}]}]}]}]} \ No newline at end of file diff --git a/tests/parser/exp7.grtj b/tests/parser/exp7.grtj new file mode 100644 index 0000000..892534e --- /dev/null +++ b/tests/parser/exp7.grtj @@ -0,0 +1 @@ +{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 0, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "l", "n": "", "s": "*", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}]}]}]} \ No newline at end of file diff --git a/tests/parser/inp6.txt b/tests/parser/inp6.txt new file mode 100644 index 0000000..e60f3a6 --- /dev/null +++ b/tests/parser/inp6.txt @@ -0,0 +1 @@ +pass | | pass \ No newline at end of file diff --git a/tests/parser/inp7.txt b/tests/parser/inp7.txt new file mode 100644 index 0000000..2cbfa30 --- /dev/null +++ b/tests/parser/inp7.txt @@ -0,0 +1 @@ +* pass \ No newline at end of file diff --git a/tests/test_parser.py b/tests/test_parser.py index fec34a6..987cf7c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -24,6 +24,8 @@ (os.path.join(parser_dir, 'inp3.txt'), os.path.join(parser_dir, 'exp3.grtj')), (os.path.join(parser_dir, 'inp4.txt'), os.path.join(parser_dir, 'exp4.grtj')), (os.path.join(parser_dir, 'inp5.txt'), os.path.join(parser_dir, 'exp5.grtj')), + (os.path.join(parser_dir, 'inp6.txt'), os.path.join(parser_dir, 'exp6.grtj')), + (os.path.join(parser_dir, 'inp7.txt'), os.path.join(parser_dir, 'exp7.grtj')), ]) def test_parser(inp, expected, tmpdir): with open(inp, 'r') as f: