From 2b5a5d1cd8e61d551a7b310b1feb4f84465f3b83 Mon Sep 17 00:00:00 2001
From: Renata Hodovan <reni@inf.u-szeged.hu>
Date: Sun, 24 Nov 2024 23:00:27 +0100
Subject: [PATCH] Enable parsing of input seeds with syntax errors

Previously, `grammarinator-parse` transformed a seed file into a tree
only if the file could be fully recognized by the grammar. This
restriction excluded partially recognizable seeds, which can be
valuable for fuzzing.

With this change, `grammarinator-parse` also supports transforming
seeds containing syntax errors.
---
 grammarinator/tool/parser.py | 45 +++++++++++++++++++++++++-----------
 tests/parser/exp6.grtj       |  1 +
 tests/parser/exp7.grtj       |  1 +
 tests/parser/inp6.txt        |  1 +
 tests/parser/inp7.txt        |  1 +
 tests/test_parser.py         |  2 ++
 6 files changed, 38 insertions(+), 13 deletions(-)
 create mode 100644 tests/parser/exp6.grtj
 create mode 100644 tests/parser/exp7.grtj
 create mode 100644 tests/parser/inp6.txt
 create mode 100644 tests/parser/inp7.txt
diff --git a/grammarinator/tool/parser.py b/grammarinator/tool/parser.py
index 2be9cae..d3940cc 100644
--- a/grammarinator/tool/parser.py
+++ b/grammarinator/tool/parser.py
@@ -34,6 +34,19 @@ def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
 error.ErrorListener.ConsoleErrorListener.INSTANCE = ConsoleListener()
 
 
+class ExtendedErrorListener(error.ErrorListener.ErrorListener):
+    """
+    Custom error listener for the ANTLR lexer that ensures unrecognized
+    tokens are inserted into the tree as well.
+    """
+    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
+        recognizer.inputStream.consume()
+        recognizer.type = Token.INVALID_TYPE
+        recognizer.channel = Token.DEFAULT_CHANNEL
+        recognizer.emit()
+        recognizer.type = Token.MIN_USER_TOKEN_TYPE
+
+
 class ParserTool:
     """
     Tool to parse existing sources and create a tree pool from them. These
@@ -185,8 +198,9 @@ def _antlr_to_grammarinator_tree(self, antlr_node, parser, visited=None):
                 depth = max(depth, child_depth + 1)
         else:
             assert isinstance(antlr_node, TerminalNode), f'An ANTLR node must either be a ParserRuleContext or a TerminalNode but {antlr_node.__class__.__name__} was found.'
-            name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) >= antlr_node.symbol.type else '<INVALID>', antlr_node.symbol.text
+            name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) > antlr_node.symbol.type else '<INVALID>', antlr_node.symbol.text
             assert name, f'{name} is None or empty'
+
             if antlr_node.symbol.type == Token.EOF:
                 return None, 0, []
 
@@ -310,7 +324,9 @@ def _match_seq(grammar_vertices, tree_node_pos):
             # They MUST match, since ANTLR has already parsed them
             # During matching, quantifier and alternation structures are identified
             rule_children, rule_tree_node_pos = _match_seq(self._graph.vertices[rule.name].out_neighbours + [None], 0)
-            assert rule_children is not None, f'Failed to match {rule.name} tree node to the related grammar rule at {rule_tree_node_pos}.'
+            if rule_children is None:
+                logger.warning('Failed to match %s tree node to the related grammar rule at %d.', rule.name, rule_tree_node_pos)
+                return
 
             # Detach all children from the tree node so that they can be reattached
             # in a structured way afterwards
@@ -368,21 +384,24 @@ def _reattach_children(rule, children):
     # Create an ANTLR tree from the input stream and convert it to Grammarinator tree.
     def _create_tree(self, input_stream, fn):
         try:
-            parser = self._parser_cls(CommonTokenStream(self._lexer_cls(input_stream)))
+            lexer = self._lexer_cls(input_stream)
+            lexer.addErrorListener(ExtendedErrorListener())
+            parser = self._parser_cls(CommonTokenStream(lexer))
             parse_tree_root = getattr(parser, self._rule)()
-            if not parser._syntaxErrors:
-                root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser)
-                if depth > self._max_depth:
-                    logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth)
-                    return None
+            if parser._syntaxErrors:
+                logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn)
+
+            root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser)
+            if depth > self._max_depth:
+                logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth)
+                return None
 
-                self._adjust_tree_to_generator(rules)
-                for transformer in self._transformers:
-                    root = transformer(root)
+            self._adjust_tree_to_generator(rules)
+            for transformer in self._transformers:
+                root = transformer(root)
 
-                return root
+            return root
 
-            logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn)
         except Exception as e:
             logger.warning('Exception while parsing %s.', fn, exc_info=e)
         return None
diff --git a/tests/parser/exp6.grtj b/tests/parser/exp6.grtj
new file mode 100644
index 0000000..5e44c0a
--- /dev/null
+++ b/tests/parser/exp6.grtj
@@ -0,0 +1 @@
+{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 1, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}, {"t": "q", "i": 1, "b": 1, "e": -1, "c": [{"t": "qd", "c": [{"t": "l", "n": "<INVALID>", "s": " | ", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "|", "z": [1, 1], "i": false}, {"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}]}]}]}]}]}]}
\ No newline at end of file
diff --git a/tests/parser/exp7.grtj b/tests/parser/exp7.grtj
new file mode 100644
index 0000000..892534e
--- /dev/null
+++ b/tests/parser/exp7.grtj
@@ -0,0 +1 @@
+{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 0, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "l", "n": "<INVALID>", "s": "*", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}]}]}]}
\ No newline at end of file
diff --git a/tests/parser/inp6.txt b/tests/parser/inp6.txt
new file mode 100644
index 0000000..e60f3a6
--- /dev/null
+++ b/tests/parser/inp6.txt
@@ -0,0 +1 @@
+pass | | pass
\ No newline at end of file
diff --git a/tests/parser/inp7.txt b/tests/parser/inp7.txt
new file mode 100644
index 0000000..2cbfa30
--- /dev/null
+++ b/tests/parser/inp7.txt
@@ -0,0 +1 @@
+* pass
\ No newline at end of file
diff --git a/tests/test_parser.py b/tests/test_parser.py
index fec34a6..987cf7c 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -24,6 +24,8 @@
     (os.path.join(parser_dir, 'inp3.txt'), os.path.join(parser_dir, 'exp3.grtj')),
     (os.path.join(parser_dir, 'inp4.txt'), os.path.join(parser_dir, 'exp4.grtj')),
     (os.path.join(parser_dir, 'inp5.txt'), os.path.join(parser_dir, 'exp5.grtj')),
+    (os.path.join(parser_dir, 'inp6.txt'), os.path.join(parser_dir, 'exp6.grtj')),
+    (os.path.join(parser_dir, 'inp7.txt'), os.path.join(parser_dir, 'exp7.grtj')),
 ])
 def test_parser(inp, expected, tmpdir):
     with open(inp, 'r') as f: