Enable parsing of input seeds with syntax errors
Previously, only seed files that could be fully recognized by the
grammar were transformed into trees by grammarinator-parse. This
restriction excluded partially recognizable seeds, which can be
valuable for fuzzing.
With this change, `grammarinator-parse` now supports transforming
seeds containing syntax errors.
renatahodovan committed Nov 26, 2024
1 parent f5de911 commit 2b5a5d1
Showing 6 changed files with 38 additions and 13 deletions.
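
For context, the change hinges on attaching an error listener to the ANTLR lexer so that input the lexer cannot recognize is still emitted as tokens (with an invalid token type) instead of being silently dropped, which lets the parser build a tree around the erroneous parts of a seed. Below is a minimal, self-contained sketch of that idea using the antlr4 Python runtime; it mirrors the ExtendedErrorListener introduced in the diff, but MyLexer, the listener name, and the sample input are placeholders/assumptions for illustration, not part of this commit.

# Sketch only: mirrors the lexer-level error listener added in this commit.
# `MyLexer` stands in for any ANTLR4-generated Python lexer class.
from antlr4 import CommonTokenStream, InputStream, Token
from antlr4.error.ErrorListener import ErrorListener


class LenientLexerErrorListener(ErrorListener):
    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
        # Consume the offending character and emit it as a regular token on the
        # default channel instead of discarding it.
        recognizer.inputStream.consume()
        recognizer.type = Token.INVALID_TYPE
        recognizer.channel = Token.DEFAULT_CHANNEL
        recognizer.emit()
        recognizer.type = Token.MIN_USER_TOKEN_TYPE


lexer = MyLexer(InputStream('pass | | pass'))  # illustrative seed with a syntax error
lexer.addErrorListener(LenientLexerErrorListener())
tokens = CommonTokenStream(lexer)
tokens.fill()  # unrecognized characters now appear in the token stream instead of being skipped
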
45 changes: 32 additions & 13 deletions grammarinator/tool/parser.py
@@ -34,6 +34,19 @@ def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
 error.ErrorListener.ConsoleErrorListener.INSTANCE = ConsoleListener()
 
 
+class ExtendedErrorListener(error.ErrorListener.ErrorListener):
+    """
+    Custom error listener for the ANTLR lexer ensuring to insert the
+    unrecognized tokens into the tree as well.
+    """
+    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
+        recognizer.inputStream.consume()
+        recognizer.type = Token.INVALID_TYPE
+        recognizer.channel = Token.DEFAULT_CHANNEL
+        recognizer.emit()
+        recognizer.type = Token.MIN_USER_TOKEN_TYPE
+
+
 class ParserTool:
     """
     Tool to parse existing sources and create a tree pool from them. These
@@ -185,8 +198,9 @@ def _antlr_to_grammarinator_tree(self, antlr_node, parser, visited=None):
                 depth = max(depth, child_depth + 1)
         else:
             assert isinstance(antlr_node, TerminalNode), f'An ANTLR node must either be a ParserRuleContext or a TerminalNode but {antlr_node.__class__.__name__} was found.'
-            name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) >= antlr_node.symbol.type else '<INVALID>', antlr_node.symbol.text
+            name, text = parser.symbolicNames[antlr_node.symbol.type] if len(parser.symbolicNames) > antlr_node.symbol.type else '<INVALID>', antlr_node.symbol.text
             assert name, f'{name} is None or empty'
+
             if antlr_node.symbol.type == Token.EOF:
                 return None, 0, []
 
@@ -310,7 +324,9 @@ def _match_seq(grammar_vertices, tree_node_pos):
             # They MUST match, since ANTLR has already parsed them
             # During matching, quantifier and alternation structures are identified
             rule_children, rule_tree_node_pos = _match_seq(self._graph.vertices[rule.name].out_neighbours + [None], 0)
-            assert rule_children is not None, f'Failed to match {rule.name} tree node to the related grammar rule at {rule_tree_node_pos}.'
+            if rule_children is None:
+                logger.warning('Failed to match %s tree node to the related grammar rule at %d.', rule.name, rule_tree_node_pos)
+                return
 
             # Detach all children from the tree node so that they can be reattached
             # in a structured way afterwards
@@ -368,21 +384,24 @@ def _reattach_children(rule, children):
     # Create an ANTLR tree from the input stream and convert it to Grammarinator tree.
     def _create_tree(self, input_stream, fn):
         try:
-            parser = self._parser_cls(CommonTokenStream(self._lexer_cls(input_stream)))
+            lexer = self._lexer_cls(input_stream)
+            lexer.addErrorListener(ExtendedErrorListener())
+            parser = self._parser_cls(CommonTokenStream(lexer))
             parse_tree_root = getattr(parser, self._rule)()
-            if not parser._syntaxErrors:
-                root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser)
-                if depth > self._max_depth:
-                    logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth)
-                    return None
+            if parser._syntaxErrors:
+                logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn)
+
+            root, depth, rules = self._antlr_to_grammarinator_tree(parse_tree_root, parser)
+            if depth > self._max_depth:
+                logger.info('The tree representation of %s is %s, too deep. Skipping.', fn, depth)
+                return None
 
-                self._adjust_tree_to_generator(rules)
-                for transformer in self._transformers:
-                    root = transformer(root)
+            self._adjust_tree_to_generator(rules)
+            for transformer in self._transformers:
+                root = transformer(root)
 
-                return root
+            return root
 
-            logger.warning('%s syntax errors detected in %s.', parser._syntaxErrors, fn)
         except Exception as e:
             logger.warning('Exception while parsing %s.', fn, exc_info=e)
         return None
1 change: 1 addition & 0 deletions tests/parser/exp6.grtj
@@ -0,0 +1 @@
+{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 1, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}, {"t": "q", "i": 1, "b": 1, "e": -1, "c": [{"t": "qd", "c": [{"t": "l", "n": "<INVALID>", "s": " | ", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "|", "z": [1, 1], "i": false}, {"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}]}]}]}]}]}]}
1 change: 1 addition & 0 deletions tests/parser/exp7.grtj
@@ -0,0 +1 @@
+{"t": "p", "n": "start", "c": [{"t": "a", "ai": 0, "i": 0, "c": [{"t": "p", "n": "start_Quantifiers_test", "c": [{"t": "l", "n": "<INVALID>", "s": "*", "z": [1, 1], "i": false}, {"t": "p", "n": "element", "c": [{"t": "l", "n": "<INVALID>", "s": "pass", "z": [1, 1], "i": false}, {"t": "q", "i": 0, "b": 0, "e": 1, "c": []}]}]}]}]}
1 change: 1 addition & 0 deletions tests/parser/inp6.txt
@@ -0,0 +1 @@
+pass | | pass
1 change: 1 addition & 0 deletions tests/parser/inp7.txt
@@ -0,0 +1 @@
+* pass
2 changes: 2 additions & 0 deletions tests/test_parser.py
@@ -24,6 +24,8 @@
     (os.path.join(parser_dir, 'inp3.txt'), os.path.join(parser_dir, 'exp3.grtj')),
     (os.path.join(parser_dir, 'inp4.txt'), os.path.join(parser_dir, 'exp4.grtj')),
     (os.path.join(parser_dir, 'inp5.txt'), os.path.join(parser_dir, 'exp5.grtj')),
+    (os.path.join(parser_dir, 'inp6.txt'), os.path.join(parser_dir, 'exp6.grtj')),
+    (os.path.join(parser_dir, 'inp7.txt'), os.path.join(parser_dir, 'exp7.grtj')),
 ])
 def test_parser(inp, expected, tmpdir):
     with open(inp, 'r') as f:
