WIP: Optional feature: More verbose failed expression reporting

An option is added to the Parser class. The option is disabled by default, so the existing behavior is fully preserved. When the option is enabled, the final "Expected ..." message is extended with extra information about the rules that "weakly failed" earlier in the parse. This way, the message reports not only the last failed NoMatch exception and its failing rules, but also all the rules that were not matched during the whole parsing process.
textX · Apr 22, 2023 · 66f4b3a · 66f4b3a
1 parent 0b47dc0
commit 66f4b3a
Show file tree

Hide file tree

Showing 5 changed files with 405 additions and 36 deletions.
diff --git a/arpeggio/__init__.py b/arpeggio/__init__.py
@@ -12,11 +12,16 @@
 ###############################################################################
 
 from __future__ import print_function, unicode_literals
+
+import collections
 import sys
 from collections import OrderedDict
 import codecs
 import re
 import bisect
+from enum import Enum
+from typing import Tuple, List, Deque
+
 from arpeggio.utils import isstr
 import types
 
@@ -78,6 +83,15 @@ def eval_attrs(self):
         """
         Call this to evaluate `message`, `context`, `line` and `col`. Called by __str__.
         """
+
+        # We reach this branch if a failed NoMatch exception is created from
+        # an unmatched Not rule.
+        if self.rules is None or len(self.rules) == 0:
+            self.context = self.parser.context(position=self.position)
+            self.line, self.col = self.parser.pos_to_linecol(self.position)
+            self.message = f"Not expected input at position ({self.line}, {self.col})"
+            return
+
         def rule_to_exp_str(rule):
             if hasattr(rule, '_exp_str'):
                 # Rule may override expected report string
@@ -90,24 +104,150 @@ def rule_to_exp_str(rule):
             else:
                 return rule.name
 
-        if not self.rules:
-            self.message = "Not expected input"
+        flattened_pos_rules: List[Tuple] = list(
+            self.parser.weakly_failed_errors
+        )
+        rules_set = set(map(lambda pr: pr[1], flattened_pos_rules))
+
+        def enumerate_child_nodes(node):
+            # FIXME: How do we end up with repeating nodes in the tree?
+            visited = set()
+            queue = list(node.nodes)
+            while len(queue) > 0:
+                current = queue.pop(0)
+                if current in visited:
+                    continue
+                visited.add(current)
+                yield current
+                queue.extend(current.nodes)
+
+        if not self.parser.verbose2:
+            # Mark all nodes as relevant or irrelevant for the printed error message.
+            for _, rule in flattened_pos_rules:
+                # "Not" nodes do not contribute to the reporting of weakly failed
+                # rules.
+                assert not isinstance(rule, Not)
+                if not isinstance(rule, Match):
+                    rule.good_node = NodeMarker.BAD
+                    # We find if all nodes have parents.
+                    for node in enumerate_child_nodes(rule):
+                        if not isinstance(node, Match):
+                            node.good_node = NodeMarker.BAD
+                            continue
+
+                        # Node is part of the final failed expression.
+                        if node in self.rules:
+                            node.good_node = NodeMarker.GOOD
+                        # Node has a failing parent. It is a good node.
+                        elif node in rules_set:
+                            node.good_node = NodeMarker.GOOD
+                        # Node is orphan. **Nothing was unsuccessful** with this node.
+                        else:
+                            node.good_node = NodeMarker.BAD
+                else:
+                    rule.good_node = (
+                        NodeMarker.GOOD if rule in self.rules else NodeMarker.BAD
+                    )
+            flattened_pos_rules = list(
+                filter(
+                    lambda pr: pr[1].good_node == NodeMarker.GOOD, flattened_pos_rules
+                )
+            )
         else:
-            what_is_expected = OrderedDict.fromkeys(
-                ["{}".format(rule_to_exp_str(r)) for r in self.rules])
-            what_str = " or ".join(what_is_expected)
-            self.message = "Expected {}".format(what_str)
+            flattened_pos_rules = list(
+                filter(
+                    lambda pos_and_rule: isinstance(pos_and_rule[1], Match), flattened_pos_rules
+                )
+            )
+
+        positions = {}
+        for position, rule in flattened_pos_rules:
+            if rule not in positions:
+                positions[rule] = position
+            else:
+                if positions[rule] < position:
+                    positions[rule] = position
+
+        flattened_pos_rules = [(positions[k], k) for k in positions]
+
+
+        several_positions = False
+        current_failed_position = None
+        for pos_rule in flattened_pos_rules:
+            if current_failed_position is None:
+                current_failed_position = pos_rule[0]
+                continue
+            if current_failed_position != pos_rule[0]:
+                several_positions = True
+            if current_failed_position > pos_rule[0]:
+                current_failed_position = pos_rule[0]
+
+        if current_failed_position is None:
+            current_failed_position = self.position
+        if len(flattened_pos_rules) == 0:
+            flattened_pos_rules = [(self.position, rule) for rule in self.rules]
+        flattened_pos_rules.sort(key=lambda pos_rule_: pos_rule_[0])
 
         self.context = self.parser.context(position=self.position)
         self.line, self.col = self.parser.pos_to_linecol(self.position)
 
+        if not several_positions:
+            what_is_expected = OrderedDict.fromkeys(
+                ["{}".format(rule_to_exp_str(r[1])) for r in flattened_pos_rules])
+            what_str = " or ".join(what_is_expected)
+            what_str += f" at position ({self.line}, {self.col})"
+            self.message = "Expected {}".format(what_str)
+        elif self.parser.verbose2:
+            descriptions = []
+            current_position = flattened_pos_rules[0][0]
+            current_rules = []
+            for pos, rule in flattened_pos_rules:
+                if current_position == pos:
+                    current_rules.append(rule_to_exp_str(rule))
+                else:
+                    joined_rules = " or ".join(current_rules)
+                    line, col = self.parser.pos_to_linecol(current_position)
+                    descriptions.append(
+                        f"{line}:{col}: {joined_rules}"
+                    )
+                    current_position = pos
+                    current_rules = [rule_to_exp_str(rule)]
+            joined_rules = " or ".join(current_rules)
+            line, col = self.parser.pos_to_linecol(current_position)
+            descriptions.append(
+                f"{line}:{col}: {joined_rules}"
+            )
+
+            what_str = "\n".join(descriptions)
+            self.message = "Expected:\n{}\n".format(what_str)
+        else:
+            descriptions = []
+            current_position = flattened_pos_rules[0][0]
+            current_rules = []
+            for pos, rule in flattened_pos_rules:
+                if current_position == pos:
+                    current_rules.append(rule_to_exp_str(rule))
+                else:
+                    joined_rules = " or ".join(current_rules)
+                    descriptions.append(
+                        f"{joined_rules} at position {self.parser.pos_to_linecol(current_position)}"
+                    )
+                    current_position = pos
+                    current_rules = [rule_to_exp_str(rule)]
+            joined_rules = " or ".join(current_rules)
+            descriptions.append(
+                f"{joined_rules} at position {self.parser.pos_to_linecol(current_position)}"
+            )
+
+            what_str = " or ".join(descriptions)
+            self.message = "Expected {}".format(what_str)
+
     def __str__(self):
         self.eval_attrs()
-        return "{} at position {}{} => '{}'."\
-            .format(self.message,
-                    "{}:".format(self.parser.file_name)
+        return "{}{} => '{}'."\
+            .format("{}: ".format(self.parser.file_name)
                     if self.parser.file_name else "",
-                    (self.line, self.col),
+                    self.message,
                     self.context)
 
     def __unicode__(self):
@@ -161,6 +301,11 @@ def dprint(self, message, indent_change=0):
 # ---------------------------------------------------------
 # Parser Model (PEG Abstract Semantic Graph) elements
 
+class NodeMarker(str, Enum):
+    UNKNOWN = "UNKNOWN"
+    GOOD = "GOOD"
+    BAD = "BAD"
+
 
 class ParsingExpression(object):
     """
@@ -195,7 +340,7 @@ def __init__(self, *elements, **kwargs):
         if not hasattr(nodes, '__iter__'):
             nodes = [nodes]
         self.nodes = nodes
-
+        self.good_node = NodeMarker.UNKNOWN
         if 'suppress' in kwargs:
             self.suppress = kwargs['suppress']
 
@@ -430,6 +575,11 @@ def _parse(self, parser):
         if not match:
             parser._nm_raise(self, c_pos, parser)
 
+        if parser.verbose2 and not parser.in_not:
+            for node in self.nodes:
+                if isinstance(node, Match):
+                    parser.weakly_failed_errors.append((c_pos, node))
+
         return result
 
 
@@ -1413,7 +1563,7 @@ class Parser(DebugPrinter):
     FIRST_NOT = Not()
 
     def __init__(self, skipws=True, ws=None, reduce_tree=False, autokwd=False,
-                 ignore_case=False, memoization=False, **kwargs):
+                 ignore_case=False, memoization=False, verbose=False, verbose2=False, **kwargs):
         """
         Args:
             skipws (bool): Should the whitespace skipping be done.  Default is
@@ -1473,6 +1623,12 @@ def __init__(self, skipws=True, ws=None, reduce_tree=False, autokwd=False,
         # Last parsing expression traversed
         self.last_pexpression = None
 
+        self.verbose = verbose
+        self.verbose2 = verbose2
+        self.weakly_failed_errors: Deque = (
+            collections.deque() if verbose or verbose2 else collections.deque(maxlen=0)
+        )
+
     @property
     def ws(self):
         return self._ws
@@ -1709,6 +1865,10 @@ def _nm_raise(self, *args):
         """
 
         rule, position, parser = args
+
+        if not self.in_not:
+            self.weakly_failed_errors.append((position, rule))
+
         if self.nm is None or not parser.in_parse_comments:
             if self.nm is None or position > self.nm.position:
                 if self.in_not:
@@ -1718,7 +1878,16 @@ def _nm_raise(self, *args):
             elif position == self.nm.position and isinstance(rule, Match) \
                     and not self.in_not:
                 self.nm.rules.append(rule)
-
+            else:
+                # We reach here if the _nm_raise is called on a failed parent
+                # expression which is not Match-based (e.g. OrderedChoice).
+                # Such parent expressions do not contribute to the final error
+                # reporting. Instead, the previously failed Match-based NoMatch
+                # exception is reported. Note that _nm_raise is always called
+                # first on the failed Match expressions and only then the
+                # failure is propagated to the parent _nm_raise invocation that
+                # reaches this branch.
+                pass
         raise self.nm
 
     def _clear_caches(self):

diff --git a/arpeggio/tests/test_error_reporting.py b/arpeggio/tests/test_error_reporting.py
@@ -90,7 +90,7 @@ def grammar():      return Optional('a'), 'b', EOF
     with pytest.raises(NoMatch) as e:
         parser.parse("\n\n   a c", file_name="test_file.peg")
     assert (
-        "Expected 'b' at position test_file.peg:(3, 6) => '     a *c'."
+        "test_file.peg: Expected 'b' at position (3, 6) => '     a *c'."
     ) == str(e.value)
     assert (e.value.line, e.value.col) == (3, 6)
 

diff --git a/arpeggio/tests/test_error_reporting_verbose.py b/arpeggio/tests/test_error_reporting_verbose.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+#######################################################################
+# Name: test_error_reporting_verbose
+# Purpose: Test error reporting for various cases when verbose=True is enabled.
+# Author: Igor R. Dejanović <igor DOT dejanovic AT gmail DOT com>
+# Copyright: (c) 2015 Igor R. Dejanović <igor DOT dejanovic AT gmail DOT com>
+# License: MIT License
+#######################################################################
+from __future__ import unicode_literals
+import pytest
+
+from arpeggio import Optional, Not, ParserPython, NoMatch, EOF, Sequence, RegExMatch, StrMatch, OrderedChoice
+from arpeggio import RegExMatch as _
+
+
+def test_optional_with_better_match():
+    """
+    Test that optional match that has gone further in the input stream
+    has precedence over non-optional.
+    """
+
+    def grammar():  return [first, (Optional(second), 'six')]
+    def first():    return 'one', 'two', 'three', '4'
+    def second():   return 'one', 'two', 'three', 'four', 'five'
+
+    parser = ParserPython(grammar, verbose=True)
+    assert parser.verbose
+
+    with pytest.raises(NoMatch) as e:
+        parser.parse('one two three four 5')
+
+    assert (
+        "Expected "
+        "'six' at position (1, 1) or "
+        "'4' at position (1, 15) or "
+        "'five' at position (1, 20) => "
+        "'hree four *5'."
+    ) == str(e.value)
+    assert (e.value.line, e.value.col) == (1, 20)