From 57a9003eff7c8638c9ecbfa7d67af28a500e14e0 Mon Sep 17 00:00:00 2001
From: Stanislav Pankevich <s.pankevich@gmail.com>
Date: Fri, 14 Apr 2023 20:46:20 +0200
Subject: [PATCH 1/2] WIP: Optional feature: More verbose failed expression
 reporting

Two options, `verbose` and `verbose2`, are added to the Parser class. Both are
disabled by default, and the existing behavior is preserved, except that the
file name now comes first in the rendered error string.

When the option is enabled, the final "Expected ..." message is extended with
extra information about the previously "weakly failed" rules. This way,
not only the last failed NoMatch exception and its failing rules are displayed,
but also all the rules that were not matched during the whole parsing process.
---
 arpeggio/__init__.py                          | 187 ++++++++++++++++--
 arpeggio/tests/test_error_reporting.py        |   2 +-
 .../tests/test_error_reporting_verbose.py     |  39 ++++
 .../tests/test_error_reporting_verbose2.py    | 127 ++++++++++++
 4 files changed, 341 insertions(+), 14 deletions(-)
 create mode 100644 arpeggio/tests/test_error_reporting_verbose.py
 create mode 100644 arpeggio/tests/test_error_reporting_verbose2.py

diff --git a/arpeggio/__init__.py b/arpeggio/__init__.py
index 1acc831..427794e 100644
--- a/arpeggio/__init__.py
+++ b/arpeggio/__init__.py
@@ -12,11 +12,16 @@
 ###############################################################################
 
 from __future__ import print_function, unicode_literals
+
+import collections
 import sys
 from collections import OrderedDict
 import codecs
 import re
 import bisect
+from enum import Enum
+from typing import Tuple, List, Deque
+
 from arpeggio.utils import isstr
 import types
 
@@ -78,6 +83,15 @@ def eval_attrs(self):
         """
         Call this to evaluate `message`, `context`, `line` and `col`. Called by __str__.
         """
+
+        # We reach this branch if a failed NoMatch exception is created from
+        # an unmatched Not rule.
+        if self.rules is None or len(self.rules) == 0:
+            self.context = self.parser.context(position=self.position)
+            self.line, self.col = self.parser.pos_to_linecol(self.position)
+            self.message = f"Not expected input at position ({self.line}, {self.col})"
+            return
+
         def rule_to_exp_str(rule):
             if hasattr(rule, '_exp_str'):
                 # Rule may override expected report string
@@ -90,24 +104,139 @@ def rule_to_exp_str(rule):
             else:
                 return rule.name
 
-        if not self.rules:
-            self.message = "Not expected input"
+        flattened_pos_rules: List[Tuple] = list(
+            self.parser.weakly_failed_errors
+        )
+        rules_set = set(map(lambda pr: pr[1], flattened_pos_rules))
+
+        def enumerate_child_nodes(node):
+            # FIXME: How do we end up with repeating nodes in the tree?
+            visited = set()
+            queue = list(node.nodes)
+            while len(queue) > 0:
+                current = queue.pop(0)
+                if current in visited:
+                    continue
+                visited.add(current)
+                yield current
+                queue.extend(current.nodes)
+
+        if not self.parser.verbose2:
+            # Mark all nodes as relevant or irrelevant for the printed error message.
+            for _, rule in flattened_pos_rules:
+                # "Not" nodes do not contribute to the reporting of weakly failed
+                # rules.
+                assert not isinstance(rule, Not)
+                if not isinstance(rule, Match):
+                    rule.good_node = NodeMarker.BAD
+                    # Classify each child node by whether it was recorded as failed.
+                    for node in enumerate_child_nodes(rule):
+                        if not isinstance(node, Match):
+                            node.good_node = NodeMarker.BAD
+                            continue
+
+                        # Node is part of the final failed expression.
+                        if node in self.rules:
+                            node.good_node = NodeMarker.GOOD
+                        # Node has a failing parent. It is a good node.
+                        elif node in rules_set:
+                            node.good_node = NodeMarker.GOOD
+                        # Node is an orphan: it was never recorded as a failed rule.
+                        else:
+                            node.good_node = NodeMarker.BAD
+                else:
+                    rule.good_node = (
+                        NodeMarker.GOOD if rule in self.rules else NodeMarker.BAD
+                    )
+            flattened_pos_rules = list(
+                filter(
+                    lambda pr: pr[1].good_node == NodeMarker.GOOD, flattened_pos_rules
+                )
+            )
         else:
-            what_is_expected = OrderedDict.fromkeys(
-                ["{}".format(rule_to_exp_str(r)) for r in self.rules])
-            what_str = " or ".join(what_is_expected)
-            self.message = "Expected {}".format(what_str)
+            flattened_pos_rules = list(
+                filter(
+                    lambda pos_and_rule: isinstance(pos_and_rule[1], Match), flattened_pos_rules
+                )
+            )
+
+        positions = {}
+        found_positions = set()
+        for position, rule in flattened_pos_rules:
+            found_positions.add(position)
+            if rule not in positions:
+                positions[rule] = position
+            else:
+                if positions[rule] < position:
+                    positions[rule] = position
+
+        flattened_pos_rules = [(positions[k], k) for k in positions]
+        several_positions = len(found_positions) > 1
+
+        if len(flattened_pos_rules) == 0:
+            flattened_pos_rules = [(self.position, rule) for rule in self.rules]
+        flattened_pos_rules.sort(key=lambda pos_rule_: pos_rule_[0])
 
         self.context = self.parser.context(position=self.position)
         self.line, self.col = self.parser.pos_to_linecol(self.position)
 
+        if not several_positions:
+            what_is_expected = OrderedDict.fromkeys(
+                ["{}".format(rule_to_exp_str(r[1])) for r in flattened_pos_rules])
+            what_str = " or ".join(what_is_expected)
+            what_str += f" at position ({self.line}, {self.col})"
+            self.message = "Expected {}".format(what_str)
+        elif self.parser.verbose2:
+            descriptions = []
+            current_position = flattened_pos_rules[0][0]
+            current_rules = []
+            for pos, rule in flattened_pos_rules:
+                if current_position == pos:
+                    current_rules.append(rule_to_exp_str(rule))
+                else:
+                    joined_rules = " or ".join(current_rules)
+                    line, col = self.parser.pos_to_linecol(current_position)
+                    descriptions.append(
+                        f"{line}:{col}: {joined_rules}"
+                    )
+                    current_position = pos
+                    current_rules = [rule_to_exp_str(rule)]
+            joined_rules = " or ".join(current_rules)
+            line, col = self.parser.pos_to_linecol(current_position)
+            descriptions.append(
+                f"{line}:{col}: {joined_rules}"
+            )
+
+            what_str = "\n".join(descriptions)
+            self.message = "Expected:\n{}\n".format(what_str)
+        else:
+            descriptions = []
+            current_position = flattened_pos_rules[0][0]
+            current_rules = []
+            for pos, rule in flattened_pos_rules:
+                if current_position == pos:
+                    current_rules.append(rule_to_exp_str(rule))
+                else:
+                    joined_rules = " or ".join(current_rules)
+                    descriptions.append(
+                        f"{joined_rules} at position {self.parser.pos_to_linecol(current_position)}"
+                    )
+                    current_position = pos
+                    current_rules = [rule_to_exp_str(rule)]
+            joined_rules = " or ".join(current_rules)
+            descriptions.append(
+                f"{joined_rules} at position {self.parser.pos_to_linecol(current_position)}"
+            )
+
+            what_str = " or ".join(descriptions)
+            self.message = "Expected {}".format(what_str)
+
     def __str__(self):
         self.eval_attrs()
-        return "{} at position {}{} => '{}'."\
-            .format(self.message,
-                    "{}:".format(self.parser.file_name)
+        return "{}{} => '{}'."\
+            .format("{}: ".format(self.parser.file_name)
                     if self.parser.file_name else "",
-                    (self.line, self.col),
+                    self.message,
                     self.context)
 
     def __unicode__(self):
@@ -161,6 +290,11 @@ def dprint(self, message, indent_change=0):
 # ---------------------------------------------------------
 # Parser Model (PEG Abstract Semantic Graph) elements
 
+class NodeMarker(str, Enum):
+    UNKNOWN = "UNKNOWN"
+    GOOD = "GOOD"
+    BAD = "BAD"
+
 
 class ParsingExpression(object):
     """
@@ -195,7 +329,7 @@ def __init__(self, *elements, **kwargs):
         if not hasattr(nodes, '__iter__'):
             nodes = [nodes]
         self.nodes = nodes
-
+        self.good_node = NodeMarker.UNKNOWN
         if 'suppress' in kwargs:
             self.suppress = kwargs['suppress']
 
@@ -412,12 +546,14 @@ def _parse(self, parser):
             old_skipws = parser.skipws
             parser.skipws = self.skipws
 
+        successful_node = None
         try:
             for e in self.nodes:
                 try:
                     result = e.parse(parser)
                     match = True
                     result = [result]
+                    successful_node = e
                     break
                 except NoMatch:
                     parser.position = c_pos  # Backtracking
@@ -430,6 +566,11 @@ def _parse(self, parser):
         if not match:
             parser._nm_raise(self, c_pos, parser)
 
+        if parser.verbose2 and not parser.in_not:
+            for node in self.nodes:
+                if isinstance(node, Match) and node != successful_node:
+                    parser.weakly_failed_errors.append((c_pos, node))
+
         return result
 
 
@@ -1413,7 +1554,7 @@ class Parser(DebugPrinter):
     FIRST_NOT = Not()
 
     def __init__(self, skipws=True, ws=None, reduce_tree=False, autokwd=False,
-                 ignore_case=False, memoization=False, **kwargs):
+                 ignore_case=False, memoization=False, verbose=False, verbose2=False, **kwargs):
         """
         Args:
             skipws (bool): Should the whitespace skipping be done.  Default is
@@ -1473,6 +1614,12 @@ def __init__(self, skipws=True, ws=None, reduce_tree=False, autokwd=False,
         # Last parsing expression traversed
         self.last_pexpression = None
 
+        self.verbose = verbose
+        self.verbose2 = verbose2
+        self.weakly_failed_errors: Deque = (
+            collections.deque() if verbose or verbose2 else collections.deque(maxlen=0)
+        )
+
     @property
     def ws(self):
         return self._ws
@@ -1516,6 +1663,7 @@ def parse(self, _input, file_name=None):
         self.comment_positions = {}
         self.cache_hits = 0
         self.cache_misses = 0
+        self.weakly_failed_errors.clear()
         try:
             self.parse_tree = self._parse()
         except NoMatch as e:
@@ -1709,6 +1857,10 @@ def _nm_raise(self, *args):
         """
 
         rule, position, parser = args
+
+        if not self.in_not:
+            self.weakly_failed_errors.append((position, rule))
+
         if self.nm is None or not parser.in_parse_comments:
             if self.nm is None or position > self.nm.position:
                 if self.in_not:
@@ -1718,7 +1870,16 @@ def _nm_raise(self, *args):
             elif position == self.nm.position and isinstance(rule, Match) \
                     and not self.in_not:
                 self.nm.rules.append(rule)
-
+            else:
+                # We reach here if the _nm_raise is called on a failed parent
+                # expression which is not Match-based (e.g. OrderedChoice).
+                # Such parent expressions do not contribute to the final error
+                # reporting. Instead, the previously failed Match-based NoMatch
+                # exception is reported. Note that _nm_raise is always called
+                # first on the failed Match expressions and only then the
+                # failure is propagated to the parent _nm_raise invocation that
+                # reaches this branch.
+                pass
         raise self.nm
 
     def _clear_caches(self):
diff --git a/arpeggio/tests/test_error_reporting.py b/arpeggio/tests/test_error_reporting.py
index affc9f3..f8eb51a 100644
--- a/arpeggio/tests/test_error_reporting.py
+++ b/arpeggio/tests/test_error_reporting.py
@@ -90,7 +90,7 @@ def grammar():      return Optional('a'), 'b', EOF
     with pytest.raises(NoMatch) as e:
         parser.parse("\n\n   a c", file_name="test_file.peg")
     assert (
-        "Expected 'b' at position test_file.peg:(3, 6) => '     a *c'."
+        "test_file.peg: Expected 'b' at position (3, 6) => '     a *c'."
     ) == str(e.value)
     assert (e.value.line, e.value.col) == (3, 6)
 
diff --git a/arpeggio/tests/test_error_reporting_verbose.py b/arpeggio/tests/test_error_reporting_verbose.py
new file mode 100644
index 0000000..5589b86
--- /dev/null
+++ b/arpeggio/tests/test_error_reporting_verbose.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+#######################################################################
+# Name: test_error_reporting_verbose
+# Purpose: Test error reporting for various cases when verbose=True is enabled.
+# Author: Igor R. Dejanović <igor DOT dejanovic AT gmail DOT com>
+# Copyright: (c) 2015 Igor R. Dejanović <igor DOT dejanovic AT gmail DOT com>
+# License: MIT License
+#######################################################################
+from __future__ import unicode_literals
+import pytest
+
+from arpeggio import Optional, Not, ParserPython, NoMatch, EOF, Sequence, RegExMatch, StrMatch, OrderedChoice
+from arpeggio import RegExMatch as _
+
+
+def test_optional_with_better_match():
+    """
+    Test that optional match that has gone further in the input stream
+    has precedence over non-optional.
+    """
+
+    def grammar():  return [first, (Optional(second), 'six')]
+    def first():    return 'one', 'two', 'three', '4'
+    def second():   return 'one', 'two', 'three', 'four', 'five'
+
+    parser = ParserPython(grammar, verbose=True)
+    assert parser.verbose
+
+    with pytest.raises(NoMatch) as e:
+        parser.parse('one two three four 5')
+
+    assert (
+        "Expected "
+        "'six' at position (1, 1) or "
+        "'4' at position (1, 15) or "
+        "'five' at position (1, 20) => "
+        "'hree four *5'."
+    ) == str(e.value)
+    assert (e.value.line, e.value.col) == (1, 20)
diff --git a/arpeggio/tests/test_error_reporting_verbose2.py b/arpeggio/tests/test_error_reporting_verbose2.py
new file mode 100644
index 0000000..cdc81d1
--- /dev/null
+++ b/arpeggio/tests/test_error_reporting_verbose2.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+#######################################################################
+# Name: test_error_reporting_verbose2
+# Purpose: Test error reporting for various cases when verbose2=True is enabled.
+# Author: Igor R. Dejanović <igor DOT dejanovic AT gmail DOT com>
+# Copyright: (c) 2015 Igor R. Dejanović <igor DOT dejanovic AT gmail DOT com>
+# License: MIT License
+#######################################################################
+from __future__ import unicode_literals
+import pytest
+
+from arpeggio import Optional, Not, ParserPython, NoMatch, EOF, Sequence, \
+    RegExMatch, StrMatch, OrderedChoice, UnorderedGroup, ZeroOrMore, OneOrMore
+from arpeggio import RegExMatch as _
+
+
+def test_ordered_choice():
+    def grammar():
+        return ["a", "b", "c"], EOF
+
+    parser = ParserPython(grammar, verbose2=True)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("ab")
+    assert (
+       "Expected:\n"
+       "1:1: 'b' or 'c'\n"
+       "1:2: EOF\n"
+       " => 'a*b'."
+    ) == str(e.value)
+
+    parser = ParserPython(grammar, verbose2=True)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("bb")
+    assert (
+       "Expected:\n"
+       "1:1: 'a' or 'c'\n"
+       "1:2: EOF\n"
+       " => 'b*b'."
+    ) == str(e.value)
+
+
+def test_unordered_group_with_optionals_and_separator():
+    def grammar():
+        return UnorderedGroup("a", Optional("b"), "c", sep=","), EOF
+
+    parser = ParserPython(grammar)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("a, c, ")
+    assert (
+       "Expected 'b' at position (1, 7) => 'a, c, *'."
+    ) == str(e.value)
+
+    parser = ParserPython(grammar, verbose2=True)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("a, c, ")
+    assert (
+       "Expected:\n"
+       "1:5: EOF\n"
+       "1:7: 'b'\n"
+       " => 'a, c, *'."
+    ) == str(e.value)
+
+
+def test_zero_or_more_with_separator():
+    def grammar():
+        return ZeroOrMore("a", sep=","), EOF
+
+    parser = ParserPython(grammar)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("a,a ,a,")
+    assert (
+       "Expected 'a' at position (1, 8) => 'a,a ,a,*'."
+    ) == str(e.value)
+
+    parser = ParserPython(grammar, verbose2=True)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("a,a ,a,")
+    assert (
+       "Expected:\n"
+       "1:7: EOF\n"
+       "1:8: 'a'\n"
+       " => 'a,a ,a,*'."
+    ) == str(e.value)
+
+
+def test_zero_or_more_with_optional_separator():
+    def grammar():
+        return ZeroOrMore("a", sep=RegExMatch(",?")), EOF
+
+    parser = ParserPython(grammar)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("a,a ,a,")
+    assert (
+       "Expected 'a' at position (1, 8) => 'a,a ,a,*'."
+    ) == str(e.value)
+
+    parser = ParserPython(grammar, verbose2=True)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("a,a ,a,")
+    assert (
+       "Expected:\n"
+       "1:7: EOF\n"
+       "1:8: 'a'\n"
+       " => 'a,a ,a,*'."
+    ) == str(e.value)
+
+
+def test_one_or_more_with_optional_separator():
+    def grammar():
+        return OneOrMore("a", sep=RegExMatch(",?")), "b"
+
+    parser = ParserPython(grammar)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("a a, b")
+    assert (
+        "Expected 'a' at position (1, 6) => 'a a, *b'."
+    ) == str(e.value)
+
+    parser = ParserPython(grammar, verbose2=True)
+    with pytest.raises(NoMatch) as e:
+        parser.parse("a a, b")
+    assert (
+        "Expected:\n"
+        "1:4: 'b'\n"
+        "1:6: 'a'\n"
+        " => 'a a, *b'."
+    ) == str(e.value)

From e6de5306ce5a0ff5f5bfccf09b68e9a5c6c893ac Mon Sep 17 00:00:00 2001
From: Stanislav Pankevich <s.pankevich@gmail.com>
Date: Sun, 23 Apr 2023 15:23:35 +0200
Subject: [PATCH 2/2] WIP: capture more weakly failed rules (ZeroOrMore,
 Sequence)

---
 arpeggio/__init__.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arpeggio/__init__.py b/arpeggio/__init__.py
index 427794e..39c03c7 100644
--- a/arpeggio/__init__.py
+++ b/arpeggio/__init__.py
@@ -512,8 +512,12 @@ def _parse(self, parser):
             for e in self.nodes:
                 result = e.parse(parser)
                 if result is not None:
+                    if parser.verbose2 and isinstance(result, list) and len(result) == 0:
+                        parser.weakly_failed_errors.append((c_pos, e))
                     append(result)
-
+                else:
+                    if parser.verbose2:
+                        parser.weakly_failed_errors.append((c_pos, e))
         except NoMatch:
             parser.position = c_pos     # Backtracking
             raise
@@ -635,6 +639,8 @@ def _parse(self, parser):
                 append(result)
             except NoMatch:
                 parser.position = c_pos  # Backtracking
+                if parser.verbose2:
+                    parser.weakly_failed_errors.append((c_pos, self.nodes[0]))
                 break
 
         if self.eolterm: