From b3ec8d1abdcd1f883b08f544ae75cd00feb4296a Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 18 Jul 2024 15:47:32 +0200
Subject: [PATCH] ICU-22707 Port the old monkey rule changes to ICU4J

---
 .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 265 +++++++++++++++---
 1 file changed, 223 insertions(+), 42 deletions(-)

diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
index 368060aa7f0a..66f7a94024a8 100644
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@@ -977,7 +977,7 @@ int next(int startPos) {
                 // LB 9 Treat X CM* as if it were X
                 //        No explicit action required.
 
-                // LB 10     Treat any remaining combining mark as AL
+                // LB 10     Treat any remaining combining mark as lb=AL, ea=Na
                 if (fCM.contains(thisChar)) {
                     thisChar = 'A';
                 }
@@ -1035,32 +1035,6 @@ int next(int startPos) {
                     break;
                 }
 
-                //          Move this test up, before LB8a, because numbers can match a longer sequence that would
-                //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
-                matchVals = LBNumberCheck(fText, prevPos, matchVals);
-                if (matchVals[0] != -1) {
-                    // Matched a number.  But could have been just a single digit, which would
-                    //    not represent a "no break here" between prevChar and thisChar
-                    int numEndIdx = matchVals[1];  // idx of first char following num
-                    if (numEndIdx > pos) {
-                        // Number match includes at least the two chars being checked
-                        if (numEndIdx > nextPos) {
-                            // Number match includes additional chars.  Update pos and nextPos
-                            //   so that next loop iteration will continue at the end of the number,
-                            //   checking for breaks between last char in number & whatever follows.
-                            nextPos = numEndIdx;
-                            pos     = numEndIdx;
-                            do {
-                                pos = moveIndex32(fText, pos, -1);
-                                thisChar = UTF16.charAt(fText, pos);
-                            }
-                            while (fCM.contains(thisChar));
-                        }
-                        setAppliedRule(pos, "LB 25  Numbers");
-                        continue;
-                    }
-                }
-
                 //       The monkey test's way of ignoring combining characters doesn't work
                 //       for this rule. ZWJ is also a CM. Need to get the actual character
                 //       preceding "thisChar", not ignoring combining marks, possibly ZWJ.
@@ -1217,26 +1191,108 @@ int next(int startPos) {
                     break;
                 }
 
-                //    x   QU
-                //    QU  x
-                if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
-                        setAppliedRule(pos, "LB 19");
+                // LB 19
+                // × [QU-\p{Pi}]
+                if (fQU.contains(thisChar) && !fPi.contains(thisChar)) {
+                        setAppliedRule(pos, "LB 19 × [QU-\\p{Pi}]");
+                    continue;
+                }
+                // [QU-\p{Pf}] ×
+                if (fQU.contains(prevChar) && !fPf.contains(prevChar)) {
+                    setAppliedRule(pos, "LB 19 [QU-\\p{Pf}] ×");
+                    continue;
+                }
+
+                // LB 19a
+                // [^\p{ea=F}\p{ea=W}\p{ea=H}] × QU
+                if (!feaFWH.contains(prevChar) && fQU.contains(thisChar)) {
+                    setAppliedRule(pos, "LB 19a [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] × QU");
+                    continue;
+                }
+                // × QU ( [^\p{ea=F}\p{ea=W}\p{ea=H}] | eot )
+                if (fQU.contains(thisChar)) {
+                    if (nextPos < fText.length()) {
+                        int nextChar = fText.codePointAt(nextPos);
+                        if (!feaFWH.contains(nextChar)) {
+                            setAppliedRule(pos, "LB 19a × QU [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]");
+                            continue;
+                        }
+                    } else {
+                        setAppliedRule(pos, "LB 19 × QU eot");
+                        continue;
+                    }
+                }
+                // QU × [^\p{ea=F}\p{ea=W}\p{ea=H}]
+                if (fQU.contains(prevChar) && !feaFWH.contains(thisChar)) {
+                    setAppliedRule(pos, "LB 19a QU × [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]");
                     continue;
                 }
+                // ( sot | [^\p{ea=F}\p{ea=W}\p{ea=H}] ) QU ×
+                if (fQU.contains(prevChar)) {
+                    if (prevPos == 0) {
+                        setAppliedRule(pos, "LB 19a sot QU ×");
+                        continue;
+                    }
+                    // prevPosX2 is -1 if there was a break, and prevCharX2 is 0; but the UAX #14 rules can
+                    // look through breaks.
+                    int breakObliviousPrevPosX2 = moveIndex32(fText, prevPos, -1);
+                    while (fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
+                        if (breakObliviousPrevPosX2 == 0) {
+                            break;
+                        }
+                        int beforeCM = moveIndex32(fText, breakObliviousPrevPosX2, -1);
+                        if (fBK.contains(fText.codePointAt(beforeCM)) ||
+                            fCR.contains(fText.codePointAt(beforeCM)) ||
+                            fLF.contains(fText.codePointAt(beforeCM)) ||
+                            fNL.contains(fText.codePointAt(beforeCM)) ||
+                            fSP.contains(fText.codePointAt(beforeCM)) ||
+                            fZW.contains(fText.codePointAt(beforeCM))) {
+                            break;
+                        }
+                        breakObliviousPrevPosX2 = beforeCM;
+                    }
+                    if (!feaFWH.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
+                        fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
+                        setAppliedRule(pos, "LB 19a [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] QU ×");
+                        continue;
+                    }
+                }
 
                 if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
                     setAppliedRule(pos, "LB 20  Break around a CB");
                     break;
                 }
 
-                //           Don't break between Hyphens and letters if a break precedes the hyphen.
-                //           Formerly this was a Finnish tailoring.
-                //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
-                //    ^($HY | $HH) $AL;
-                if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar)) &&
-                        prevPosX2 == -1) {
-                    setAppliedRule(pos, "LB 20.09");
-                    continue;
+                // Don't break between Hyphens and letters if a break or a space precedes the hyphen.
+                // Formerly this was a Finnish tailoring.
+                // (sot | BK | CR | LF | NL | SP | ZW | CB | GL) ( HY | [\u2010] ) × AL
+                if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar))) {
+                    // sot ( HY | [\u2010] ) × AL.
+                    if (prevPos == 0) {
+                        setAppliedRule(pos, "LB 20a");
+                        continue;
+                    }
+                    // prevPosX2 is -1 if there was a break; but the UAX #14 rules can
+                    // look through breaks.
+                    int breakObliviousPrevPosX2 = moveIndex32(fText, prevPos, -1);
+                    if (fBK.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
+                        fCR.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
+                        fLF.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
+                        fNL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
+                        fSP.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
+                        fGL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
+                        fZW.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
+                        setAppliedRule(pos, "LB 20a");
+                        continue;
+                    }
+                    while (breakObliviousPrevPosX2 > 0 &&
+                            fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
+                        breakObliviousPrevPosX2 = moveIndex32(fText, breakObliviousPrevPosX2, -1);
+                    }
+                    if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
+                        setAppliedRule(pos, "LB 20a");
+                        continue;
+                    }
                 }
 
                 if (fBA.contains(thisChar) ||
@@ -1247,8 +1303,11 @@ int next(int startPos) {
                     continue;
                 }
 
-                if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
-                    setAppliedRule(pos, "LB 21a HL (HY | BA) x");
+                if (fHL.contains(prevCharX2) &&
+                    (fHY.contains(prevChar) ||
+                     (fBA.contains(prevChar) && !feaFWH.contains(prevChar))) &&
+                    !fHL.contains(thisChar)) {
+                    setAppliedRule(pos, "LB 21a HL (HY | BA) x [^HL]");
                     continue;
                 }
 
@@ -1301,7 +1360,127 @@ int next(int startPos) {
                     continue;
                 }
 
-                // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
+                boolean continueToNextPosition = false;
+                // LB 25.
+                for (XUnicodeSet[] pair : new XUnicodeSet[][]{
+                         new XUnicodeSet[]{fCL, fPO}, // 1. NU (SY | IS)* CL × PO
+                         new XUnicodeSet[]{fCP, fPO}, // 2. NU (SY | IS)* CP × PO
+                         new XUnicodeSet[]{fCL, fPR}, // 3. NU (SY | IS)* CL × PR
+                         new XUnicodeSet[]{fCP, fPR}, // 4. NU (SY | IS)* CP × PR
+                     }) {
+                    XUnicodeSet left = pair[0];
+                    XUnicodeSet right = pair[1];
+                    if (left.contains(prevChar) && right.contains(thisChar)) {
+                        // Check for the NU (SY | IS)* part.
+                        boolean leftHandSideMatches = false;
+                        tPos = moveIndex32(fText, prevPos, -1);
+                        for (;;) {
+                            while (tPos > 0 && fCM.contains(fText.codePointAt(tPos))) {
+                                tPos = moveIndex32(fText, tPos, -1);
+                            }
+                            final int tChar = fText.codePointAt(tPos);
+                            if (fSY.contains(tChar) || fIS.contains(tChar)) {
+                                if (tPos == 0) {
+                                    leftHandSideMatches = false;
+                                    break;
+                                }
+                                tPos = moveIndex32(fText, tPos, -1);
+                            } else if (fNU.contains(tChar)) {
+                                leftHandSideMatches = true;
+                                break;
+                            } else {
+                                leftHandSideMatches = false;
+                                break;
+                            }
+                        }
+                        if (leftHandSideMatches) {
+                            setAppliedRule(pos, "LB 25/1..4");
+                            continueToNextPosition = true;
+                            break;
+                        }
+                    }
+                }
+                if (continueToNextPosition) {
+                    continue;
+                }
+                // 5. NU (SY | IS)* × PO
+                // 6. NU (SY | IS)* × PR
+                // 13. NU (SY | IS)* × NU
+                boolean leftHandSideMatches;
+                tPos = prevPos;
+                for (;;) {
+                    while (tPos > 0 && fCM.contains(fText.codePointAt(tPos))) {
+                        tPos = moveIndex32(fText, tPos, -1);
+                    }
+                    final int tChar = fText.codePointAt(tPos);
+                    if (fSY.contains(tChar) || fIS.contains(tChar)) {
+                        if (tPos == 0) {
+                            leftHandSideMatches = false;
+                            break;
+                        }
+                        tPos = moveIndex32(fText, tPos, -1);
+                    } else if (fNU.contains(tChar)) {
+                        leftHandSideMatches = true;
+                        break;
+                    } else {
+                        leftHandSideMatches = false;
+                        break;
+                    }
+                }
+                if (leftHandSideMatches &&
+                    (fPO.contains(thisChar) || fPR.contains(thisChar) || fNU.contains(thisChar))) {
+                    setAppliedRule(pos, "LB 25/5,6,13,14");
+                    continue;
+                }
+                if (nextPos < fText.length()) {
+                    final int nextChar = fText.codePointAt(nextPos);
+                    // 7. PO × OP NU
+                    if (fPO.contains(prevChar) && fOP.contains(thisChar) && fNU.contains(nextChar)) {
+                        setAppliedRule(pos, "LB 25/7");
+                        continue;
+                    }
+                    // 9. PR × OP NU
+                    if (fPR.contains(prevChar) && fOP.contains(thisChar) && fNU.contains(nextChar)) {
+                        setAppliedRule(pos, "LB 25/9");
+                        continue;
+                    }
+                    int nextPosX2 = moveIndex32(fText, nextPos, 1);
+                    while (nextPosX2 < fText.length() && fCM.contains(fText.codePointAt(nextPosX2))) {
+                        nextPosX2 = moveIndex32(fText, nextPosX2, 1);
+                    }
+        
+                    if (nextPosX2 < fText.length()) {
+                        final int nextCharX2 = fText.codePointAt(nextPosX2);
+                        // 7bis. PO × OP IS NU
+                        if (fPO.contains(prevChar) && fOP.contains(thisChar) && fIS.contains(nextChar) &&
+                            fNU.contains(nextCharX2)) {
+                            setAppliedRule(pos, "LB 25/7bis");
+                            continue;
+                        }
+                        // 9bis. PR × OP IS NU
+                        if (fPR.contains(prevChar) && fOP.contains(thisChar) && fIS.contains(nextChar) &&
+                            fNU.contains(nextCharX2)) {
+                            setAppliedRule(pos, "LB 25/9bis");
+                            continue;
+                        }
+                    }
+                }
+                for (XUnicodeSet[] pair : new XUnicodeSet[][]{
+                         new XUnicodeSet[]{fPO, fNU}, // 8. PO × NU
+                         new XUnicodeSet[]{fPR, fNU}, // 10. PR × NU
+                         new XUnicodeSet[]{fHY, fNU}, // 11. HY × NU
+                         new XUnicodeSet[]{fIS, fNU}, // 12. IS × NU
+                     }) {
+                    XUnicodeSet left = pair[0];
+                    XUnicodeSet right = pair[1];
+                    if (left.contains(prevChar) && right.contains(thisChar)) {
+                        continueToNextPosition = true;
+                        break;
+                    }
+                }
+                if (continueToNextPosition) {
+                  continue;
+                }        
 
                 if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
                         fJV.contains(thisChar) ||
@@ -1966,6 +2145,8 @@ static int  nextCP(StringBuffer s, int i) {
      *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
      *    2.  We need to get and restore the seed from values occurring in the middle
      *        of a long sequence, to more easily reproduce failing cases.
+     * TODO(egg): We need a better random number generator; ideally the same as in C++, but that may
+     *            be tricky.
      */
     private static int m_seed = 1;
     private static int  m_rand()