Skip to content

Commit

Permalink
ICU-22518 Add a flag to export the output of the reference implementation from the old segmentation monkey tests
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Oct 11, 2023
1 parent cdbf0ef commit 5a20d09
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 11 deletions.
72 changes: 62 additions & 10 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3538,6 +3538,9 @@ RBBILineMonkey::~RBBILineMonkey() {
//
// type = char | word | line | sent | title
//
// export = (path) Export test cases to (path)_(type).txt in the UCD
// test case format.
//
// Example:
// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
//
Expand Down Expand Up @@ -3974,6 +3977,8 @@ void RBBITest::TestMonkey() {
UnicodeString breakType = "all";
Locale locale("en");
UBool useUText = false;
UBool scalarsOnly = false;
std::string exportPath;

if (quick == false) {
loopCount = 10000;
Expand All @@ -3998,6 +4003,19 @@ void RBBITest::TestMonkey() {
p = u.replaceFirst("", status);
}

RegexMatcher pathMatcher(" *export *= *([^ ]+) *", p, 0, status);
if (pathMatcher.find()) {
pathMatcher.group(1, status).toUTF8String(exportPath);
pathMatcher.reset();
p = pathMatcher.replaceFirst("", status);
}

RegexMatcher s(" *scalars_only", p, 0, status);
if (s.find()) {
scalarsOnly = true;
s.reset();
p = s.replaceFirst("", status);
}

// m.reset(p);
if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
Expand All @@ -4013,64 +4031,80 @@ void RBBITest::TestMonkey() {
}

if (breakType == "char" || breakType == "all") {
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_char.txt").c_str(), "w");
RBBICharMonkey m;
BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "char", seed, loopCount, useUText);
RunMonkey(bi, m, "char", seed, loopCount, useUText, file, scalarsOnly);
if (breakType == "all" && useUText==false) {
// Also run a quick test with UText when "all" is specified
RunMonkey(bi, m, "char", seed, loopCount, true);
RunMonkey(bi, m, "char", seed, loopCount, true, nullptr, scalarsOnly);
}
}
else {
errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}

if (breakType == "word" || breakType == "all") {
logln("Word Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_word.txt").c_str(), "w");
RBBIWordMonkey m;
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "word", seed, loopCount, useUText);
RunMonkey(bi, m, "word", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}

if (breakType == "line" || breakType == "all") {
logln("Line Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_line.txt").c_str(), "w");
RBBILineMonkey m;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
if (loopCount >= 10) {
loopCount = loopCount / 5; // Line break runs slower than the others.
}
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "line", seed, loopCount, useUText);
RunMonkey(bi, m, "line", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}

if (breakType == "sent" || breakType == "all" ) {
logln("Sentence Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_sent.txt").c_str(), "w");
RBBISentMonkey m;
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
if (loopCount >= 10) {
loopCount = loopCount / 10; // Sentence runs slower than the other break types
}
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "sent", seed, loopCount, useUText);
RunMonkey(bi, m, "sent", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}

#endif
Expand All @@ -4079,14 +4113,19 @@ void RBBITest::TestMonkey() {
//
// Run a RBBI monkey test. Common routine, for all break iterator types.
// Parameters:
// bi - the break iterator to use
// mk - MonkeyKind, abstraction for obtaining expected results
// name - Name of test (char, word, etc.) for use in error messages
// seed - Seed for starting random number generator (parameter from user)
// bi - the break iterator to use
// mk - MonkeyKind, abstraction for obtaining expected results
// name - Name of test (char, word, etc.) for use in error messages
// seed - Seed for starting random number generator (parameter from user)
// numIterations - Loop count supplied by the caller (TestMonkey passes its loopCount here)
// exportFile - Pointer to a file to which the test cases will be written in
// UCD format. May be null.
// scalarsOnly - Only test sequences of Unicode scalar values; if this is false,
// arbitrary sequences of code points (including unpaired surrogates)
// are tested.
//
void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
int32_t numIterations, UBool useUText) {
int32_t numIterations, UBool useUText, FILE *exportFile, UBool scalarsOnly) {

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

Expand Down Expand Up @@ -4151,6 +4190,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
errln("%s:%d c < 0", __FILE__, __LINE__);
break;
}
if (scalarsOnly && U16_IS_SURROGATE(c)) {
continue;
}
// Do not assemble a supplementary character from randomly generated separate surrogates.
// (It could be a dictionary character)
if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
Expand Down Expand Up @@ -4267,6 +4309,16 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
}
}

if (exportFile != nullptr) {
for (i = 0; i < testText.length();) {
fprintf(exportFile, expectedBreaks[i] ? "÷ " : "× ");
char32_t const c = testText.char32At(i);
fprintf(exportFile, "%04X ", static_cast<uint32_t>(c));
i += U16_LENGTH(c);
}
fprintf(exportFile, expectedBreaks[testText.length()] ? "÷ # 🐒\n" : "× # 🐒\n");
}

// Compare the expected and actual results.
for (i=0; i<=testText.length(); i++) {
const char *errorType = nullptr;
Expand Down
4 changes: 3 additions & 1 deletion icu4c/source/test/intltest/rbbitst.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#if !UCONFIG_NO_BREAK_ITERATION

#include <stdio.h>

#include <memory>

#include "intltest.h"
Expand Down Expand Up @@ -122,7 +124,7 @@ class RBBITest: public IntlTest {
**/

void RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
int32_t loopCount, UBool useUText);
int32_t loopCount, UBool useUText, FILE *exportFile, UBool scalarsOnly);

// Run one of the Unicode Consortium boundary test data files.
void runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi);
Expand Down

0 comments on commit 5a20d09

Please sign in to comment.