Skip to content

Commit

Permalink
ICU-22518 Add a flag to export the output of the reference implementation from the old segmentation monkey tests
Browse files Browse the repository at this point in the history

cleaner exportation

scalars only
  • Loading branch information
eggrobin committed Oct 11, 2023
1 parent cdbf0ef commit 087377a
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 11 deletions.
72 changes: 62 additions & 10 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3538,6 +3538,9 @@ RBBILineMonkey::~RBBILineMonkey() {
//
// type = char | word | line | sent | title
//
// export = (path) Export test cases to (path)_(type).txt in the UCD
// test case format.
//
// Example:
// intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
//
Expand Down Expand Up @@ -3974,6 +3977,8 @@ void RBBITest::TestMonkey() {
UnicodeString breakType = "all";
Locale locale("en");
UBool useUText = false;
UBool scalarsOnly = false;
std::string exportPath;

if (quick == false) {
loopCount = 10000;
Expand All @@ -3998,6 +4003,19 @@ void RBBITest::TestMonkey() {
p = u.replaceFirst("", status);
}

RegexMatcher pathMatcher(" *export *= *([^ ]+) *", p, 0, status);
if (pathMatcher.find()) {
pathMatcher.group(1, status).toUTF8String(exportPath);
pathMatcher.reset();
p = pathMatcher.replaceFirst("", status);
}

RegexMatcher s(" *scalars_only", p, 0, status);
if (s.find()) {
scalarsOnly = true;
s.reset();
p = s.replaceFirst("", status);
}

// m.reset(p);
if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
Expand All @@ -4013,64 +4031,80 @@ void RBBITest::TestMonkey() {
}

if (breakType == "char" || breakType == "all") {
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_char.txt").c_str(), "w");
RBBICharMonkey m;
BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "char", seed, loopCount, useUText);
RunMonkey(bi, m, "char", seed, loopCount, useUText, file, scalarsOnly);
if (breakType == "all" && useUText==false) {
// Also run a quick test with UText when "all" is specified
RunMonkey(bi, m, "char", seed, loopCount, true);
RunMonkey(bi, m, "char", seed, loopCount, true, nullptr, scalarsOnly);
}
}
else {
errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}

if (breakType == "word" || breakType == "all") {
logln("Word Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_word.txt").c_str(), "w");
RBBIWordMonkey m;
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "word", seed, loopCount, useUText);
RunMonkey(bi, m, "word", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}

if (breakType == "line" || breakType == "all") {
logln("Line Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_line.txt").c_str(), "w");
RBBILineMonkey m;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
if (loopCount >= 10) {
loopCount = loopCount / 5; // Line break runs slower than the others.
}
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "line", seed, loopCount, useUText);
RunMonkey(bi, m, "line", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}

if (breakType == "sent" || breakType == "all" ) {
logln("Sentence Break Monkey Test");
FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_sent.txt").c_str(), "w");
RBBISentMonkey m;
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
if (loopCount >= 10) {
loopCount = loopCount / 10; // Sentence runs slower than the other break types
}
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "sent", seed, loopCount, useUText);
RunMonkey(bi, m, "sent", seed, loopCount, useUText, file, scalarsOnly);
}
else {
errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
}
delete bi;
if (file != nullptr) {
fclose(file);
}
}

#endif
Expand All @@ -4079,14 +4113,19 @@ void RBBITest::TestMonkey() {
//
// Run a RBBI monkey test. Common routine, for all break iterator types.
// Parameters:
// bi - the break iterator to use
// mk - MonkeyKind, abstraction for obtaining expected results
// name - Name of test (char, word, etc.) for use in error messages
// seed - Seed for starting random number generator (parameter from user)
// bi - the break iterator to use
// mk - MonkeyKind, abstraction for obtaining expected results
// name - Name of test (char, word, etc.) for use in error messages
// seed - Seed for starting random number generator (parameter from user)
// numIterations - Loop count for the test (TestMonkey passes its loopCount here)
// exportFile - Pointer to a file to which the test cases will be written in
// UCD format. May be null.
// scalarsOnly - Only test sequences of Unicode scalar values; if this is false,
// arbitrary sequences of code points (including unpaired surrogates)
// are tested.
//
void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
int32_t numIterations, UBool useUText) {
int32_t numIterations, UBool useUText, FILE *exportFile, UBool scalarsOnly) {

#if !UCONFIG_NO_REGULAR_EXPRESSIONS

Expand Down Expand Up @@ -4151,6 +4190,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
errln("%s:%d c < 0", __FILE__, __LINE__);
break;
}
if (scalarsOnly && U16_IS_SURROGATE(c)) {
continue;
}
// Do not assemble a supplementary character from randomly generated separate surrogates.
// (It could be a dictionary character)
if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
Expand Down Expand Up @@ -4267,6 +4309,16 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
}
}

if (exportFile != nullptr) {
for (i = 0; i < testText.length();) {
fprintf(exportFile, expectedBreaks[i] ? "÷ " : "× ");
char32_t const c = testText.char32At(i);
fprintf(exportFile, "%04X ", static_cast<uint32_t>(c));
i += U16_LENGTH(c);
}
fprintf(exportFile, expectedBreaks[testText.length()] ? "÷ # 🐒\n" : "× # 🐒\n");
}

// Compare the expected and actual results.
for (i=0; i<=testText.length(); i++) {
const char *errorType = nullptr;
Expand Down
4 changes: 3 additions & 1 deletion icu4c/source/test/intltest/rbbitst.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#if !UCONFIG_NO_BREAK_ITERATION

#include <stdio.h>

#include <memory>

#include "intltest.h"
Expand Down Expand Up @@ -122,7 +124,7 @@ class RBBITest: public IntlTest {
**/

void RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
int32_t loopCount, UBool useUText);
int32_t loopCount, UBool useUText, FILE *exportFile, UBool scalarsOnly);

// Run one of the Unicode Consortium boundary test data files.
void runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi);
Expand Down

0 comments on commit 087377a

Please sign in to comment.