Skip to content

Commit

Permalink
ICU-20715 C++: reset defaultCE32 before they are used/recomputed
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Apr 1, 2022
1 parent fd1d270 commit 0e3268e
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 14 deletions.
52 changes: 39 additions & 13 deletions icu4c/source/i18n/collationdatabuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,25 @@ struct ConditionalCE32 : public UMemory {
* When fetching CEs from the builder, the contexts are built into their runtime form
* so that the normal collation implementation can process them.
* The result is cached in the list head. It is reset when the contexts are modified.
* All of these builtCE32 are invalidated by clearContexts(),
* via incrementing the contextsEra.
*/
uint32_t builtCE32;
/**
* The "era" of building intermediate contexts when the above builtCE32 was set.
* When the array of cached, temporary contexts overflows, then clearContexts()
* removes them all and invalidates the builtCE32 that used to point to built tries.
*/
int32_t era = -1;
/**
* Index of the next ConditionalCE32.
* Negative for the end of the list.
*/
int32_t next;
// Note: We could create a separate class for all of the contextual mappings for
// a code point, with the builtCE32, the era, and a list of the actual mappings.
// The class that represents one mapping would then not need to
// store those fields in each element.
};

U_CDECL_BEGIN
Expand Down Expand Up @@ -267,14 +279,15 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &
// TODO: ICU-21531 figure out why this happens.
return 0;
}
if(cond->builtCE32 == Collation::NO_CE32) {
if(cond->builtCE32 == Collation::NO_CE32 || cond->era != builder.contextsEra) {
// Build the context-sensitive mappings into their runtime form and cache the result.
cond->builtCE32 = builder.buildContext(cond, errorCode);
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
errorCode = U_ZERO_ERROR;
builder.clearContexts();
cond->builtCE32 = builder.buildContext(cond, errorCode);
}
cond->era = builder.contextsEra;
builderData.contexts = builder.contexts.getBuffer();
}
return cond->builtCE32;
Expand Down Expand Up @@ -1322,21 +1335,18 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode)
void
CollationDataBuilder::clearContexts() {
contexts.remove();
UnicodeSetIterator iter(contextChars);
while(iter.next()) {
U_ASSERT(!iter.isString());
uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint());
U_ASSERT(isBuilderContextCE32(ce32));
getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32;
}
// Incrementing the contexts build "era" invalidates all of the builtCE32
// from before this clearContexts() call.
// Simpler than finding and resetting all of those fields.
++contextsEra;
}

void
CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
// Ignore abandoned lists and the cached builtCE32,
// and build all contexts from scratch.
contexts.remove();
clearContexts();
UnicodeSetIterator iter(contextChars);
while(U_SUCCESS(errorCode) && iter.next()) {
U_ASSERT(!iter.isString());
Expand All @@ -1362,18 +1372,34 @@ CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode)
U_ASSERT(head->next >= 0);
UCharsTrieBuilder prefixBuilder(errorCode);
UCharsTrieBuilder contractionBuilder(errorCode);
// This outer loop goes from each prefix to the next.
// For each prefix it finds the one or more same-prefix entries (firstCond..lastCond).
// If there are multiple suffixes for the same prefix,
// then an inner loop builds a contraction trie for them.
for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
if(U_FAILURE(errorCode)) { return 0; } // early out for memory allocation errors
// After the list head, the prefix or suffix can be empty, but not both.
U_ASSERT(cond == head || cond->hasContext());
int32_t prefixLength = cond->prefixLength();
UnicodeString prefix(cond->context, 0, prefixLength + 1);
// Collect all contraction suffixes for one prefix.
ConditionalCE32 *firstCond = cond;
ConditionalCE32 *lastCond = cond;
while(cond->next >= 0 &&
(cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) {
ConditionalCE32 *lastCond;
do {
lastCond = cond;
}
// Clear the defaultCE32 fields as we go.
// They are left over from building a previous version of this list of contexts.
//
// One of the code paths below may copy a preceding defaultCE32
// into its emptySuffixCE32.
// If a new suffix has been inserted before what used to be
// the firstCond for its prefix, then that previous firstCond could still
// contain an outdated defaultCE32 from an earlier buildContext() and
// result in an incorrect emptySuffixCE32.
// So we reset all defaultCE32 before reading and setting new values.
cond->defaultCE32 = Collation::NO_CE32;
} while(cond->next >= 0 &&
(cond = getConditionalCE32(cond->next))->context.startsWith(prefix));
uint32_t ce32;
int32_t suffixStart = prefixLength + 1; // == prefix.length()
if(lastCond->context.length() == suffixStart) {
Expand Down
9 changes: 9 additions & 0 deletions icu4c/source/i18n/collationdatabuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,15 @@ class U_I18N_API CollationDataBuilder : public UObject {
UnicodeSet contextChars;
// Serialized UCharsTrie structures for finalized contexts.
UnicodeString contexts;
private:
/**
* The "era" of building intermediate contexts.
* When the array of cached, temporary contexts overflows, then clearContexts()
* removes them all and invalidates the builtCE32 that used to point to built tries.
* See ConditionalCE32::era.
*/
int32_t contextsEra = 0;
protected:
UnicodeSet unsafeBackwardSet;
UBool modified;

Expand Down
9 changes: 8 additions & 1 deletion icu4c/source/test/intltest/collationtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1866,7 +1866,11 @@ void CollationTest::TestLongLocale() {

void CollationTest::TestBuilderContextsOverflow() {
IcuTestErrorCode errorCode(*this, "TestBuilderContextsOverflow");
// ICU-20715
// ICU-20715: Bad memory access in what looks like a bogus CharsTrie after
// intermediate contextual-mappings data overflowed.
// Caused by the CollationDataBuilder using some outdated values when building
// contextual mappings with both prefix and contraction matching.
// Fixed by resetting those outdated values before code looks at them.
char16_t rules[] = {
u'&', 0x10, 0x2ff, 0x503c, 0x4617,
u'=', 0x80, 0x4f7f, 0xff, 0x3c3d, 0x1c4f, 0x3c3c,
Expand All @@ -1883,6 +1887,9 @@ void CollationTest::TestBuilderContextsOverflow() {
};
UnicodeString s(false, rules, UPRV_LENGTHOF(rules));
LocalPointer<Collator> coll(new RuleBasedCollator(s, errorCode), errorCode);
if(errorCode.isSuccess()) {
logln("successfully built the Collator");
}
}

#endif // !UCONFIG_NO_COLLATION

0 comments on commit 0e3268e

Please sign in to comment.