Skip to content

Commit

Permalink
ICU-22942 MF2 ICU4C: NFC-normalize names and keys according to spec
Browse files Browse the repository at this point in the history
Includes adding !UCONFIG_NO_NORMALIZATION guards to all MF2 files
  • Loading branch information
catamorphism committed Nov 6, 2024
1 parent 376da67 commit 0357501
Show file tree
Hide file tree
Showing 32 changed files with 283 additions and 20 deletions.
36 changes: 26 additions & 10 deletions icu4c/source/i18n/messageformat2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2
Expand All @@ -11,8 +13,10 @@
#include "unicode/messageformat2_data_model.h"
#include "unicode/messageformat2_formattable.h"
#include "unicode/messageformat2.h"
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"
#include "messageformat2_allocation.h"
#include "messageformat2_checker.h"
#include "messageformat2_evaluation.h"
#include "messageformat2_macros.h"

Expand All @@ -37,7 +41,7 @@ static Formattable evalLiteral(const Literal& lit) {
// The fallback for a variable name is itself.
UnicodeString str(DOLLAR);
str += var;
const Formattable* val = context.getGlobal(var, errorCode);
const Formattable* val = context.getGlobal(*this, var, errorCode);
if (U_SUCCESS(errorCode)) {
return (FormattedPlaceholder(*val, str));
}
Expand All @@ -52,9 +56,9 @@ static Formattable evalLiteral(const Literal& lit) {
}

[[nodiscard]] FormattedPlaceholder MessageFormatter::formatOperand(const Environment& env,
const Operand& rand,
MessageContext& context,
UErrorCode &status) const {
const Operand& rand,
MessageContext& context,
UErrorCode &status) const {
if (U_FAILURE(status)) {
return {};
}
Expand All @@ -71,15 +75,19 @@ static Formattable evalLiteral(const Literal& lit) {
// Eager vs. lazy evaluation is an open issue:
// see https://github.com/unicode-org/message-format-wg/issues/299

// NFC-normalize the variable name. See
// https://github.com/unicode-org/message-format-wg/blob/main/spec/syntax.md#names-and-identifiers
const VariableName normalized = normalizeNFC(var);

// Look up the variable in the environment
if (env.has(var)) {
if (env.has(normalized)) {
// `var` is a local -- look it up
const Closure& rhs = env.lookup(var);
const Closure& rhs = env.lookup(normalized);
// Format the expression using the environment from the closure
return formatExpression(rhs.getEnv(), rhs.getExpr(), context, status);
}
// Variable wasn't found in locals -- check if it's global
FormattedPlaceholder result = evalArgument(var, context, status);
FormattedPlaceholder result = evalArgument(normalized, context, status);
if (status == U_ILLEGAL_ARGUMENT_ERROR) {
status = U_ZERO_ERROR;
// Unbound variable -- set a resolution error
Expand Down Expand Up @@ -761,6 +769,7 @@ void MessageFormatter::formatSelectors(MessageContext& context, const Environmen
UnicodeString MessageFormatter::formatToString(const MessageArguments& arguments, UErrorCode &status) {
EMPTY_ON_ERROR(status);


// Create a new environment that will store closures for all local variables
Environment* env = Environment::create(status);
// Create a new context with the given arguments and the `errors` structure
Expand Down Expand Up @@ -813,12 +822,14 @@ void MessageFormatter::check(MessageContext& context, const Environment& localEn

// Check that variable is in scope
const VariableName& var = rand.asVariable();
UnicodeString normalized = normalizeNFC(var);

// Check local scope
if (localEnv.has(var)) {
if (localEnv.has(normalized)) {
return;
}
// Check global scope
context.getGlobal(var, status);
context.getGlobal(*this, normalized, status);
if (status == U_ILLEGAL_ARGUMENT_ERROR) {
status = U_ZERO_ERROR;
context.getErrors().setUnresolvedVariable(var, status);
Expand Down Expand Up @@ -855,7 +866,10 @@ void MessageFormatter::checkDeclarations(MessageContext& context, Environment*&
// memoizing the value of localEnv up to this point

// Add the LHS to the environment for checking the next declaration
env = Environment::create(decl.getVariable(), Closure(rhs, *env), env, status);
env = Environment::create(normalizeNFC(decl.getVariable()),
Closure(rhs, *env),
env,
status);
CHECK_ERROR(status);
}
}
Expand All @@ -866,3 +880,5 @@ U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_MF2 */

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */
4 changes: 4 additions & 0 deletions icu4c/source/i18n/messageformat2_allocation.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

#if U_SHOW_CPLUSPLUS_API

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2
Expand Down Expand Up @@ -139,6 +141,8 @@ U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */

#endif /* U_SHOW_CPLUSPLUS_API */

#endif // MESSAGEFORMAT2_UTILS_H
Expand Down
14 changes: 12 additions & 2 deletions icu4c/source/i18n/messageformat2_arguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2

#include "unicode/messageformat2.h"
#include "unicode/messageformat2_arguments.h"
#include "unicode/messageformat2_data_model_names.h"
#include "messageformat2_evaluation.h"
#include "uvector.h" // U_ASSERT

U_NAMESPACE_BEGIN
Expand All @@ -22,11 +26,15 @@ namespace message2 {

using Arguments = MessageArguments;

const Formattable* Arguments::getArgument(const VariableName& arg, UErrorCode& errorCode) const {
const Formattable* Arguments::getArgument(const MessageFormatter& context,
const VariableName& arg,
UErrorCode& errorCode) const {
if (U_SUCCESS(errorCode)) {
U_ASSERT(argsLen == 0 || arguments.isValid());
for (int32_t i = 0; i < argsLen; i++) {
if (argumentNames[i] == arg) {
UnicodeString normalized = context.normalizeNFC(argumentNames[i]);
// arg already assumed to be normalized
if (normalized == arg) {
return &arguments[i];
}
}
Expand Down Expand Up @@ -57,3 +65,5 @@ U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_MF2 */

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */
15 changes: 14 additions & 1 deletion icu4c/source/i18n/messageformat2_checker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2

#include "unicode/messageformat2.h"
#include "messageformat2_allocation.h"
#include "messageformat2_checker.h"
#include "messageformat2_evaluation.h"
#include "messageformat2_macros.h"
#include "uvector.h" // U_ASSERT

Expand Down Expand Up @@ -104,6 +108,13 @@ TypeEnvironment::~TypeEnvironment() {}

// ---------------------

// Returns the string form of a variant key:
//   - a wildcard key yields "*";
//   - a literal key yields `context.normalizeNFC()` applied to the
//     literal's unquoted contents.
// NOTE(review): assuming NFC leaves "*" unchanged, a literal key whose
// unquoted text is "*" yields the same string as the wildcard key, so code
// that compares only the returned strings cannot tell the two apart --
// confirm this is intended.
UnicodeString Checker::normalizeNFC(const Key& k) const {
    if (!k.isWildcard()) {
        // Literal key: normalize its unquoted contents
        return context.normalizeNFC(k.asLiteral().unquoted());
    }
    // Wildcard key
    return UnicodeString("*");
}

static bool areDefaultKeys(const Key* keys, int32_t len) {
U_ASSERT(len > 0);
for (int32_t i = 0; i < len; i++) {
Expand Down Expand Up @@ -185,7 +196,7 @@ void Checker::checkVariants(UErrorCode& status) {
// This variant was already checked,
// so we know keys1.len == len
for (int32_t kk = 0; kk < len; kk++) {
if (!(keys[kk] == keys1[kk])) {
if (!(normalizeNFC(keys[kk]) == normalizeNFC(keys1[kk]))) {
allEqual = false;
break;
}
Expand Down Expand Up @@ -312,3 +323,5 @@ U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_MF2 */

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */
14 changes: 13 additions & 1 deletion icu4c/source/i18n/messageformat2_checker.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

#if U_SHOW_CPLUSPLUS_API

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2
Expand Down Expand Up @@ -56,14 +58,19 @@ namespace message2 {
// an explicit declaration
}; // class TypeEnvironment

class MessageFormatter;

// Checks a data model for semantic errors
// (Errors are defined in https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md )
class Checker {
public:
void check(UErrorCode&);
Checker(const MFDataModel& m, StaticErrors& e) : dataModel(m), errors(e) {}
Checker(const MFDataModel& d, StaticErrors& e, const MessageFormatter& mf)
: dataModel(d), errors(e), context(mf) {}
private:

UnicodeString normalizeNFC(const Key&) const;

void requireAnnotated(const TypeEnvironment&, const Expression&, UErrorCode&);
void addFreeVars(TypeEnvironment& t, const Operand&, UErrorCode&);
void addFreeVars(TypeEnvironment& t, const Operator&, UErrorCode&);
Expand All @@ -78,6 +85,9 @@ namespace message2 {
void check(const Pattern&);
const MFDataModel& dataModel;
StaticErrors& errors;

// Used for NFC normalization
const MessageFormatter& context;
}; // class Checker

} // namespace message2
Expand All @@ -88,6 +98,8 @@ U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */

#endif /* U_SHOW_CPLUSPLUS_API */

#endif // MESSAGEFORMAT_CHECKER_H
Expand Down
4 changes: 4 additions & 0 deletions icu4c/source/i18n/messageformat2_data_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2
Expand Down Expand Up @@ -918,3 +920,5 @@ U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_MF2 */

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */
4 changes: 4 additions & 0 deletions icu4c/source/i18n/messageformat2_errors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2
Expand Down Expand Up @@ -290,3 +292,5 @@ U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_MF2 */

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */
4 changes: 4 additions & 0 deletions icu4c/source/i18n/messageformat2_errors.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* \brief C++ API: Formats messages using the draft MessageFormat 2.0.
*/

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2
Expand Down Expand Up @@ -151,6 +153,8 @@ U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */

#endif /* U_SHOW_CPLUSPLUS_API */

#endif // MESSAGEFORMAT2_ERRORS_H
Expand Down
11 changes: 9 additions & 2 deletions icu4c/source/i18n/messageformat2_evaluation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

#if !UCONFIG_NO_MF2
Expand Down Expand Up @@ -190,13 +192,16 @@ PrioritizedVariant::~PrioritizedVariant() {}
errors.checkErrors(status);
}

const Formattable* MessageContext::getGlobal(const VariableName& v, UErrorCode& errorCode) const {
return arguments.getArgument(v, errorCode);
// Looks up the external (global) argument named `v` by forwarding to
// `arguments.getArgument()`. `context` is passed through unchanged, as is
// `errorCode`; the pointer returned by `getArgument()` is returned as-is.
// NOTE(review): the not-found behavior is determined by `getArgument()`;
// callers appear to expect U_ILLEGAL_ARGUMENT_ERROR for an unbound
// variable -- confirm against its definition.
const Formattable* MessageContext::getGlobal(const MessageFormatter& context,
                                             const VariableName& v,
                                             UErrorCode& errorCode) const {
    const Formattable* global = arguments.getArgument(context, v, errorCode);
    return global;
}

// Constructs a context for processing a message: binds `arguments` to the
// caller-supplied `args` and constructs `errors` from `e` and `status`.
MessageContext::MessageContext(const MessageArguments& args,
                               const StaticErrors& e,
                               UErrorCode& status)
    : arguments(args)
    , errors(e, status)
{}

// Destructor with an empty body: no explicit cleanup is performed here.
MessageContext::~MessageContext() {}

} // namespace message2
Expand All @@ -205,3 +210,5 @@ U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_MF2 */

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */
10 changes: 9 additions & 1 deletion icu4c/source/i18n/messageformat2_evaluation.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* \file
* \brief C++ API: Formats messages using the draft MessageFormat 2.0.
*/
#if !UCONFIG_NO_NORMALIZATION

#if !UCONFIG_NO_FORMATTING

Expand Down Expand Up @@ -174,11 +175,15 @@ namespace message2 {
// The context contains all the information needed to process
// an entire message: arguments, formatter cache, and error list

class MessageFormatter;

class MessageContext : public UMemory {
public:
MessageContext(const MessageArguments&, const StaticErrors&, UErrorCode&);

const Formattable* getGlobal(const VariableName&, UErrorCode&) const;
const Formattable* getGlobal(const MessageFormatter&,
const VariableName&,
UErrorCode&) const;

// If any errors were set, update `status` accordingly
void checkErrors(UErrorCode& status) const;
Expand All @@ -191,6 +196,7 @@ namespace message2 {
const MessageArguments& arguments; // External message arguments
// Errors accumulated during parsing/formatting
DynamicErrors errors;

}; // class MessageContext

} // namespace message2
Expand All @@ -201,6 +207,8 @@ U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif /* #if !UCONFIG_NO_NORMALIZATION */

#endif /* U_SHOW_CPLUSPLUS_API */

#endif // MESSAGEFORMAT2_EVALUATION_H
Expand Down
Loading

0 comments on commit 0357501

Please sign in to comment.