diff --git a/icu4c/source/i18n/messageformat2.cpp b/icu4c/source/i18n/messageformat2.cpp index 73f7fa45e69f..149a8695b94f 100644 --- a/icu4c/source/i18n/messageformat2.cpp +++ b/icu4c/source/i18n/messageformat2.cpp @@ -11,8 +11,10 @@ #include "unicode/messageformat2_data_model.h" #include "unicode/messageformat2_formattable.h" #include "unicode/messageformat2.h" +#include "unicode/normalizer2.h" #include "unicode/unistr.h" #include "messageformat2_allocation.h" +#include "messageformat2_checker.h" #include "messageformat2_evaluation.h" #include "messageformat2_macros.h" @@ -37,7 +39,7 @@ static Formattable evalLiteral(const Literal& lit) { // The fallback for a variable name is itself. UnicodeString str(DOLLAR); str += var; - const Formattable* val = context.getGlobal(var, errorCode); + const Formattable* val = context.getGlobal(*this, var, errorCode); if (U_SUCCESS(errorCode)) { return (FormattedPlaceholder(*val, str)); } @@ -52,9 +54,9 @@ static Formattable evalLiteral(const Literal& lit) { } [[nodiscard]] FormattedPlaceholder MessageFormatter::formatOperand(const Environment& env, - const Operand& rand, - MessageContext& context, - UErrorCode &status) const { + const Operand& rand, + MessageContext& context, + UErrorCode &status) const { if (U_FAILURE(status)) { return {}; } @@ -71,15 +73,19 @@ static Formattable evalLiteral(const Literal& lit) { // Eager vs. lazy evaluation is an open issue: // see https://github.com/unicode-org/message-format-wg/issues/299 + // NFC-normalize the variable name. See + // https://github.com/unicode-org/message-format-wg/blob/main/spec/syntax.md#names-and-identifiers + const VariableName normalized = normalizeNFC(var); + // Look up the variable in the environment - if (env.has(var)) { + if (env.has(normalized)) { // `var` is a local -- look it up - const Closure& rhs = env.lookup(var); + const Closure& rhs = env.lookup(normalized); // Format the expression using the environment from the closure return formatExpression(rhs.getEnv(), rhs.getExpr(), context, status); } // Variable wasn't found in locals -- check if it's global - FormattedPlaceholder result = evalArgument(var, context, status); + FormattedPlaceholder result = evalArgument(normalized, context, status); if (status == U_ILLEGAL_ARGUMENT_ERROR) { status = U_ZERO_ERROR; // Unbound variable -- set a resolution error @@ -761,6 +767,7 @@ void MessageFormatter::formatSelectors(MessageContext& context, const Environmen UnicodeString MessageFormatter::formatToString(const MessageArguments& arguments, UErrorCode &status) { EMPTY_ON_ERROR(status); + // Create a new environment that will store closures for all local variables Environment* env = Environment::create(status); // Create a new context with the given arguments and the `errors` structure @@ -813,12 +820,14 @@ void MessageFormatter::check(MessageContext& context, const Environment& localEn // Check that variable is in scope const VariableName& var = rand.asVariable(); + UnicodeString normalized = normalizeNFC(var); + // Check local scope - if (localEnv.has(var)) { + if (localEnv.has(normalized)) { return; } // Check global scope - context.getGlobal(var, status); + context.getGlobal(*this, normalized, status); if (status == U_ILLEGAL_ARGUMENT_ERROR) { status = U_ZERO_ERROR; context.getErrors().setUnresolvedVariable(var, status); @@ -855,7 +864,10 @@ void MessageFormatter::checkDeclarations(MessageContext& context, Environment*& // memoizing the value of localEnv up to this point // Add the LHS to the environment for checking the next declaration - env = Environment::create(decl.getVariable(), Closure(rhs, *env), env, status); + env = Environment::create(normalizeNFC(decl.getVariable()), + Closure(rhs, *env), + env, + status); CHECK_ERROR(status); } } diff --git a/icu4c/source/i18n/messageformat2_arguments.cpp b/icu4c/source/i18n/messageformat2_arguments.cpp index ded3f4dda160..79b754003bb7 100644 --- a/icu4c/source/i18n/messageformat2_arguments.cpp +++ b/icu4c/source/i18n/messageformat2_arguments.cpp @@ -7,8 +7,10 @@ #if !UCONFIG_NO_MF2 +#include "unicode/messageformat2.h" #include "unicode/messageformat2_arguments.h" #include "unicode/messageformat2_data_model_names.h" +#include "messageformat2_evaluation.h" #include "uvector.h" // U_ASSERT U_NAMESPACE_BEGIN @@ -22,11 +24,15 @@ namespace message2 { using Arguments = MessageArguments; - const Formattable* Arguments::getArgument(const VariableName& arg, UErrorCode& errorCode) const { + const Formattable* Arguments::getArgument(const MessageFormatter& context, + const VariableName& arg, + UErrorCode& errorCode) const { if (U_SUCCESS(errorCode)) { U_ASSERT(argsLen == 0 || arguments.isValid()); for (int32_t i = 0; i < argsLen; i++) { - if (argumentNames[i] == arg) { + UnicodeString normalized = context.normalizeNFC(argumentNames[i]); + // arg already assumed to be normalized + if (normalized == arg) { return &arguments[i]; } } diff --git a/icu4c/source/i18n/messageformat2_checker.cpp b/icu4c/source/i18n/messageformat2_checker.cpp index bdc5c383b6e8..775141993122 100644 --- a/icu4c/source/i18n/messageformat2_checker.cpp +++ b/icu4c/source/i18n/messageformat2_checker.cpp @@ -7,8 +7,10 @@ #if !UCONFIG_NO_MF2 +#include "unicode/messageformat2.h" #include "messageformat2_allocation.h" #include "messageformat2_checker.h" +#include "messageformat2_evaluation.h" #include "messageformat2_macros.h" #include "uvector.h" // U_ASSERT @@ -104,6 +106,13 @@ TypeEnvironment::~TypeEnvironment() {} // --------------------- +UnicodeString Checker::normalizeNFC(const Key& k) const { + if (k.isWildcard()) { + return UnicodeString("*"); + } + return context.normalizeNFC(k.asLiteral().unquoted()); +} + static bool areDefaultKeys(const Key* keys, int32_t len) { U_ASSERT(len > 0); for (int32_t i = 0; i < len; i++) { @@ -185,7 +194,7 @@ void Checker::checkVariants(UErrorCode& status) { // This variant was already checked, // so we know keys1.len == len for (int32_t kk = 0; kk < len; kk++) { - if (!(keys[kk] == keys1[kk])) { + if (!(normalizeNFC(keys[kk]) == normalizeNFC(keys1[kk]))) { allEqual = false; break; } diff --git a/icu4c/source/i18n/messageformat2_checker.h b/icu4c/source/i18n/messageformat2_checker.h index 4bb0498efb99..d0d75430af51 100644 --- a/icu4c/source/i18n/messageformat2_checker.h +++ b/icu4c/source/i18n/messageformat2_checker.h @@ -56,14 +56,19 @@ namespace message2 { // an explicit declaration }; // class TypeEnvironment + class MessageFormatter; + // Checks a data model for semantic errors // (Errors are defined in https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md ) class Checker { public: void check(UErrorCode&); - Checker(const MFDataModel& m, StaticErrors& e) : dataModel(m), errors(e) {} + Checker(const MFDataModel& d, StaticErrors& e, const MessageFormatter& mf) + : dataModel(d), errors(e), context(mf) {} private: + UnicodeString normalizeNFC(const Key&) const; + void requireAnnotated(const TypeEnvironment&, const Expression&, UErrorCode&); void addFreeVars(TypeEnvironment& t, const Operand&, UErrorCode&); void addFreeVars(TypeEnvironment& t, const Operator&, UErrorCode&); @@ -78,6 +83,9 @@ namespace message2 { void check(const Pattern&); const MFDataModel& dataModel; StaticErrors& errors; + + // Used for NFC normalization + const MessageFormatter& context; }; // class Checker } // namespace message2 diff --git a/icu4c/source/i18n/messageformat2_evaluation.cpp b/icu4c/source/i18n/messageformat2_evaluation.cpp index 41e4c9a8020a..0cbef90491a1 100644 --- a/icu4c/source/i18n/messageformat2_evaluation.cpp +++ b/icu4c/source/i18n/messageformat2_evaluation.cpp @@ -190,13 +190,16 @@ PrioritizedVariant::~PrioritizedVariant() {} errors.checkErrors(status); } - const Formattable* MessageContext::getGlobal(const VariableName& v, UErrorCode& errorCode) const { - return arguments.getArgument(v, errorCode); + const Formattable* MessageContext::getGlobal(const MessageFormatter& context, + const VariableName& v, + UErrorCode& errorCode) const { + return arguments.getArgument(context, v, errorCode); } MessageContext::MessageContext(const MessageArguments& args, const StaticErrors& e, UErrorCode& status) : arguments(args), errors(e, status) {} + MessageContext::~MessageContext() {} } // namespace message2 diff --git a/icu4c/source/i18n/messageformat2_evaluation.h b/icu4c/source/i18n/messageformat2_evaluation.h index b8ae0242367d..abecdc2ca0dc 100644 --- a/icu4c/source/i18n/messageformat2_evaluation.h +++ b/icu4c/source/i18n/messageformat2_evaluation.h @@ -174,11 +174,15 @@ namespace message2 { // The context contains all the information needed to process // an entire message: arguments, formatter cache, and error list + class MessageFormatter; + class MessageContext : public UMemory { public: MessageContext(const MessageArguments&, const StaticErrors&, UErrorCode&); - const Formattable* getGlobal(const VariableName&, UErrorCode&) const; + const Formattable* getGlobal(const MessageFormatter&, + const VariableName&, + UErrorCode&) const; // If any errors were set, update `status` accordingly void checkErrors(UErrorCode& status) const; @@ -191,6 +195,7 @@ namespace message2 { const MessageArguments& arguments; // External message arguments // Errors accumulated during parsing/formatting DynamicErrors errors; + }; // class MessageContext } // namespace message2 diff --git a/icu4c/source/i18n/messageformat2_formatter.cpp b/icu4c/source/i18n/messageformat2_formatter.cpp index 8d17ae49b99a..f14ba3a13c9f 100644 --- a/icu4c/source/i18n/messageformat2_formatter.cpp +++ b/icu4c/source/i18n/messageformat2_formatter.cpp @@ -116,6 +116,24 @@ namespace message2 { // MessageFormatter + // Returns the NFC-normalized version of s, returning s itself + // if it's already normalized. + UnicodeString MessageFormatter::normalizeNFC(const UnicodeString& s) const { + UErrorCode status = U_ZERO_ERROR; + // Check if string is already normalized + UNormalizationCheckResult result = nfcNormalizer->quickCheck(s, status); + // If so, return it + if (U_SUCCESS(status) && result == UNORM_YES) { + return s; + } + // Otherwise, normalize it + UnicodeString normalized = nfcNormalizer->normalize(s, status); + if (U_FAILURE(status)) { + return {}; + } + return normalized; + } + MessageFormatter::MessageFormatter(const MessageFormatter::Builder& builder, UErrorCode &success) : locale(builder.locale), customMFFunctionRegistry(builder.customMFFunctionRegistry) { CHECK_ERROR(success); @@ -163,6 +181,8 @@ namespace message2 { errors = errorsNew.orphan(); } + nfcNormalizer = Normalizer2::getNFCInstance(success); + // Note: we currently evaluate variables lazily, // without memoization. This call is still necessary // to check out-of-scope uses of local variables in @@ -170,7 +190,7 @@ namespace message2 { // only be checked when arguments are known) // Check for resolution errors - Checker(dataModel, *errors).check(success); + Checker(dataModel, *errors, *this).check(success); } void MessageFormatter::cleanup() noexcept { @@ -191,6 +211,7 @@ namespace message2 { signalErrors = other.signalErrors; errors = other.errors; other.errors = nullptr; + nfcNormalizer = other.nfcNormalizer; return *this; } diff --git a/icu4c/source/i18n/unicode/messageformat2.h b/icu4c/source/i18n/unicode/messageformat2.h index c5459f042f40..c6853c02b337 100644 --- a/icu4c/source/i18n/unicode/messageformat2.h +++ b/icu4c/source/i18n/unicode/messageformat2.h @@ -20,6 +20,7 @@ #include "unicode/messageformat2_arguments.h" #include "unicode/messageformat2_data_model.h" #include "unicode/messageformat2_function_registry.h" +#include "unicode/normalizer2.h" #include "unicode/unistr.h" #ifndef U_HIDE_DEPRECATED_API @@ -325,6 +326,8 @@ namespace message2 { private: friend class Builder; + friend class Checker; + friend class MessageArguments; friend class MessageContext; MessageFormatter(const MessageFormatter::Builder& builder, UErrorCode &status); @@ -352,6 +355,9 @@ namespace message2 { void resolvePreferences(MessageContext&, UVector&, UVector&, UErrorCode&) const; // Formatting methods + + // Used for normalizing variable names and keys for comparison + UnicodeString normalizeNFC(const UnicodeString&) const; [[nodiscard]] FormattedPlaceholder formatLiteral(const data_model::Literal&) const; void formatPattern(MessageContext&, const Environment&, const data_model::Pattern&, UErrorCode&, UnicodeString&) const; // Formats a call to a formatting function @@ -445,6 +451,10 @@ namespace message2 { // formatting methods return best-effort output. // The default is false. bool signalErrors = false; + + // Used for implementing normalizeNFC() + const Normalizer2* nfcNormalizer = nullptr; + }; // class MessageFormatter } // namespace message2 diff --git a/icu4c/source/i18n/unicode/messageformat2_arguments.h b/icu4c/source/i18n/unicode/messageformat2_arguments.h index c43d96191f16..0adbe48578fb 100644 --- a/icu4c/source/i18n/unicode/messageformat2_arguments.h +++ b/icu4c/source/i18n/unicode/messageformat2_arguments.h @@ -43,7 +43,7 @@ template class U_I18N_API LocalArray; namespace message2 { - class MessageContext; + class MessageFormatter; // Arguments // ---------- @@ -112,7 +112,9 @@ namespace message2 { private: friend class MessageContext; - const Formattable* getArgument(const data_model::VariableName&, UErrorCode&) const; + const Formattable* getArgument(const MessageFormatter&, + const data_model::VariableName&, + UErrorCode&) const; // Avoids using Hashtable so that code constructing a Hashtable // doesn't have to appear in this header file diff --git a/icu4c/source/test/intltest/messageformat2test_read_json.cpp b/icu4c/source/test/intltest/messageformat2test_read_json.cpp index ddf93da632ce..d679246c7218 100644 --- a/icu4c/source/test/intltest/messageformat2test_read_json.cpp +++ b/icu4c/source/test/intltest/messageformat2test_read_json.cpp @@ -309,6 +309,9 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) { runTestsFromJsonFile(*this, "spec/functions/time.json", errorCode); // Other tests (non-spec) + // TODO: Delete this file after https://github.com/unicode-org/message-format-wg/pull/904 + // lands and the tests here are updated from the spec repo + runTestsFromJsonFile(*this, "normalization.json", errorCode); runTestsFromJsonFile(*this, "more-functions.json", errorCode); runTestsFromJsonFile(*this, "valid-tests.json", errorCode); runTestsFromJsonFile(*this, "resolution-errors.json", errorCode); diff --git a/testdata/message2/normalization.json b/testdata/message2/normalization.json new file mode 100644 index 000000000000..53c34b9e5b9e --- /dev/null +++ b/testdata/message2/normalization.json @@ -0,0 +1,47 @@ +{ + "$schema": "https://raw.githubusercontent.com/unicode-org/message-format-wg/main/test/schemas/v0/tests.schema.json", + "scenario": "Syntax", + "description": "Test cases that do not depend on any registry definitions.", + "defaultTestProperties": { + "locale": "en-US" + }, + "tests": [ + { + "description": "NFC: literals are not normalized", + "src": "\u1E0A\u0323", + "exp": "\u1E0A\u0323" + }, + { + "description": "NFC: variables are compared to each other as-if normalized; decl is non-normalized, use is", + "src": ".local $\u0044\u0323\u0307 = {foo} {{{$\u1E0c\u0307}}}", + "exp": "foo" + }, + { + "description": "NFC: variables are compared to each other as-if normalized; decl is normalized, use isn't", + "src": ".local $\u1E0c\u0307 = {foo} {{{$\u0044\u0323\u0307}}}", + "exp": "foo" + }, + { + "description": "NFC: variables are compared to each other as-if normalized; decl is normalized, use isn't", + "src": ".input {$\u1E0c\u0307} {{{$\u0044\u0323\u0307}}}", + "params": [{"name": "\u1E0c\u0307", "value": "foo"}], + "exp": "foo" + }, + { + "description": "NFC: variables are compared to each other as-if normalized; decl is non-normalized, use is", + "src": ".input {$\u0044\u0323\u0307} {{{$\u1E0c\u0307}}}", + "params": [{"name": "\u0044\u0323\u0307", "value": "foo"}], + "exp": "foo" + }, + { + "description": "NFC: keys are normalized", + "src": ".local $x = {\u1E0A\u0323 :string} .match {$x} \u1E0A\u0323 {{Not normalized}} \u1E0C\u0307 {{Normalized}} * {{Wrong}}", + "expErrors": [{"type": "duplicate-variant"}] + }, + { + "description": "NFC: keys are normalized", + "src": ".local $x = {\u1E0A\u0323 :string} .match {$x} \u1E0C\u0307 {{Right}} * {{Wrong}}", + "exp": "Right" + } +] +}