Skip to content

Commit

Permalink
ICU-22942 MF2 ICU4C: NFC-normalize names and keys according to spec
Browse files Browse the repository at this point in the history
  • Loading branch information
catamorphism committed Oct 10, 2024
1 parent 8bdb306 commit be97a13
Show file tree
Hide file tree
Showing 11 changed files with 146 additions and 20 deletions.
32 changes: 22 additions & 10 deletions icu4c/source/i18n/messageformat2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
#include "unicode/messageformat2_data_model.h"
#include "unicode/messageformat2_formattable.h"
#include "unicode/messageformat2.h"
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"
#include "messageformat2_allocation.h"
#include "messageformat2_checker.h"
#include "messageformat2_evaluation.h"
#include "messageformat2_macros.h"

Expand All @@ -37,7 +39,7 @@ static Formattable evalLiteral(const Literal& lit) {
// The fallback for a variable name is itself.
UnicodeString str(DOLLAR);
str += var;
const Formattable* val = context.getGlobal(var, errorCode);
const Formattable* val = context.getGlobal(*this, var, errorCode);
if (U_SUCCESS(errorCode)) {
return (FormattedPlaceholder(*val, str));
}
Expand All @@ -52,9 +54,9 @@ static Formattable evalLiteral(const Literal& lit) {
}

[[nodiscard]] FormattedPlaceholder MessageFormatter::formatOperand(const Environment& env,
const Operand& rand,
MessageContext& context,
UErrorCode &status) const {
const Operand& rand,
MessageContext& context,
UErrorCode &status) const {
if (U_FAILURE(status)) {
return {};
}
Expand All @@ -71,15 +73,19 @@ static Formattable evalLiteral(const Literal& lit) {
// Eager vs. lazy evaluation is an open issue:
// see https://github.com/unicode-org/message-format-wg/issues/299

// NFC-normalize the variable name. See
// https://github.com/unicode-org/message-format-wg/blob/main/spec/syntax.md#names-and-identifiers
const VariableName normalized = normalizeNFC(var);

// Look up the variable in the environment
if (env.has(var)) {
if (env.has(normalized)) {
// `var` is a local -- look it up
const Closure& rhs = env.lookup(var);
const Closure& rhs = env.lookup(normalized);
// Format the expression using the environment from the closure
return formatExpression(rhs.getEnv(), rhs.getExpr(), context, status);
}
// Variable wasn't found in locals -- check if it's global
FormattedPlaceholder result = evalArgument(var, context, status);
FormattedPlaceholder result = evalArgument(normalized, context, status);
if (status == U_ILLEGAL_ARGUMENT_ERROR) {
status = U_ZERO_ERROR;
// Unbound variable -- set a resolution error
Expand Down Expand Up @@ -761,6 +767,7 @@ void MessageFormatter::formatSelectors(MessageContext& context, const Environmen
UnicodeString MessageFormatter::formatToString(const MessageArguments& arguments, UErrorCode &status) {
EMPTY_ON_ERROR(status);


// Create a new environment that will store closures for all local variables
Environment* env = Environment::create(status);
// Create a new context with the given arguments and the `errors` structure
Expand Down Expand Up @@ -813,12 +820,14 @@ void MessageFormatter::check(MessageContext& context, const Environment& localEn

// Check that variable is in scope
const VariableName& var = rand.asVariable();
UnicodeString normalized = normalizeNFC(var);

// Check local scope
if (localEnv.has(var)) {
if (localEnv.has(normalized)) {
return;
}
// Check global scope
context.getGlobal(var, status);
context.getGlobal(*this, normalized, status);
if (status == U_ILLEGAL_ARGUMENT_ERROR) {
status = U_ZERO_ERROR;
context.getErrors().setUnresolvedVariable(var, status);
Expand Down Expand Up @@ -855,7 +864,10 @@ void MessageFormatter::checkDeclarations(MessageContext& context, Environment*&
// memoizing the value of localEnv up to this point

// Add the LHS to the environment for checking the next declaration
env = Environment::create(decl.getVariable(), Closure(rhs, *env), env, status);
env = Environment::create(normalizeNFC(decl.getVariable()),
Closure(rhs, *env),
env,
status);
CHECK_ERROR(status);
}
}
Expand Down
10 changes: 8 additions & 2 deletions icu4c/source/i18n/messageformat2_arguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@

#if !UCONFIG_NO_MF2

#include "unicode/messageformat2.h"
#include "unicode/messageformat2_arguments.h"
#include "unicode/messageformat2_data_model_names.h"
#include "messageformat2_evaluation.h"
#include "uvector.h" // U_ASSERT

U_NAMESPACE_BEGIN
Expand All @@ -22,11 +24,15 @@ namespace message2 {

using Arguments = MessageArguments;

const Formattable* Arguments::getArgument(const VariableName& arg, UErrorCode& errorCode) const {
const Formattable* Arguments::getArgument(const MessageFormatter& context,
const VariableName& arg,
UErrorCode& errorCode) const {
if (U_SUCCESS(errorCode)) {
U_ASSERT(argsLen == 0 || arguments.isValid());
for (int32_t i = 0; i < argsLen; i++) {
if (argumentNames[i] == arg) {
UnicodeString normalized = context.normalizeNFC(argumentNames[i]);
// arg already assumed to be normalized
if (normalized == arg) {
return &arguments[i];
}
}
Expand Down
11 changes: 10 additions & 1 deletion icu4c/source/i18n/messageformat2_checker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@

#if !UCONFIG_NO_MF2

#include "unicode/messageformat2.h"
#include "messageformat2_allocation.h"
#include "messageformat2_checker.h"
#include "messageformat2_evaluation.h"
#include "messageformat2_macros.h"
#include "uvector.h" // U_ASSERT

Expand Down Expand Up @@ -104,6 +106,13 @@ TypeEnvironment::~TypeEnvironment() {}

// ---------------------

UnicodeString Checker::normalizeNFC(const Key& k) const {
if (k.isWildcard()) {
return UnicodeString("*");
}
return context.normalizeNFC(k.asLiteral().unquoted());
}

static bool areDefaultKeys(const Key* keys, int32_t len) {
U_ASSERT(len > 0);
for (int32_t i = 0; i < len; i++) {
Expand Down Expand Up @@ -185,7 +194,7 @@ void Checker::checkVariants(UErrorCode& status) {
// This variant was already checked,
// so we know keys1.len == len
for (int32_t kk = 0; kk < len; kk++) {
if (!(keys[kk] == keys1[kk])) {
if (!(normalizeNFC(keys[kk]) == normalizeNFC(keys1[kk]))) {
allEqual = false;
break;
}
Expand Down
10 changes: 9 additions & 1 deletion icu4c/source/i18n/messageformat2_checker.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,19 @@ namespace message2 {
// an explicit declaration
}; // class TypeEnvironment

class MessageFormatter;

// Checks a data model for semantic errors
// (Errors are defined in https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md )
class Checker {
public:
void check(UErrorCode&);
Checker(const MFDataModel& m, StaticErrors& e) : dataModel(m), errors(e) {}
Checker(const MFDataModel& d, StaticErrors& e, const MessageFormatter& mf)
: dataModel(d), errors(e), context(mf) {}
private:

UnicodeString normalizeNFC(const Key&) const;

void requireAnnotated(const TypeEnvironment&, const Expression&, UErrorCode&);
void addFreeVars(TypeEnvironment& t, const Operand&, UErrorCode&);
void addFreeVars(TypeEnvironment& t, const Operator&, UErrorCode&);
Expand All @@ -78,6 +83,9 @@ namespace message2 {
void check(const Pattern&);
const MFDataModel& dataModel;
StaticErrors& errors;

// Used for NFC normalization
const MessageFormatter& context;
}; // class Checker

} // namespace message2
Expand Down
7 changes: 5 additions & 2 deletions icu4c/source/i18n/messageformat2_evaluation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,16 @@ PrioritizedVariant::~PrioritizedVariant() {}
errors.checkErrors(status);
}

const Formattable* MessageContext::getGlobal(const VariableName& v, UErrorCode& errorCode) const {
return arguments.getArgument(v, errorCode);
const Formattable* MessageContext::getGlobal(const MessageFormatter& context,
const VariableName& v,
UErrorCode& errorCode) const {
return arguments.getArgument(context, v, errorCode);
}

MessageContext::MessageContext(const MessageArguments& args,
const StaticErrors& e,
UErrorCode& status) : arguments(args), errors(e, status) {}

MessageContext::~MessageContext() {}

} // namespace message2
Expand Down
7 changes: 6 additions & 1 deletion icu4c/source/i18n/messageformat2_evaluation.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,15 @@ namespace message2 {
// The context contains all the information needed to process
// an entire message: arguments, formatter cache, and error list

class MessageFormatter;

class MessageContext : public UMemory {
public:
MessageContext(const MessageArguments&, const StaticErrors&, UErrorCode&);

const Formattable* getGlobal(const VariableName&, UErrorCode&) const;
const Formattable* getGlobal(const MessageFormatter&,
const VariableName&,
UErrorCode&) const;

// If any errors were set, update `status` accordingly
void checkErrors(UErrorCode& status) const;
Expand All @@ -191,6 +195,7 @@ namespace message2 {
const MessageArguments& arguments; // External message arguments
// Errors accumulated during parsing/formatting
DynamicErrors errors;

}; // class MessageContext

} // namespace message2
Expand Down
23 changes: 22 additions & 1 deletion icu4c/source/i18n/messageformat2_formatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,24 @@ namespace message2 {

// MessageFormatter

// Returns the NFC-normalized version of s, returning s itself
// if it's already normalized.
UnicodeString MessageFormatter::normalizeNFC(const UnicodeString& s) const {
UErrorCode status = U_ZERO_ERROR;
// Check if string is already normalized
UNormalizationCheckResult result = nfcNormalizer->quickCheck(s, status);
// If so, return it
if (U_SUCCESS(status) && result == UNORM_YES) {
return s;
}
// Otherwise, normalize it
UnicodeString normalized = nfcNormalizer->normalize(s, status);
if (U_FAILURE(status)) {
return {};
}
return normalized;
}

MessageFormatter::MessageFormatter(const MessageFormatter::Builder& builder, UErrorCode &success) : locale(builder.locale), customMFFunctionRegistry(builder.customMFFunctionRegistry) {
CHECK_ERROR(success);

Expand Down Expand Up @@ -163,14 +181,16 @@ namespace message2 {
errors = errorsNew.orphan();
}

nfcNormalizer = Normalizer2::getNFCInstance(success);

// Note: we currently evaluate variables lazily,
// without memoization. This call is still necessary
// to check out-of-scope uses of local variables in
// right-hand sides (unresolved variable errors can
// only be checked when arguments are known)

// Check for resolution errors
Checker(dataModel, *errors).check(success);
Checker(dataModel, *errors, *this).check(success);
}

void MessageFormatter::cleanup() noexcept {
Expand All @@ -191,6 +211,7 @@ namespace message2 {
signalErrors = other.signalErrors;
errors = other.errors;
other.errors = nullptr;
nfcNormalizer = other.nfcNormalizer;
return *this;
}

Expand Down
10 changes: 10 additions & 0 deletions icu4c/source/i18n/unicode/messageformat2.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "unicode/messageformat2_arguments.h"
#include "unicode/messageformat2_data_model.h"
#include "unicode/messageformat2_function_registry.h"
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"

#ifndef U_HIDE_DEPRECATED_API
Expand Down Expand Up @@ -325,6 +326,8 @@ namespace message2 {

private:
friend class Builder;
friend class Checker;
friend class MessageArguments;
friend class MessageContext;

MessageFormatter(const MessageFormatter::Builder& builder, UErrorCode &status);
Expand Down Expand Up @@ -352,6 +355,9 @@ namespace message2 {
void resolvePreferences(MessageContext&, UVector&, UVector&, UErrorCode&) const;

// Formatting methods

// Used for normalizing variable names and keys for comparison
UnicodeString normalizeNFC(const UnicodeString&) const;
[[nodiscard]] FormattedPlaceholder formatLiteral(const data_model::Literal&) const;
void formatPattern(MessageContext&, const Environment&, const data_model::Pattern&, UErrorCode&, UnicodeString&) const;
// Formats a call to a formatting function
Expand Down Expand Up @@ -445,6 +451,10 @@ namespace message2 {
// formatting methods return best-effort output.
// The default is false.
bool signalErrors = false;

// Used for implementing normalizeNFC()
const Normalizer2* nfcNormalizer = nullptr;

}; // class MessageFormatter

} // namespace message2
Expand Down
6 changes: 4 additions & 2 deletions icu4c/source/i18n/unicode/messageformat2_arguments.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ template class U_I18N_API LocalArray<message2::Formattable>;

namespace message2 {

class MessageContext;
class MessageFormatter;

// Arguments
// ----------
Expand Down Expand Up @@ -112,7 +112,9 @@ namespace message2 {
private:
friend class MessageContext;

const Formattable* getArgument(const data_model::VariableName&, UErrorCode&) const;
const Formattable* getArgument(const MessageFormatter&,
const data_model::VariableName&,
UErrorCode&) const;

// Avoids using Hashtable so that code constructing a Hashtable
// doesn't have to appear in this header file
Expand Down
3 changes: 3 additions & 0 deletions icu4c/source/test/intltest/messageformat2test_read_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,9 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) {
runTestsFromJsonFile(*this, "spec/functions/time.json", errorCode);

// Other tests (non-spec)
// TODO: Delete this file after https://github.com/unicode-org/message-format-wg/pull/904
// lands and the tests here are updated from the spec repo
runTestsFromJsonFile(*this, "normalization.json", errorCode);
runTestsFromJsonFile(*this, "more-functions.json", errorCode);
runTestsFromJsonFile(*this, "valid-tests.json", errorCode);
runTestsFromJsonFile(*this, "resolution-errors.json", errorCode);
Expand Down
47 changes: 47 additions & 0 deletions testdata/message2/normalization.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"$schema": "https://raw.githubusercontent.com/unicode-org/message-format-wg/main/test/schemas/v0/tests.schema.json",
"scenario": "Syntax",
"description": "Test cases that do not depend on any registry definitions.",
"defaultTestProperties": {
"locale": "en-US"
},
"tests": [
{
"description": "NFC: literals are not normalized",
"src": "\u1E0A\u0323",
"exp": "\u1E0A\u0323"
},
{
"description": "NFC: variables are compared to each other as-if normalized; decl is non-normalized, use is",
"src": ".local $\u0044\u0323\u0307 = {foo} {{{$\u1E0c\u0307}}}",
"exp": "foo"
},
{
"description": "NFC: variables are compared to each other as-if normalized; decl is normalized, use isn't",
"src": ".local $\u1E0c\u0307 = {foo} {{{$\u0044\u0323\u0307}}}",
"exp": "foo"
},
{
"description": "NFC: variables are compared to each other as-if normalized; decl is normalized, use isn't",
"src": ".input {$\u1E0c\u0307} {{{$\u0044\u0323\u0307}}}",
"params": [{"name": "\u1E0c\u0307", "value": "foo"}],
"exp": "foo"
},
{
"description": "NFC: variables are compared to each other as-if normalized; decl is non-normalized, use is",
"src": ".input {$\u0044\u0323\u0307} {{{$\u1E0c\u0307}}}",
"params": [{"name": "\u0044\u0323\u0307", "value": "foo"}],
"exp": "foo"
},
{
"description": "NFC: keys are normalized",
"src": ".local $x = {\u1E0A\u0323 :string} .match {$x} \u1E0A\u0323 {{Not normalized}} \u1E0C\u0307 {{Normalized}} * {{Wrong}}",
"expErrors": [{"type": "duplicate-variant"}]
},
{
"description": "NFC: keys are normalized",
"src": ".local $x = {\u1E0A\u0323 :string} .match {$x} \u1E0C\u0307 {{Right}} * {{Wrong}}",
"exp": "Right"
}
]
}

0 comments on commit be97a13

Please sign in to comment.