From f7d641d5adb0460d1f58bad5947a29725870cc83 Mon Sep 17 00:00:00 2001 From: Tim Chevalier Date: Fri, 12 May 2023 22:23:28 -0700 Subject: [PATCH] ICU-22261 Add tech preview implementation for MessageFormat 2.0 to icu4c --- icu4c/source/common/unicode/utypes.h | 18 +- icu4c/source/common/utypes.cpp | 13 + icu4c/source/i18n/i18n.vcxproj | 11 + icu4c/source/i18n/i18n.vcxproj.filters | 36 + icu4c/source/i18n/i18n_uwp.vcxproj | 11 + icu4c/source/i18n/messageformat2.cpp | 874 +++++ icu4c/source/i18n/messageformat2_allocation.h | 139 + .../source/i18n/messageformat2_arguments.cpp | 55 + .../i18n/messageformat2_cached_formatters.h | 62 + icu4c/source/i18n/messageformat2_checker.cpp | 295 ++ icu4c/source/i18n/messageformat2_checker.h | 91 + .../source/i18n/messageformat2_data_model.cpp | 1098 ++++++ icu4c/source/i18n/messageformat2_errors.cpp | 286 ++ icu4c/source/i18n/messageformat2_errors.h | 155 + .../source/i18n/messageformat2_evaluation.cpp | 208 ++ icu4c/source/i18n/messageformat2_evaluation.h | 203 ++ .../i18n/messageformat2_formattable.cpp | 334 ++ .../source/i18n/messageformat2_formatter.cpp | 353 ++ .../i18n/messageformat2_function_registry.cpp | 1182 +++++++ ...essageformat2_function_registry_internal.h | 227 ++ icu4c/source/i18n/messageformat2_macros.h | 109 + icu4c/source/i18n/messageformat2_parser.cpp | 2433 +++++++++++++ icu4c/source/i18n/messageformat2_parser.h | 148 + .../source/i18n/messageformat2_serializer.cpp | 336 ++ icu4c/source/i18n/messageformat2_serializer.h | 69 + icu4c/source/i18n/sources.txt | 11 + icu4c/source/i18n/unicode/messageformat2.h | 406 +++ .../i18n/unicode/messageformat2_arguments.h | 143 + .../i18n/unicode/messageformat2_data_model.h | 3080 +++++++++++++++++ .../unicode/messageformat2_data_model_names.h | 38 + .../i18n/unicode/messageformat2_formattable.h | 1015 ++++++ .../messageformat2_function_registry.h | 389 +++ icu4c/source/test/depstest/dependencies.txt | 5 + icu4c/source/test/depstest/depstest.py | 17 + 
icu4c/source/test/intltest/Makefile.in | 1 + icu4c/source/test/intltest/intltest.vcxproj | 6 + .../test/intltest/intltest.vcxproj.filters | 8 +- icu4c/source/test/intltest/itformat.cpp | 2 + .../test/intltest/messageformat2test.cpp | 1071 ++++++ .../source/test/intltest/messageformat2test.h | 202 ++ .../intltest/messageformat2test_builtin.cpp | 191 + .../intltest/messageformat2test_custom.cpp | 728 ++++ .../intltest/messageformat2test_features.cpp | 474 +++ .../intltest/messageformat2test_fromjson.cpp | 1260 +++++++ .../test/intltest/messageformat2test_icu.cpp | 155 + .../test/intltest/messageformat2test_utils.h | 312 ++ 46 files changed, 18258 insertions(+), 2 deletions(-) create mode 100644 icu4c/source/i18n/messageformat2.cpp create mode 100644 icu4c/source/i18n/messageformat2_allocation.h create mode 100644 icu4c/source/i18n/messageformat2_arguments.cpp create mode 100644 icu4c/source/i18n/messageformat2_cached_formatters.h create mode 100644 icu4c/source/i18n/messageformat2_checker.cpp create mode 100644 icu4c/source/i18n/messageformat2_checker.h create mode 100644 icu4c/source/i18n/messageformat2_data_model.cpp create mode 100644 icu4c/source/i18n/messageformat2_errors.cpp create mode 100644 icu4c/source/i18n/messageformat2_errors.h create mode 100644 icu4c/source/i18n/messageformat2_evaluation.cpp create mode 100644 icu4c/source/i18n/messageformat2_evaluation.h create mode 100644 icu4c/source/i18n/messageformat2_formattable.cpp create mode 100644 icu4c/source/i18n/messageformat2_formatter.cpp create mode 100644 icu4c/source/i18n/messageformat2_function_registry.cpp create mode 100644 icu4c/source/i18n/messageformat2_function_registry_internal.h create mode 100644 icu4c/source/i18n/messageformat2_macros.h create mode 100644 icu4c/source/i18n/messageformat2_parser.cpp create mode 100644 icu4c/source/i18n/messageformat2_parser.h create mode 100644 icu4c/source/i18n/messageformat2_serializer.cpp create mode 100644 icu4c/source/i18n/messageformat2_serializer.h 
create mode 100644 icu4c/source/i18n/unicode/messageformat2.h create mode 100644 icu4c/source/i18n/unicode/messageformat2_arguments.h create mode 100644 icu4c/source/i18n/unicode/messageformat2_data_model.h create mode 100644 icu4c/source/i18n/unicode/messageformat2_data_model_names.h create mode 100644 icu4c/source/i18n/unicode/messageformat2_formattable.h create mode 100644 icu4c/source/i18n/unicode/messageformat2_function_registry.h create mode 100644 icu4c/source/test/intltest/messageformat2test.cpp create mode 100644 icu4c/source/test/intltest/messageformat2test.h create mode 100644 icu4c/source/test/intltest/messageformat2test_builtin.cpp create mode 100644 icu4c/source/test/intltest/messageformat2test_custom.cpp create mode 100644 icu4c/source/test/intltest/messageformat2test_features.cpp create mode 100644 icu4c/source/test/intltest/messageformat2test_fromjson.cpp create mode 100644 icu4c/source/test/intltest/messageformat2test_icu.cpp create mode 100644 icu4c/source/test/intltest/messageformat2test_utils.h diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index f890d5d1dbbe..b5da80ccb9db 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -438,6 +438,7 @@ typedef enum UErrorCode { U_PLUGIN_CHANGED_LEVEL_WARNING = -120, /**< A plugin caused a level change. May not be an error, but later plugins may not load. */ + #ifndef U_HIDE_DEPRECATED_API /** * One more than the highest normal UErrorCode warning value. @@ -568,12 +569,27 @@ typedef enum UErrorCode { U_FORMAT_INEXACT_ERROR, /**< Cannot format a number exactly and rounding mode is ROUND_UNNECESSARY @stable ICU 4.8 */ U_NUMBER_ARG_OUTOFBOUNDS_ERROR, /**< The argument to a NumberFormatter helper method was out of bounds; the bounds are usually 0 to 999. @stable ICU 61 */ U_NUMBER_SKELETON_SYNTAX_ERROR, /**< The number skeleton passed to C++ NumberFormatter or C UNumberFormatter was invalid or contained a syntax error. 
@stable ICU 62 */ + + /* MessageFormat 2.0 errors */ + U_MF_UNRESOLVED_VARIABLE_ERROR, /**< A variable is referred to but not bound by any definition */ + U_MF_SYNTAX_ERROR, /**< Includes all syntax errors */ + U_MF_UNKNOWN_FUNCTION_ERROR, /**< An annotation refers to a function not defined by the standard or custom function registry */ + U_MF_VARIANT_KEY_MISMATCH_ERROR, /**< In a match-construct, one or more variants had a different number of keys from the number of selectors */ + U_MF_FORMATTING_ERROR, /**< Covers all runtime errors: for example, an internally inconsistent set of options. */ + U_MF_NONEXHAUSTIVE_PATTERN_ERROR, /**< In a match-construct, the variants do not cover all possible values */ + U_MF_DUPLICATE_OPTION_NAME_ERROR, /**< In an annotation, the same option name appears more than once */ + U_MF_SELECTOR_ERROR, /**< A selector function is applied to an operand of the wrong type */ + U_MF_MISSING_SELECTOR_ANNOTATION_ERROR, /**< A selector expression evaluates to an unannotated operand */ + U_MF_DUPLICATE_DECLARATION_ERROR, /**< The same variable is declared in more than one .local or .input declaration */ + U_MF_OPERAND_MISMATCH_ERROR, /**< An operand provided to a function does not have the required form for that function */ + U_MF_UNSUPPORTED_STATEMENT_ERROR, /**< A message includes a reserved statement */ + U_MF_UNSUPPORTED_EXPRESSION_ERROR, /**< A message includes syntax reserved for future standardization or private implementation use */ #ifndef U_HIDE_DEPRECATED_API /** * One more than the highest normal formatting API error code. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 
*/ - U_FMT_PARSE_ERROR_LIMIT = 0x10114, + U_FMT_PARSE_ERROR_LIMIT = 0x10121, #endif // U_HIDE_DEPRECATED_API /* diff --git a/icu4c/source/common/utypes.cpp b/icu4c/source/common/utypes.cpp index 63e05b1249b6..715994d67f04 100644 --- a/icu4c/source/common/utypes.cpp +++ b/icu4c/source/common/utypes.cpp @@ -129,6 +129,19 @@ _uFmtErrorName[U_FMT_PARSE_ERROR_LIMIT - U_FMT_PARSE_ERROR_START] = { "U_FORMAT_INEXACT_ERROR", "U_NUMBER_ARG_OUTOFBOUNDS_ERROR", "U_NUMBER_SKELETON_SYNTAX_ERROR", + "U_MF_UNRESOLVED_VARIABLE_ERROR", + "U_MF_SYNTAX_ERROR", + "U_MF_UNKNOWN_FUNCTION_ERROR", + "U_MF_VARIANT_KEY_MISMATCH_ERROR", + "U_MF_FORMATTING_ERROR", + "U_MF_NONEXHAUSTIVE_PATTERN_ERROR", + "U_MF_DUPLICATE_OPTION_NAME_ERROR", + "U_MF_SELECTOR_ERROR", + "U_MF_MISSING_SELECTOR_ANNOTATION_ERROR", + "U_MF_DUPLICATE_DECLARATION_ERROR", + "U_MF_OPERAND_MISMATCH_ERROR", + "U_MF_UNSUPPORTED_STATEMENT_ERROR", + "U_MF_UNSUPPORTED_EXPRESSION_ERROR" }; static const char * const diff --git a/icu4c/source/i18n/i18n.vcxproj b/icu4c/source/i18n/i18n.vcxproj index 55471bef1493..eea2571e4f62 100644 --- a/icu4c/source/i18n/i18n.vcxproj +++ b/icu4c/source/i18n/i18n.vcxproj @@ -173,6 +173,17 @@ + + + + + + + + + + + diff --git a/icu4c/source/i18n/i18n.vcxproj.filters b/icu4c/source/i18n/i18n.vcxproj.filters index e35edb5c1b19..fd0e99497dc0 100644 --- a/icu4c/source/i18n/i18n.vcxproj.filters +++ b/icu4c/source/i18n/i18n.vcxproj.filters @@ -213,6 +213,42 @@ formatting + + formatting + + + formatting + + + formatting + + + formatting + + + formatting + + + formatting + + + formatting + + + formatting + + + formatting + + + formatting + + + formatting + + + formatting + formatting diff --git a/icu4c/source/i18n/i18n_uwp.vcxproj b/icu4c/source/i18n/i18n_uwp.vcxproj index dd997d115802..db763118b57c 100644 --- a/icu4c/source/i18n/i18n_uwp.vcxproj +++ b/icu4c/source/i18n/i18n_uwp.vcxproj @@ -406,6 +406,17 @@ + + + + + + + + + + + diff --git a/icu4c/source/i18n/messageformat2.cpp 
b/icu4c/source/i18n/messageformat2.cpp new file mode 100644 index 000000000000..8d9efe6d62cb --- /dev/null +++ b/icu4c/source/i18n/messageformat2.cpp @@ -0,0 +1,874 @@ +// © 2024 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING + +#include "unicode/messageformat2_arguments.h" +#include "unicode/messageformat2_data_model.h" +#include "unicode/messageformat2_formattable.h" +#include "unicode/messageformat2.h" +#include "unicode/unistr.h" +#include "messageformat2_allocation.h" +#include "messageformat2_evaluation.h" +#include "messageformat2_macros.h" + + +U_NAMESPACE_BEGIN + +namespace message2 { + +using namespace data_model; + +// ------------------------------------------------------ +// Formatting + +// The result of formatting a literal is just itself. +static Formattable evalLiteral(const Literal& lit) { + return Formattable(lit.unquoted()); +} + +// Assumes that `var` is a message argument; returns the argument's value. +[[nodiscard]] FormattedPlaceholder MessageFormatter::evalArgument(const VariableName& var, MessageContext& context, UErrorCode& errorCode) const { + if (U_SUCCESS(errorCode)) { + // The fallback for a variable name is itself. + UnicodeString str(DOLLAR); + str += var; + const Formattable* val = context.getGlobal(var, errorCode); + if (U_SUCCESS(errorCode)) { + return (FormattedPlaceholder(*val, str)); + } + } + return {}; +} + +// Returns the contents of the literal +[[nodiscard]] FormattedPlaceholder MessageFormatter::formatLiteral(const Literal& lit) const { + // The fallback for a literal is itself. 
+ return FormattedPlaceholder(evalLiteral(lit), lit.quoted()); +} + +[[nodiscard]] FormattedPlaceholder MessageFormatter::formatOperand(const Environment& env, + const Operand& rand, + MessageContext& context, + UErrorCode &status) const { + if (U_FAILURE(status)) { + return {}; + } + + if (rand.isNull()) { + return FormattedPlaceholder(); + } + if (rand.isVariable()) { + // Check if it's local or global + // Note: there is no name shadowing; this is enforced by the parser + const VariableName& var = rand.asVariable(); + // TODO: Currently, this code implements lazy evaluation of locals. + // That is, the environment binds names to a closure, not a resolved value. + // Eager vs. lazy evaluation is an open issue: + // see https://github.com/unicode-org/message-format-wg/issues/299 + + // Look up the variable in the environment + if (env.has(var)) { + // `var` is a local -- look it up + const Closure& rhs = env.lookup(var); + // Format the expression using the environment from the closure + return formatExpression(rhs.getEnv(), rhs.getExpr(), context, status); + } + // Variable wasn't found in locals -- check if it's global + FormattedPlaceholder result = evalArgument(var, context, status); + if (status == U_ILLEGAL_ARGUMENT_ERROR) { + status = U_ZERO_ERROR; + // Unbound variable -- set a resolution error + context.getErrors().setUnresolvedVariable(var, status); + // Use fallback per + // https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution + UnicodeString str(DOLLAR); + str += var; + return FormattedPlaceholder(str); + } + return result; + } else { + U_ASSERT(rand.isLiteral()); + return formatLiteral(rand.asLiteral()); + } +} + +// Resolves a function's options +FunctionOptions MessageFormatter::resolveOptions(const Environment& env, const OptionMap& options, MessageContext& context, UErrorCode& status) const { + LocalPointer optionsVector(createUVector(status)); + if (U_FAILURE(status)) { + return {}; + } + LocalPointer 
resolvedOpt; + for (int i = 0; i < options.size(); i++) { + const Option& opt = options.getOption(i, status); + if (U_FAILURE(status)) { + return {}; + } + const UnicodeString& k = opt.getName(); + const Operand& v = opt.getValue(); + + // Options are fully evaluated before calling the function + // Format the operand + FormattedPlaceholder rhsVal = formatOperand(env, v, context, status); + if (U_FAILURE(status)) { + return {}; + } + if (!rhsVal.isFallback()) { + resolvedOpt.adoptInstead(create(ResolvedFunctionOption(k, rhsVal.asFormattable()), status)); + if (U_FAILURE(status)) { + return {}; + } + optionsVector->adoptElement(resolvedOpt.orphan(), status); + } + } + + return FunctionOptions(std::move(*optionsVector), status); +} + +// Overload that dispatches on argument type. Syntax doesn't provide for options in this case. +[[nodiscard]] FormattedPlaceholder MessageFormatter::evalFormatterCall(FormattedPlaceholder&& argument, + MessageContext& context, + UErrorCode& status) const { + if (U_FAILURE(status)) { + return {}; + } + + // These cases should have been checked for already + U_ASSERT(!argument.isFallback() && !argument.isNullOperand()); + + const Formattable& toFormat = argument.asFormattable(); + switch (toFormat.getType()) { + case UFMT_OBJECT: { + const FormattableObject* obj = toFormat.getObject(status); + U_ASSERT(U_SUCCESS(status)); + U_ASSERT(obj != nullptr); + const UnicodeString& type = obj->tag(); + FunctionName functionName; + if (!getDefaultFormatterNameByType(type, functionName)) { + // No formatter for this type -- follow default behavior + break; + } + return evalFormatterCall(functionName, + std::move(argument), + FunctionOptions(), + context, + status); + } + default: { + // TODO: The array case isn't handled yet; not sure whether it's desirable + // to have a default list formatter + break; + } + } + // No formatter for this type, or it's a primitive type (which will be formatted later) + // -- just return the argument itself + return 
std::move(argument); +} + +// Overload that dispatches on function name +[[nodiscard]] FormattedPlaceholder MessageFormatter::evalFormatterCall(const FunctionName& functionName, + FormattedPlaceholder&& argument, + FunctionOptions&& options, + MessageContext& context, + UErrorCode& status) const { + if (U_FAILURE(status)) { + return {}; + } + + DynamicErrors& errs = context.getErrors(); + + UnicodeString fallback(COLON); + fallback += functionName; + if (!argument.isNullOperand()) { + fallback = argument.fallback; + } + + if (isFormatter(functionName)) { + const Formatter& formatterImpl = getFormatter(context, functionName, status); + + UErrorCode savedStatus = status; + FormattedPlaceholder result = formatterImpl.format(std::move(argument), std::move(options), status); + // Update errors + if (savedStatus != status) { + if (U_FAILURE(status)) { + if (status == U_MF_OPERAND_MISMATCH_ERROR) { + status = U_ZERO_ERROR; + errs.setOperandMismatchError(functionName, status); + } else { + status = U_ZERO_ERROR; + // Convey any error generated by the formatter + // as a formatting error, except for operand mismatch errors + errs.setFormattingError(functionName, status); + } + return FormattedPlaceholder(fallback); + } else { + // Ignore warnings + status = savedStatus; + } + } + // Ignore the output if any errors occurred + if (errs.hasFormattingError()) { + return FormattedPlaceholder(fallback); + } + return result; + } + // No formatter with this name -- set error + if (isSelector(functionName)) { + errs.setFormattingError(functionName, status); + } else { + errs.setUnknownFunction(functionName, status); + } + return FormattedPlaceholder(fallback); +} + +// Per https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#fallback-resolution +static UnicodeString reservedFallback (const Expression& e) { + UErrorCode localErrorCode = U_ZERO_ERROR; + const Operator* rator = e.getOperator(localErrorCode); + U_ASSERT(U_SUCCESS(localErrorCode)); + const 
Reserved& r = rator->asReserved(); + + // An empty Reserved isn't representable in the syntax + U_ASSERT(r.numParts() > 0); + + const UnicodeString& contents = r.getPart(0).unquoted(); + // Parts should never be empty + U_ASSERT(contents.length() > 0); + + // Return first character of string + return UnicodeString(contents, 0, 1); +} + +// Formats an expression using `globalEnv` for the values of variables +[[nodiscard]] FormattedPlaceholder MessageFormatter::formatExpression(const Environment& globalEnv, + const Expression& expr, + MessageContext& context, + UErrorCode &status) const { + if (U_FAILURE(status)) { + return {}; + } + + // Formatting error + if (expr.isReserved()) { + context.getErrors().setReservedError(status); + return FormattedPlaceholder(reservedFallback(expr)); + } + + const Operand& rand = expr.getOperand(); + // Format the operand (formatOperand handles the case of a null operand) + FormattedPlaceholder randVal = formatOperand(globalEnv, rand, context, status); + + // Don't call the function on error values + if (randVal.isFallback()) { + return randVal; + } + + if (!expr.isFunctionCall()) { + // Dispatch based on type of `randVal` + return evalFormatterCall(std::move(randVal), + context, + status); + } else { + const Operator* rator = expr.getOperator(status); + U_ASSERT(U_SUCCESS(status)); + const FunctionName& functionName = rator->getFunctionName(); + const OptionMap& options = rator->getOptionsInternal(); + // Resolve the options + FunctionOptions resolvedOptions = resolveOptions(globalEnv, options, context, status); + + // Call the formatter function + // The fallback for a nullary function call is the function name + UnicodeString fallback; + if (rand.isNull()) { + fallback = UnicodeString(COLON); + fallback += functionName; + } else { + fallback = randVal.fallback; + } + return evalFormatterCall(functionName, + std::move(randVal), + std::move(resolvedOptions), + context, + status); + } +} + +// Formats each text and expression part of 
a pattern, appending the results to `result` +void MessageFormatter::formatPattern(MessageContext& context, const Environment& globalEnv, const Pattern& pat, UErrorCode &status, UnicodeString& result) const { + CHECK_ERROR(status); + + for (int32_t i = 0; i < pat.numParts(); i++) { + const PatternPart& part = pat.getPart(i); + if (part.isText()) { + result += part.asText(); + } else if (part.isMarkup()) { + // Markup is ignored + } else { + // Format the expression + FormattedPlaceholder partVal = formatExpression(globalEnv, part.contents(), context, status); + // Force full evaluation, e.g. applying default formatters to + // unformatted input (or formatting numbers as strings) + UnicodeString partResult = partVal.formatToString(locale, status); + result += partResult; + // Handle formatting errors. `formatToString()` can't take a context and thus can't + // register an error directly + if (status == U_MF_FORMATTING_ERROR) { + status = U_ZERO_ERROR; + // TODO: The name of the formatter that failed is unavailable. + // Not ideal, but it's hard for `formatToString()` + // to pass along more detailed diagnostics + context.getErrors().setFormattingError(status); + } + } + } +} + +// ------------------------------------------------------ +// Selection + +// See https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#resolve-selectors +// `res` is a vector of ResolvedSelectors +void MessageFormatter::resolveSelectors(MessageContext& context, const Environment& env, UErrorCode &status, UVector& res) const { + CHECK_ERROR(status); + U_ASSERT(!dataModel.hasPattern()); + + const Expression* selectors = dataModel.getSelectorsInternal(); + // 1. Let res be a new empty list of resolved values that support selection. + // (Implicit, since `res` is an out-parameter) + // 2. For each expression exp of the message's selectors + for (int32_t i = 0; i < dataModel.numSelectors(); i++) { + // 2i. Let rv be the resolved value of exp. 
+ ResolvedSelector rv = formatSelectorExpression(env, selectors[i], context, status); + if (rv.hasSelector()) { + // 2ii. If selection is supported for rv: + // (True if this code has been reached) + } else { + // 2iii. Else: + // Let nomatch be a resolved value for which selection always fails. + // Append nomatch as the last element of the list res. + // Emit a Selection Error. + // (Note: in this case, rv, being a fallback, serves as `nomatch`) + #if U_DEBUG + const DynamicErrors& err = context.getErrors(); + U_ASSERT(err.hasError()); + U_ASSERT(rv.argument().isFallback()); + #endif + } + // 2ii(a). Append rv as the last element of the list res. + // (Also fulfills 2iii) + LocalPointer v(create(std::move(rv), status)); + CHECK_ERROR(status); + res.adoptElement(v.orphan(), status); + } +} + +// See https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#resolve-preferences +// `keys` and `matches` are vectors of strings +void MessageFormatter::matchSelectorKeys(const UVector& keys, + MessageContext& context, + ResolvedSelector&& rv, + UVector& keysOut, + UErrorCode& status) const { + CHECK_ERROR(status); + + if (!rv.hasSelector()) { + // Return an empty list of matches + return; + } + + auto selectorImpl = rv.getSelector(); + U_ASSERT(selectorImpl != nullptr); + UErrorCode savedStatus = status; + + // Convert `keys` to an array + int32_t keysLen = keys.size(); + UnicodeString* keysArr = new UnicodeString[keysLen]; + if (keysArr == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + for (int32_t i = 0; i < keysLen; i++) { + const UnicodeString* k = static_cast(keys[i]); + U_ASSERT(k != nullptr); + keysArr[i] = *k; + } + LocalArray adoptedKeys(keysArr); + + // Create an array to hold the output + UnicodeString* prefsArr = new UnicodeString[keysLen]; + if (prefsArr == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + LocalArray adoptedPrefs(prefsArr); + int32_t prefsLen = 0; + + // Call the selector + 
selectorImpl->selectKey(rv.takeArgument(), rv.takeOptions(), + adoptedKeys.getAlias(), keysLen, adoptedPrefs.getAlias(), prefsLen, + status); + + // Update errors + if (savedStatus != status) { + if (U_FAILURE(status)) { + status = U_ZERO_ERROR; + context.getErrors().setSelectorError(rv.getSelectorName(), status); + } else { + // Ignore warnings + status = savedStatus; + } + } + + CHECK_ERROR(status); + + // Copy the resulting keys (if there was no error) + keysOut.removeAllElements(); + for (int32_t i = 0; i < prefsLen; i++) { + UnicodeString* k = message2::create(std::move(prefsArr[i]), status); + if (k == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + keysOut.adoptElement(k, status); + CHECK_ERROR(status); + } +} + +// See https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#resolve-preferences +// `res` is a vector of FormattedPlaceholders; +// `pref` is a vector of vectors of strings +void MessageFormatter::resolvePreferences(MessageContext& context, UVector& res, UVector& pref, UErrorCode &status) const { + CHECK_ERROR(status); + + // 1. Let pref be a new empty list of lists of strings. + // (Implicit, since `pref` is an out-parameter) + UnicodeString ks; + LocalPointer ksP; + int32_t numVariants = dataModel.numVariants(); + const Variant* variants = dataModel.getVariantsInternal(); + // 2. For each index i in res + for (int32_t i = 0; i < (int32_t) res.size(); i++) { + // 2i. Let keys be a new empty list of strings. + LocalPointer keys(createUVector(status)); + CHECK_ERROR(status); + // 2ii. For each variant `var` of the message + for (int32_t variantNum = 0; variantNum < numVariants; variantNum++) { + const SelectorKeys& selectorKeys = variants[variantNum].getKeys(); + + // Note: Here, `var` names the key list of `var`, + // not a Variant itself + const Key* var = selectorKeys.getKeysInternal(); + // 2ii(a). Let `key` be the `var` key at position i. 
+ U_ASSERT(i < selectorKeys.len); // established by semantic check in formatSelectors() + const Key& key = var[i]; + // 2ii(b). If `key` is not the catch-all key '*' + if (!key.isWildcard()) { + // 2ii(b)(a) Assert that key is a literal. + // (Not needed) + // 2ii(b)(b) Let `ks` be the resolved value of `key`. + ks = key.asLiteral().unquoted(); + // 2ii(b)(c) Append `ks` as the last element of the list `keys`. + ksP.adoptInstead(create(std::move(ks), status)); + CHECK_ERROR(status); + keys->adoptElement(ksP.orphan(), status); + } + } + // 2iii. Let `rv` be the resolved value at index `i` of `res`. + U_ASSERT(i < res.size()); + ResolvedSelector rv = std::move(*(static_cast(res[i]))); + // 2iv. Let matches be the result of calling the method MatchSelectorKeys(rv, keys) + LocalPointer matches(createUVector(status)); + matchSelectorKeys(*keys, context, std::move(rv), *matches, status); + // 2v. Append `matches` as the last element of the list `pref` + pref.adoptElement(matches.orphan(), status); + } +} + +// `v` is assumed to be a vector of strings +static int32_t vectorFind(const UVector& v, const UnicodeString& k) { + for (int32_t i = 0; i < v.size(); i++) { + if (*static_cast(v[i]) == k) { + return i; + } + } + return -1; +} + +static UBool vectorContains(const UVector& v, const UnicodeString& k) { + return (vectorFind(v, k) != -1); +} + +// See https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#filter-variants +// `pref` is a vector of vectors of strings. `vars` is a vector of PrioritizedVariants +void MessageFormatter::filterVariants(const UVector& pref, UVector& vars, UErrorCode& status) const { + const Variant* variants = dataModel.getVariantsInternal(); + + // 1. Let `vars` be a new empty list of variants. + // (Not needed since `vars` is an out-parameter) + // 2. 
For each variant `var` of the message: + for (int32_t j = 0; j < dataModel.numVariants(); j++) { + const SelectorKeys& selectorKeys = variants[j].getKeys(); + const Pattern& p = variants[j].getPattern(); + + // Note: Here, `var` names the key list of `var`, + // not a Variant itself + const Key* var = selectorKeys.getKeysInternal(); + // 2i. For each index `i` in `pref`: + bool noMatch = false; + for (int32_t i = 0; i < (int32_t) pref.size(); i++) { + // 2i(a). Let `key` be the `var` key at position `i`. + U_ASSERT(i < selectorKeys.len); + const Key& key = var[i]; + // 2i(b). If key is the catch-all key '*': + if (key.isWildcard()) { + // 2i(b)(a). Continue the inner loop on pref. + continue; + } + // 2i(c). Assert that `key` is a literal. + // (Not needed) + // 2i(d). Let `ks` be the resolved value of `key`. + UnicodeString ks = key.asLiteral().unquoted(); + // 2i(e). Let `matches` be the list of strings at index `i` of `pref`. + const UVector& matches = *(static_cast(pref[i])); // `matches` is a vector of strings + // 2i(f). If `matches` includes `ks` + if (vectorContains(matches, ks)) { + // 2i(f)(a). Continue the inner loop on `pref`. + continue; + } + // 2i(g). Else: + // 2i(g)(a). Continue the outer loop on message variants. + noMatch = true; + break; + } + if (!noMatch) { + // Append `var` as the last element of the list `vars`. 
+ PrioritizedVariant* tuple = create(PrioritizedVariant(-1, selectorKeys, p), status); + CHECK_ERROR(status); + vars.adoptElement(tuple, status); + } + } +} + +// See https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#sort-variants +// Leaves the preferred variant as element 0 in `sortable` +// Note: this sorts in-place, so `sortable` is just `vars` +// `pref` is a vector of vectors of strings; `vars` is a vector of PrioritizedVariants +void MessageFormatter::sortVariants(const UVector& pref, UVector& vars, UErrorCode& status) const { + CHECK_ERROR(status); + +// Note: steps 1 and 2 are omitted since we use `vars` as `sortable` (we sort in-place) + // 1. Let `sortable` be a new empty list of (integer, variant) tuples. + // (Not needed since `sortable` is an out-parameter) + // 2. For each variant `var` of `vars` + // 2i. Let tuple be a new tuple (-1, var). + // 2ii. Append `tuple` as the last element of the list `sortable`. + + // 3. Let `len` be the integer count of items in `pref`. + int32_t len = pref.size(); + // 4. Let `i` be `len` - 1. + int32_t i = len - 1; + // 5. While i >= 0: + while (i >= 0) { + // 5i. Let `matches` be the list of strings at index `i` of `pref`. + U_ASSERT(pref[i] != nullptr); + const UVector& matches = *(static_cast(pref[i])); // `matches` is a vector of strings + // 5ii. Let `minpref` be the integer count of items in `matches`. + int32_t minpref = matches.size(); + // 5iii. For each tuple `tuple` of `sortable`: + for (int32_t j = 0; j < vars.size(); j++) { + U_ASSERT(vars[j] != nullptr); + PrioritizedVariant& tuple = *(static_cast(vars[j])); + // 5iii(a). Let matchpref be an integer with the value minpref. + int32_t matchpref = minpref; + // 5iii(b). Let `key` be the tuple variant key at position `i`. 
+ const Key* tupleVariantKeys = tuple.keys.getKeysInternal(); + U_ASSERT(i < tuple.keys.len); // Given by earlier semantic checking + const Key& key = tupleVariantKeys[i]; + // 5iii(c) If `key` is not the catch-all key '*': + if (!key.isWildcard()) { + // 5iii(c)(a). Assert that `key` is a literal. + // (Not needed) + // 5iii(c)(b). Let `ks` be the resolved value of `key`. + UnicodeString ks = key.asLiteral().unquoted(); + // 5iii(c)(c) Let matchpref be the integer position of ks in `matches`. + matchpref = vectorFind(matches, ks); + U_ASSERT(matchpref >= 0); + } + // 5iii(d) Set the `tuple` integer value as matchpref. + tuple.priority = matchpref; + } + // 5iv. Set `sortable` to be the result of calling the method SortVariants(`sortable`) + vars.sort(comparePrioritizedVariants, status); + CHECK_ERROR(status); + // 5v. Set `i` to be `i` - 1. + i--; + } + // The caller is responsible for steps 6 and 7 + // 6. Let `var` be the `variant` element of the first element of `sortable`. + // 7. Select the pattern of `var` +} + + +// Evaluate the operand +ResolvedSelector MessageFormatter::resolveVariables(const Environment& env, const Operand& rand, MessageContext& context, UErrorCode &status) const { + if (U_FAILURE(status)) { + return {}; + } + + if (rand.isNull()) { + return ResolvedSelector(FormattedPlaceholder()); + } + + if (rand.isLiteral()) { + return ResolvedSelector(formatLiteral(rand.asLiteral())); + } + + // Must be variable + const VariableName& var = rand.asVariable(); + // Resolve the variable + if (env.has(var)) { + const Closure& referent = env.lookup(var); + // Resolve the referent + return resolveVariables(referent.getEnv(), referent.getExpr(), context, status); + } + // Either this is a global var or an unbound var -- + // either way, it can't be bound to a function call. 
+    // Check globals
+    FormattedPlaceholder val = evalArgument(var, context, status);
+    if (status == U_ILLEGAL_ARGUMENT_ERROR) {
+        status = U_ZERO_ERROR;
+        // Unresolved variable -- could be a previous warning. Nothing to resolve
+        U_ASSERT(context.getErrors().hasUnresolvedVariableError());
+        return ResolvedSelector(FormattedPlaceholder(var));
+    }
+    // Pass through other errors
+    return ResolvedSelector(std::move(val));
+}
+
+// Evaluate the expression except for not performing the top-level function call
+// (which is expected to be a selector, but may not be, in error cases)
+//
+// Outcomes, in the order they are tested:
+//  - `status` already failed: an empty ResolvedSelector is returned
+//  - `expr` is reserved: a Reserved error is recorded and the result wraps
+//    `reservedFallback(expr)`
+//  - `expr` calls a known selector that can be looked up: the result carries the
+//    selector name, the selector, its resolved options and its formatted operand
+//  - `expr` calls anything else (or the lookup failed): the result wraps a fallback
+//    string; a Selector error or Unknown Function error is recorded for non-selectors
+//  - `expr` is not a function call: its operand is resolved by the Operand overload
+ResolvedSelector MessageFormatter::resolveVariables(const Environment& env,
+                                                    const Expression& expr,
+                                                    MessageContext& context,
+                                                    UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return {};
+    }
+
+    // A `reserved` is an error
+    if (expr.isReserved()) {
+        context.getErrors().setReservedError(status);
+        return ResolvedSelector(FormattedPlaceholder(reservedFallback(expr)));
+    }
+
+    // Function call -- resolve the operand and options
+    if (expr.isFunctionCall()) {
+        const Operator* rator = expr.getOperator(status);
+        U_ASSERT(U_SUCCESS(status));
+        // Already checked that rator is non-reserved
+        const FunctionName& selectorName = rator->getFunctionName();
+        if (isSelector(selectorName)) {
+            auto selector = getSelector(context, selectorName, status);
+            if (U_SUCCESS(status)) {
+                FunctionOptions resolvedOptions = resolveOptions(env, rator->getOptionsInternal(), context, status);
+                // Operand may be the null argument, but resolveVariables() handles that
+                FormattedPlaceholder argument = formatOperand(env, expr.getOperand(), context, status);
+                return ResolvedSelector(selectorName, selector, std::move(resolvedOptions), std::move(argument));
+            }
+        } else if (isFormatter(selectorName)) {
+            context.getErrors().setSelectorError(selectorName, status);
+        } else {
+            context.getErrors().setUnknownFunction(selectorName, status);
+        }
+        // Non-selector used as selector; an error would have been recorded earlier
+        // The fallback is `:` followed by the function name, unless there is an
+        // operand, in which case the operand's own fallback string is used instead
+        UnicodeString fallback(COLON);
+        fallback += selectorName;
+        if (!expr.getOperand().isNull()) {
+            fallback = formatOperand(env, expr.getOperand(), context, status).fallback;
+        }
+        return ResolvedSelector(FormattedPlaceholder(fallback));
+    } else {
+        // Might be a variable reference, so expand one more level of variable
+        return resolveVariables(env, expr.getOperand(), context, status);
+    }
+}
+
+// Resolves one selector expression of a selectors message.
+// Returns the ResolvedSelector produced by `resolveVariables()` when it carries a
+// selector and its argument is not a fallback value; otherwise returns a
+// selector-less ResolvedSelector wrapping a null operand, the taken argument, or
+// the fallback string (see the comments on each branch).
+ResolvedSelector MessageFormatter::formatSelectorExpression(const Environment& globalEnv, const Expression& expr, MessageContext& context, UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return {};
+    }
+
+    // Resolve expression to determine if it's a function call
+    ResolvedSelector exprResult = resolveVariables(globalEnv, expr, context, status);
+
+    DynamicErrors& err = context.getErrors();
+
+    // If there is a selector, then `resolveVariables()` recorded it in the context
+    if (exprResult.hasSelector()) {
+        // Check if there was an error
+        if (exprResult.argument().isFallback()) {
+            // Use a null expression if it's a syntax or data model warning;
+            // create a valid (non-fallback) formatted placeholder from the
+            // fallback string otherwise
+            if (err.hasSyntaxError() || err.hasDataModelError()) {
+                return ResolvedSelector(FormattedPlaceholder()); // Null operand
+            } else {
+                return ResolvedSelector(exprResult.takeArgument());
+            }
+        }
+        return exprResult;
+    }
+
+    // No selector was found; error should already have been set
+    U_ASSERT(err.hasMissingSelectorAnnotationError() || err.hasUnknownFunctionError() || err.hasSelectorError());
+    return ResolvedSelector(FormattedPlaceholder(exprResult.argument().fallback));
+}
+
+// Runs pattern selection for a selectors message and appends the formatted
+// output of the chosen variant's pattern to `result`.
+// Returns early, leaving `result` untouched, as soon as `status` indicates failure.
+void MessageFormatter::formatSelectors(MessageContext& context, const Environment& env, UErrorCode &status, UnicodeString& result) const {
+    CHECK_ERROR(status);
+
+    // See https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#pattern-selection
+
+    // Resolve Selectors
+    // res is a vector of FormattedPlaceholders
+    LocalPointer<UVector> res(createUVector(status));
+    CHECK_ERROR(status);
+    resolveSelectors(context, env, status, *res);
+
+    // Resolve Preferences
+    // pref is a vector of vectors of strings
+    LocalPointer<UVector> pref(createUVector(status));
+    CHECK_ERROR(status);
+    resolvePreferences(context, *res, *pref, status);
+
+    // Filter Variants
+    // vars is a vector of PrioritizedVariants
+    LocalPointer<UVector> vars(createUVector(status));
+    CHECK_ERROR(status);
+    filterVariants(*pref, *vars, status);
+
+    // Sort Variants and select the final pattern
+    // Note: `sortable` in the spec is just `vars` here,
+    // which is sorted in-place
+    sortVariants(*pref, *vars, status);
+
+    CHECK_ERROR(status);
+
+    // 6. Let `var` be the `variant` element of the first element of `sortable`.
+    U_ASSERT(vars->size() > 0); // This should have been checked earlier (having 0 variants would be a data model error)
+    const PrioritizedVariant& var = *(static_cast<PrioritizedVariant*>(vars->elementAt(0)));
+    // 7. Select the pattern of `var`
+    const Pattern& pat = var.pat;
+
+    // Format the pattern
+    formatPattern(context, env, pat, status, result);
+}
+
+// Note: this is non-const due to the function registry being non-const, which is in turn
+// due to the values (`FormatterFactory` objects in the map) having mutable state.
+// In other words, formatting a message can mutate the underlying `MessageFormatter` by changing
+// state within the factory objects that represent custom formatters.
+// Formats the message with the given arguments and returns the resulting string.
+// On return, `status` reflects both hard failures and the errors collected in the
+// message context while formatting (see `context.checkErrors()` below).
+UnicodeString MessageFormatter::formatToString(const MessageArguments& arguments, UErrorCode &status) {
+    EMPTY_ON_ERROR(status);
+
+    // Create a new environment that will store closures for all local variables
+    Environment* env = Environment::create(status);
+    // Create a new context with the given arguments and the `errors` structure
+    MessageContext context(arguments, *errors, status);
+
+    // Check for unresolved variable errors
+    checkDeclarations(context, env, status);
+    LocalPointer<Environment> globalEnv(env);
+
+    // `env` is dereferenced below, so stop here if there is no environment.
+    // NOTE(review): this presumes Environment::create() yields nullptr when it
+    // fails -- confirm against its definition.
+    if (!globalEnv.isValid()) {
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+        return {};
+    }
+
+    UnicodeString result;
+    if (dataModel.hasPattern()) {
+        formatPattern(context, *globalEnv, dataModel.getPattern(), status, result);
+    } else {
+        // Check for errors/warnings -- if so, then the result of pattern selection is the fallback value
+        // See https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#pattern-selection
+        const DynamicErrors& err = context.getErrors();
+        if (err.hasSyntaxError() || err.hasDataModelError()) {
+            result += REPLACEMENT;
+        } else {
+            formatSelectors(context, *globalEnv, status, result);
+        }
+    }
+    // Update status according to all errors seen while formatting
+    context.checkErrors(status);
+    return result;
+}
+
+// ----------------------------------------
+// Checking for resolution errors
+
+// Checks every option value in `options` for unresolved variables.
+// Stops at the first option that cannot be retrieved.
+void MessageFormatter::check(MessageContext& context, const Environment& localEnv, const OptionMap& options, UErrorCode& status) const {
+    // Check the RHS of each option
+    for (int32_t i = 0; i < options.size(); i++) {
+        const Option& opt = options.getOption(i, status);
+        CHECK_ERROR(status);
+        check(context, localEnv, opt.getValue(), status);
+    }
+}
+
+// Checks a single operand: a variable that is found neither in `localEnv` nor among
+// the global arguments is recorded as an Unresolved Variable error in the context.
+// Literals and the null operand need no checking.
+void MessageFormatter::check(MessageContext& context, const Environment& localEnv, const Operand& rand, UErrorCode& status) const {
+    // Nothing to check for literals
+    if (rand.isLiteral() || rand.isNull()) {
+        return;
+    }
+
+    // Check that variable is in scope
+    const VariableName& var = rand.asVariable();
+    // Check local scope
+    if (localEnv.has(var)) {
+        return;
+    }
+    // Check global scope
+    context.getGlobal(var, status);
+    if (status == U_ILLEGAL_ARGUMENT_ERROR) {
+        // The lookup failure is converted into a recorded error; `status` is reset
+        // so that checking can continue
+        status = U_ZERO_ERROR;
+        context.getErrors().setUnresolvedVariable(var, status);
+    }
+    // Either `var` is a global, or some other error occurred.
+    // Nothing more to do either way
+    return;
+}
+
+// Checks the operand and the options of a function-call expression.
+// Expressions that are not function calls are not inspected here.
+void MessageFormatter::check(MessageContext& context, const Environment& localEnv, const Expression& expr, UErrorCode& status) const {
+    // Check for unresolved variable errors
+    if (expr.isFunctionCall()) {
+        const Operator* rator = expr.getOperator(status);
+        U_ASSERT(U_SUCCESS(status));
+        const Operand& rand = expr.getOperand();
+        check(context, localEnv, rand, status);
+        check(context, localEnv, rator->getOptionsInternal(), status);
+    }
+}
+
+// Check for resolution errors
+// `env` is an in/out parameter: for each declaration, in order, the right-hand side
+// is checked against the environment built so far, and then `env` is replaced by a
+// new environment that also binds the declared variable.
+void MessageFormatter::checkDeclarations(MessageContext& context, Environment*& env, UErrorCode &status) const {
+    CHECK_ERROR(status);
+
+    const Binding* decls = getDataModel().getLocalVariablesInternal();
+    U_ASSERT(env != nullptr && decls != nullptr);
+
+    for (int32_t i = 0; i < getDataModel().bindingsLen; i++) {
+        const Binding& decl = decls[i];
+        const Expression& rhs = decl.getValue();
+        check(context, *env, rhs, status);
+
+        // Add a closure to the global environment,
+        // memoizing the value of localEnv up to this point
+
+        // Add the LHS to the environment for checking the next declaration
+        env = Environment::create(decl.getVariable(), Closure(rhs, *env), env, status);
+        CHECK_ERROR(status);
+    }
+}
+} // namespace message2
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/messageformat2_allocation.h b/icu4c/source/i18n/messageformat2_allocation.h
new file mode 100644
index 000000000000..566206d79f30
--- /dev/null
+++ b/icu4c/source/i18n/messageformat2_allocation.h
@@ -0,0 +1,139 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef U_HIDE_DEPRECATED_API
+
+#ifndef MESSAGEFORMAT2_UTILS_H
+#define MESSAGEFORMAT2_UTILS_H
+
+#if U_SHOW_CPLUSPLUS_API
+
+#if !UCONFIG_NO_FORMATTING
+
+#include "unicode/unistr.h"
+#include "uvector.h"
+
+U_NAMESPACE_BEGIN
+
+namespace message2 {
+
+    // Helpers
+
+    // Returns a newly allocated copy of the `len` elements of `source`.
+    // Returns nullptr and sets `len` to 0 if `source` is null or allocation fails.
+    // The caller owns the result.
+    template<typename T>
+    static T* copyArray(const T* source, int32_t& len) { // `len` is an in/out param
+        if (source == nullptr) {
+            len = 0;
+            return nullptr;
+        }
+        T* dest = new T[len];
+        if (dest == nullptr) {
+            // Set length to 0 to prevent the
+            // array from being accessed
+            len = 0;
+        } else {
+            for (int32_t i = 0; i < len; i++) {
+                dest[i] = source[i];
+            }
+        }
+        return dest;
+    }
+
+    // Returns a newly allocated array holding copies of the elements of `source`,
+    // each of which is treated as a `T*`. `len` is an out param: it is set to the
+    // number of elements, or to 0 if allocation fails (nullptr is returned then).
+    template<typename T>
+    static T* copyVectorToArray(const UVector& source, int32_t& len) {
+        len = source.size();
+        T* dest = new T[len];
+        if (dest == nullptr) {
+            // Set length to 0 to prevent the
+            // array from being accessed
+            len = 0;
+        } else {
+            for (int32_t i = 0; i < len; i++) {
+                dest[i] = *(static_cast<T*>(source.elementAt(i)));
+            }
+        }
+        return dest;
+    }
+
+    // Like copyVectorToArray(), but moves each element out of `source`.
+    // `source` is emptied before returning, including when allocation failed.
+    template<typename T>
+    static T* moveVectorToArray(UVector& source, int32_t& len) {
+        len = source.size();
+        T* dest = new T[len];
+        if (dest == nullptr) {
+            // Set length to 0 to prevent the
+            // array from being accessed
+            len = 0;
+        } else {
+            for (int32_t i = 0; i < len; i++) {
+                dest[i] = std::move(*static_cast<T*>(source.elementAt(i)));
+            }
+        }
+        source.removeAllElements();
+        return dest;
+    }
+
+    // Creates an empty UVector with no deleter set.
+    // Returns nullptr if `status` is, or becomes, a failure.
+    inline UVector* createUVectorNoAdopt(UErrorCode& status) {
+        if (U_FAILURE(status)) {
+            return nullptr;
+        }
+        LocalPointer<UVector> result(new UVector(status));
+        if (U_FAILURE(status)) {
+            return nullptr;
+        }
+        return result.orphan();
+    }
+
+    // Creates an empty UVector whose deleter is `uprv_deleteUObject`.
+    // Returns nullptr on failure.
+    inline UVector* createUVector(UErrorCode& status) {
+        UVector* result = createUVectorNoAdopt(status);
+        if (U_FAILURE(status)) {
+            return nullptr;
+        }
+        result->setDeleter(uprv_deleteUObject);
+        return result;
+    }
+
+    // Element comparer for vectors whose elements point to UnicodeStrings.
+    // Declared `inline` rather than `static`: it is named from the inline functions
+    // below, and an inline function defined in a header must refer to the same
+    // entity in every translation unit.
+    inline UBool stringsEqual(const UElement s1, const UElement s2) {
+        return (*static_cast<UnicodeString*>(s1.pointer) == *static_cast<UnicodeString*>(s2.pointer));
+    }
+
+    // Same as createUVector(), with `stringsEqual` installed as the comparer
+    inline UVector* createStringUVector(UErrorCode& status) {
+        UVector* v = createUVector(status);
+        if (U_FAILURE(status)) {
+            return nullptr;
+        }
+        v->setComparer(stringsEqual);
+        return v;
+    }
+
+    // Same as createUVectorNoAdopt(), with `stringsEqual` installed as the comparer
+    inline UVector* createStringVectorNoAdopt(UErrorCode& status) {
+        UVector* v = createUVectorNoAdopt(status);
+        if (U_FAILURE(status)) {
+            return nullptr;
+        }
+        v->setComparer(stringsEqual);
+        return v;
+    }
+
+    // Heap-allocates a `T` move-constructed from `node`.
+    // Returns nullptr if `status` is already a failure; sets
+    // U_MEMORY_ALLOCATION_ERROR if allocation fails. The caller owns the result.
+    template<typename T>
+    inline T* create(T&& node, UErrorCode& status) {
+        if (U_FAILURE(status)) {
+            return nullptr;
+        }
+        T* result = new T(std::move(node));
+        if (result == nullptr) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+        return result;
+    }
+
+} // namespace message2
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
+
+#endif /* U_SHOW_CPLUSPLUS_API */
+
+#endif // MESSAGEFORMAT2_UTILS_H
+
+#endif // U_HIDE_DEPRECATED_API
+// eof
diff --git a/icu4c/source/i18n/messageformat2_arguments.cpp b/icu4c/source/i18n/messageformat2_arguments.cpp
new file mode 100644
index 000000000000..ac2d2365b406
--- /dev/null
+++ b/icu4c/source/i18n/messageformat2_arguments.cpp
@@ -0,0 +1,55 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING
+
+#include "unicode/messageformat2_arguments.h"
+#include "unicode/messageformat2_data_model_names.h"
+#include "uvector.h" // U_ASSERT
+
+U_NAMESPACE_BEGIN
+
+namespace message2 {
+
+    using namespace data_model;
+
+    // ------------------------------------------------------
+    // MessageArguments
+
+    using Arguments = MessageArguments;
+
+    // Looks up the argument named `arg` by a linear scan over the argument names.
+    // Returns a pointer to the matching value, which remains owned by this object.
+    // Returns nullptr if `errorCode` is already a failure; if no argument has that
+    // name, returns nullptr and sets `errorCode` to U_ILLEGAL_ARGUMENT_ERROR.
+    const Formattable* Arguments::getArgument(const VariableName& arg, UErrorCode& errorCode) const {
+        if (U_SUCCESS(errorCode)) {
+            U_ASSERT(argsLen == 0 || arguments.isValid());
+            for (int32_t i = 0; i < argsLen; i++) {
+                if (argumentNames[i] == arg) {
+                    return &arguments[i];
+                }
+            }
+            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        }
+        return nullptr;
+    }
+
+    MessageArguments::~MessageArguments() {}
+
+    // Message arguments
+    // -----------------
+
+    // Move assignment: takes over the name and value arrays of `other`.
+    // Afterwards `other` is empty (argsLen == 0, no arrays), so it still satisfies
+    // the invariant asserted in getArgument().
+    MessageArguments& MessageArguments::operator=(MessageArguments&& other) noexcept {
+        U_ASSERT(other.arguments.isValid() || other.argsLen == 0);
+        // Self-assignment must not clear the object
+        if (this != &other) {
+            argsLen = other.argsLen;
+            // Transfer unconditionally: when `other` is empty this releases
+            // whatever arrays this object was holding
+            argumentNames.adoptInstead(other.argumentNames.orphan());
+            arguments.adoptInstead(other.arguments.orphan());
+            // `other` no longer owns any arrays, so its length must say so
+            other.argsLen = 0;
+        }
+        return *this;
+    }
+
+} // namespace message2
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/messageformat2_cached_formatters.h b/icu4c/source/i18n/messageformat2_cached_formatters.h
new file mode 100644
index 000000000000..1c83441369c7
--- /dev/null
+++ b/icu4c/source/i18n/messageformat2_cached_formatters.h
@@ -0,0 +1,62 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef U_HIDE_DEPRECATED_API
+
+#ifndef MESSAGEFORMAT2_CACHED_FORMATTERS_H
+#define MESSAGEFORMAT2_CACHED_FORMATTERS_H
+
+#if U_SHOW_CPLUSPLUS_API
+
+#if !UCONFIG_NO_FORMATTING
+
+#include "unicode/messageformat2_data_model_names.h"
+#include "unicode/messageformat2_function_registry.h"
+#include "hash.h"
+
+U_NAMESPACE_BEGIN
+
+namespace message2 {
+
+    using namespace data_model;
+
+    // Formatter cache
+    // --------------
+
+    class MessageFormatter;
+
+    // Map from function names to Formatters
+    // Only MessageFormatter (a friend) can construct one, as the constructor is private.
+    class CachedFormatters : public UObject {
+    private:
+        friend class MessageFormatter;
+
+        // Maps stringified FunctionNames onto Formatter*
+        // Adopts its values
+        Hashtable cache;
+        // Installs `uprv_deleteUObject` as the deleter for the cached values
+        CachedFormatters() { cache.setValueDeleter(uprv_deleteUObject); }
+    public:
+        // Returns a pointer because Formatter is an abstract class
+        // The pointer is whatever the table holds for `f`; it stays owned by the cache.
+        // NOTE(review): presumably nullptr when `f` has no entry -- confirm against Hashtable::get()
+        const Formatter* getFormatter(const FunctionName& f) {
+            return static_cast<const Formatter*>(cache.get(f));
+        }
+        // Adopts its argument
+        void adoptFormatter(const FunctionName& f, Formatter* val, UErrorCode& status) {
+            cache.put(f, val, status);
+        }
+        CachedFormatters& operator=(const CachedFormatters&) = delete;
+
+        virtual ~CachedFormatters();
+    };
+
+} // namespace message2
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
+
+#endif /* U_SHOW_CPLUSPLUS_API */
+
+#endif // MESSAGEFORMAT2_CACHED_FORMATTERS_H
+
+#endif // U_HIDE_DEPRECATED_API
+// eof
diff --git a/icu4c/source/i18n/messageformat2_checker.cpp b/icu4c/source/i18n/messageformat2_checker.cpp
new file mode 100644
index 000000000000..9c384a505136
--- /dev/null
+++ b/icu4c/source/i18n/messageformat2_checker.cpp
@@ -0,0 +1,295 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING
+
+#include "messageformat2_allocation.h"
+#include "messageformat2_checker.h"
+#include "messageformat2_macros.h"
+#include "uvector.h" // U_ASSERT
+
+U_NAMESPACE_BEGIN
+
+namespace message2 {
+
+/*
+Checks data model errors
+(see https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md#error-handling )
+
+The following are checked here:
+Variant Key Mismatch
+Missing Fallback Variant (called NonexhaustivePattern here)
+Missing Selector Annotation
+Duplicate Declaration
+  - Most duplicate declaration errors are checked by the parser,
+    but the checker checks for declarations of input variables
+    that were previously implicitly declared
+(Duplicate option names and duplicate declarations are checked by the parser)
+*/
+
+// Type environments
+// -----------------
+
+// Creates the three (initially empty) variable-name vectors.
+// If `status` is or becomes a failure, the vectors not yet created stay unset.
+TypeEnvironment::TypeEnvironment(UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    UVector* temp;
+    temp = createStringVectorNoAdopt(status);
+    CHECK_ERROR(status);
+    annotated.adoptInstead(temp);
+    temp = createStringVectorNoAdopt(status);
+    CHECK_ERROR(status);
+    unannotated.adoptInstead(temp);
+    temp = createStringVectorNoAdopt(status);
+    CHECK_ERROR(status);
+    freeVars.adoptInstead(temp);
+}
+
+// Returns true if `v` contains `var`, using the comparer installed on `v`.
+// The const_cast only adapts to the signature of UVector::contains()
+static bool has(const UVector& v, const VariableName& var) {
+    return v.contains(const_cast<void*>(static_cast<const void*>(&var)));
+}
+
+// Returns true if `var` was either previously used (implicit declaration),
+// or is in scope by an explicit declaration
+bool TypeEnvironment::known(const VariableName& var) const {
+    U_ASSERT(annotated.isValid() && unannotated.isValid() && freeVars.isValid());
+    return has(*annotated, var) || has(*unannotated, var) || has(*freeVars, var);
+}
+
+// Returns the type recorded for `var`; a variable that was never recorded
+// is reported as Unannotated
+TypeEnvironment::Type TypeEnvironment::get(const VariableName& var) const {
+    U_ASSERT(annotated.isValid());
+    if (has(*annotated, var)) {
+        return Annotated;
+    }
+    U_ASSERT(unannotated.isValid());
+    if (has(*unannotated, var)) {
+        return Unannotated;
+    }
+    U_ASSERT(freeVars.isValid());
+    if (has(*freeVars, var)) {
+        return FreeVariable;
+    }
+    // This case is a "free variable without an implicit declaration",
+    // i.e. one used only in a selector expression and not in a declaration RHS
+    return Unannotated;
+}
+
+// Records `var` as having type `t`.
+// The vectors store the address of `var` and do not copy or adopt it.
+void TypeEnvironment::extend(const VariableName& var, TypeEnvironment::Type t, UErrorCode& status) {
+    if (t == Unannotated) {
+        U_ASSERT(unannotated.isValid());
+        // See comment below
+        unannotated->addElement(const_cast<void*>(static_cast<const void*>(&var)), status);
+        return;
+    }
+
+    if (t == FreeVariable) {
+        U_ASSERT(freeVars.isValid());
+        // See comment below
+        freeVars->addElement(const_cast<void*>(static_cast<const void*>(&var)), status);
+        return;
+    }
+
+    U_ASSERT(annotated.isValid());
+    // This is safe because elements of `annotated` are never written
+    // and the lifetime of `var` is guaranteed to include the lifetime of
+    // `annotated`
+    annotated->addElement(const_cast<void*>(static_cast<const void*>(&var)), status);
+}
+
+TypeEnvironment::~TypeEnvironment() {}
+
+// ---------------------
+
+// Returns true if all `len` keys are wildcards
+static bool areDefaultKeys(const Key* keys, int32_t len) {
+    U_ASSERT(len > 0);
+    for (int32_t i = 0; i < len; i++) {
+        if (!keys[i].isWildcard()) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// If `rand` is a variable that `t` does not know yet, records it as a free variable
+void Checker::addFreeVars(TypeEnvironment& t, const Operand& rand, UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    if (rand.isVariable()) {
+        const VariableName& v = rand.asVariable();
+        if (!t.known(v)) {
+            t.extend(v, TypeEnvironment::Type::FreeVariable, status);
+        }
+    }
+}
+
+// Records the free variables among the option values of `opts`
+void Checker::addFreeVars(TypeEnvironment& t, const OptionMap& opts, UErrorCode& status) {
+    for (int32_t i = 0; i < opts.size(); i++) {
+        const Option& o = opts.getOption(i, status);
+        CHECK_ERROR(status);
+        addFreeVars(t, o.getValue(), status);
+    }
+}
+
+// Records the free variables in the options of `rator`; a reserved operator is skipped
+void Checker::addFreeVars(TypeEnvironment& t, const Operator& rator, UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    if (!rator.isReserved()) {
+        addFreeVars(t, rator.getOptionsInternal(), status);
+    }
+}
+
+// Records the free variables of `rhs`: those of its operator (if it is a function call)
+// and its operand
+void Checker::addFreeVars(TypeEnvironment& t, const Expression& rhs, UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    if (rhs.isFunctionCall()) {
+        const Operator* rator = rhs.getOperator(status);
+        U_ASSERT(U_SUCCESS(status));
+        addFreeVars(t, *rator, status);
+    }
+    addFreeVars(t, rhs.getOperand(), status);
+}
+
+// Adds a Variant Key Mismatch error if some variant's key count differs from the
+// number of selectors; otherwise adds a NonexhaustivePattern error if no variant
+// consists only of wildcard keys. At most one error is added per call.
+void Checker::checkVariants(UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    U_ASSERT(!dataModel.hasPattern());
+
+    // Check that each variant has a key list with size
+    // equal to the number of selectors
+    const Variant* variants = dataModel.getVariantsInternal();
+
+    // Check that one variant includes only wildcards
+    bool defaultExists = false;
+
+    for (int32_t i = 0; i < dataModel.numVariants(); i++) {
+        const SelectorKeys& k = variants[i].getKeys();
+        const Key* keys = k.getKeysInternal();
+        int32_t len = k.len;
+        if (len != dataModel.numSelectors()) {
+            // Variant key mismatch
+            errors.addError(StaticErrorType::VariantKeyMismatchError, status);
+            return;
+        }
+        defaultExists |= areDefaultKeys(keys, len);
+    }
+    if (!defaultExists) {
+        errors.addError(StaticErrorType::NonexhaustivePattern, status);
+        return;
+    }
+}
+
+// Adds a Missing Selector Annotation error unless `selectorExpr` is a function call
+// or a (non-reserved) reference to a variable whose type in `t` is Annotated
+void Checker::requireAnnotated(const TypeEnvironment& t, const Expression& selectorExpr, UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    if (selectorExpr.isFunctionCall()) {
+        return; // No error
+    }
+    if (!selectorExpr.isReserved()) {
+        const Operand& rand = selectorExpr.getOperand();
+        if (rand.isVariable()) {
+            if (t.get(rand.asVariable()) == TypeEnvironment::Type::Annotated) {
+                return; // No error
+            }
+        }
+    }
+    // If this code is reached, an error was detected
+    errors.addError(StaticErrorType::MissingSelectorAnnotation, status);
+}
+
+void Checker::checkSelectors(const TypeEnvironment& t, UErrorCode& status) {
+    // Same early exit as the other checks; requireAnnotated() would do nothing
+    // on a failed status anyway
+    CHECK_ERROR(status);
+
+    U_ASSERT(!dataModel.hasPattern());
+
+    // Check each selector; if it's not annotated, emit a
+    // "missing selector annotation" error
+    const Expression* selectors = dataModel.getSelectorsInternal();
+    for (int32_t i = 0; i < dataModel.numSelectors(); i++) {
+        requireAnnotated(t, selectors[i], status);
+    }
+}
+
+// Computes the type of a declaration's right-hand side: a function call is
+// Annotated; a reserved expression or a literal is Unannotated; a variable
+// reference has whatever type `t` reports for that variable
+TypeEnvironment::Type typeOf(TypeEnvironment& t, const Expression& expr) {
+    if (expr.isFunctionCall()) {
+        return TypeEnvironment::Type::Annotated;
+    }
+    if (expr.isReserved()) {
+        return TypeEnvironment::Type::Unannotated;
+    }
+    const Operand& rand = expr.getOperand();
+    U_ASSERT(!rand.isNull());
+    if (rand.isLiteral()) {
+        return TypeEnvironment::Type::Unannotated;
+    }
+    U_ASSERT(rand.isVariable());
+    return t.get(rand.asVariable());
+}
+
+void Checker::checkDeclarations(TypeEnvironment& t, UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    // For each declaration, extend the type environment with its type
+    // Only a very simple type system is necessary: variables
+    // have the type "annotated", "unannotated", or "free".
+    // For "missing selector annotation" checking, free variables
+    // (message arguments) are treated as unannotated.
+    // Free variables are also used for checking duplicate declarations.
+    const Binding* env = dataModel.getLocalVariablesInternal();
+    for (int32_t i = 0; i < dataModel.bindingsLen; i++) {
+        const Binding& b = env[i];
+        const VariableName& lhs = b.getVariable();
+        const Expression& rhs = b.getValue();
+
+        // First, add free variables from the RHS of b
+        // This must be done first so we can catch:
+        // .local $foo = {$foo}
+        // (where the RHS is the first use of $foo)
+        if (b.isLocal()) {
+            addFreeVars(t, rhs, status);
+        } else if (b.hasAnnotation()) {
+            // Input declaration; if b has no annotation, there's nothing to check
+            const OptionMap& opts = b.getOptionsInternal();
+            // For .input declarations, we just need to add any variables
+            // referenced in the options
+            addFreeVars(t, opts, status);
+        }
+        // Next, check if the LHS equals any free variables
+        // whose implicit declarations are in scope
+        // (this applies to .local and .input declarations alike)
+        if (t.known(lhs) && t.get(lhs) == TypeEnvironment::Type::FreeVariable) {
+            errors.addError(StaticErrorType::DuplicateDeclarationError, status);
+        }
+        // Next, extend the type environment with a binding from lhs to its type
+        t.extend(lhs, typeOf(t, rhs), status);
+    }
+
+    // Check for unsupported statements
+    if (dataModel.unsupportedStatementsLen > 0) {
+        errors.addError(StaticErrorType::UnsupportedStatementError, status);
+    }
+}
+
+// Entry point: checks the declarations, and for a selectors message also the
+// selectors and the variants. Errors found are added to `errors`.
+void Checker::check(UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    TypeEnvironment typeEnv(status);
+    checkDeclarations(typeEnv, status);
+    // Pattern message
+    if (dataModel.hasPattern()) {
+        return;
+    } else {
+      // Selectors message
+      checkSelectors(typeEnv, status);
+      checkVariants(status);
+    }
+}
+
+} // namespace message2
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
diff --git a/icu4c/source/i18n/messageformat2_checker.h b/icu4c/source/i18n/messageformat2_checker.h
new file mode 100644
index 000000000000..ffb617b67a7f
--- /dev/null
+++ b/icu4c/source/i18n/messageformat2_checker.h
@@ -0,0 +1,91 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#ifndef U_HIDE_DEPRECATED_API
+
+#ifndef MESSAGEFORMAT_CHECKER_H
+#define MESSAGEFORMAT_CHECKER_H
+
+#if U_SHOW_CPLUSPLUS_API
+
+#if !UCONFIG_NO_FORMATTING
+
+#include "unicode/messageformat2_data_model.h"
+#include "messageformat2_errors.h"
+
+U_NAMESPACE_BEGIN
+
+namespace message2 {
+
+    using namespace data_model;
+
+    // Used for checking missing selector annotation errors
+    // and duplicate declaration errors (specifically for
+    // implicit declarations)
+    class TypeEnvironment : public UMemory {
+    public:
+        // MessageFormat has a simple type system;
+        // variables are in-scope and annotated; in-scope and unannotated;
+        // or free (a free variable has no explicit declaration in the scope
+        // of its use.)
+        enum Type {
+            Annotated,
+            Unannotated,
+            FreeVariable
+        };
+        // Records the given variable with the given type
+        void extend(const VariableName&, Type, UErrorCode& status);
+        // Returns the recorded type, or Unannotated for a variable never recorded
+        Type get(const VariableName&) const;
+        // Returns true if the variable has been recorded with any type
+        bool known(const VariableName&) const;
+        TypeEnvironment(UErrorCode& status);
+
+        virtual ~TypeEnvironment();
+
+    private:
+        // Stores variables known to be annotated.
+        LocalPointer<UVector> annotated; // Vector of `VariableName`s
+        // Stores variables that are in-scope but unannotated.
+        LocalPointer<UVector> unannotated; // Vector of `VariableName`s
+        // Stores free variables that are used in the RHS of a declaration
+        LocalPointer<UVector> freeVars; // Vector of `VariableNames`; tracks free variables
+                                        // This can't just be "variables that don't appear in
+                                        // `annotated` or `unannotated`", as a use introduces
+                                        // an implicit declaration
+    }; // class TypeEnvironment
+
+    // Checks a data model for semantic errors
+    // (Errors are defined in https://github.com/unicode-org/message-format-wg/blob/main/spec/formatting.md )
+    class Checker {
+    public:
+        // Runs all checks, adding any errors found to the StaticErrors passed to the constructor
+        void check(UErrorCode&);
+        // Both arguments are held by reference and must outlive the Checker
+        Checker(const MFDataModel& m, StaticErrors& e) : dataModel(m), errors(e) {}
+    private:
+
+        void requireAnnotated(const TypeEnvironment&, const Expression&, UErrorCode&);
+        void addFreeVars(TypeEnvironment& t, const Operand&, UErrorCode&);
+        void addFreeVars(TypeEnvironment& t, const Operator&, UErrorCode&);
+        void addFreeVars(TypeEnvironment& t, const OptionMap&, UErrorCode&);
+        void addFreeVars(TypeEnvironment& t, const Expression&, UErrorCode&);
+        void checkDeclarations(TypeEnvironment&, UErrorCode&);
+        void checkSelectors(const TypeEnvironment&, UErrorCode&);
+        void checkVariants(UErrorCode&);
+        // NOTE(review): messageformat2_checker.cpp defines none of the next four
+        // overloads; they look like leftovers -- confirm before removing
+        void check(const OptionMap&);
+        void check(const Operand&);
+        void check(const Expression&);
+        void check(const Pattern&);
+        const MFDataModel& dataModel;
+        StaticErrors& errors;
+    }; // class Checker
+
+} // namespace message2
+
+U_NAMESPACE_END
+
+#endif /* #if !UCONFIG_NO_FORMATTING */
+
+#endif /* U_SHOW_CPLUSPLUS_API */
+
+#endif // MESSAGEFORMAT_CHECKER_H
+
+#endif // U_HIDE_DEPRECATED_API
+// eof
+
diff --git a/icu4c/source/i18n/messageformat2_data_model.cpp b/icu4c/source/i18n/messageformat2_data_model.cpp
new file mode 100644
index 000000000000..19574ada4341
--- /dev/null
+++ b/icu4c/source/i18n/messageformat2_data_model.cpp
@@ -0,0 +1,1098 @@
+// © 2024 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_FORMATTING
+
+#include "unicode/messageformat2_data_model.h"
+#include "messageformat2_allocation.h"
+#include "messageformat2_macros.h"
+#include "uvector.h"
+
+U_NAMESPACE_BEGIN
+
+namespace message2 {
+
+// Implementation
+
+//------------------ SelectorKeys
+
+// Returns a non-owning pointer to the key array (null if there is none)
+const Key* SelectorKeys::getKeysInternal() const {
+    return keys.getAlias();
+}
+
+// Lexically order key lists
+// (a shorter list always orders before a longer one)
+bool SelectorKeys::operator<(const SelectorKeys& other) const {
+    // Handle key lists of different sizes first --
+    // this case does have to be handled (even though it would
+    // reflect a data model error) because of the need to produce
+    // partial output
+    if (len < other.len) {
+        return true;
+    }
+    if (len > other.len) {
+        return false;
+    }
+
+    for (int32_t i = 0; i < len; i++) {
+        if (keys[i] < other.keys[i]) {
+            return true;
+        }
+        if (!(keys[i] == other.keys[i])) {
+            return false;
+        }
+    }
+    // If we've reached here, all keys must be equal
+    return false;
+}
+
+SelectorKeys::Builder::Builder(UErrorCode& status) {
+    keys = createUVector(status);
+}
+
+// Appends `key` to the list being built; does nothing if `status` is a failure
+SelectorKeys::Builder& SelectorKeys::Builder::add(Key&& key, UErrorCode& status) noexcept {
+    U_ASSERT(keys != nullptr);
+    if (U_SUCCESS(status)) {
+        Key* k = create(std::move(key), status);
+        keys->adoptElement(k, status);
+    }
+    return *this;
+}
+
+// Returns a SelectorKeys holding copies of the keys added so far,
+// or a default-constructed one if `status` is a failure
+SelectorKeys SelectorKeys::Builder::build(UErrorCode& status) const {
+    if (U_FAILURE(status)) {
+        return {};
+    }
+    U_ASSERT(keys != nullptr);
+    return SelectorKeys(*keys, status);
+}
+
+SelectorKeys::Builder::~Builder() {
+    // Deleting a null pointer is a no-op, so no check is needed
+    delete keys;
+}
+
+// Copies the keys out of `ks` (a vector of `Key`s).
+// On any failure the object is left empty: `len` is 0 and there is no key array.
+SelectorKeys::SelectorKeys(const UVector& ks, UErrorCode& status) : len(ks.size()) {
+    if (U_FAILURE(status)) {
+        // Nothing was copied, so the length must not claim otherwise
+        len = 0;
+        return;
+    }
+    Key* result = copyVectorToArray<Key>(ks, len);
+    if (result == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        len = 0;
+        return;
+    }
+    keys.adoptInstead(result);
+}
+
+SelectorKeys& SelectorKeys::operator=(SelectorKeys other) noexcept {
+    swap(*this, other);
+    return *this;
+}
+
+// Copy constructor; if the copy cannot be allocated, copyArray() sets `len` to 0
+SelectorKeys::SelectorKeys(const SelectorKeys& other) : len(other.len) {
+    keys.adoptInstead(copyArray(other.keys.getAlias(), len));
+}
+
+SelectorKeys::~SelectorKeys() {
+    len = 0;
+}
+
+//------------------ Literal
+
+bool Literal::operator<(const Literal& other) const {
+    // Ignore quoting for the purposes of ordering
+    return contents < other.contents;
+}
+
+bool Literal::operator==(const Literal& other) const {
+    // Ignore quoting for the purposes of equality
+    return contents == other.contents;
+}
+
+// Returns the contents wrapped in a pair of PIPE characters
+UnicodeString Literal::quoted() const {
+    UnicodeString result(PIPE);
+    result += unquoted();
+    result += PIPE;
+    return result;
+}
+
+const UnicodeString& Literal::unquoted() const { return contents; }
+
+Literal& Literal::operator=(Literal other) noexcept {
+    swap(*this, other);
+
+    return *this;
+}
+
+Literal::~Literal() {
+    thisIsQuoted = false;
+}
+
+//------------------ Operand
+
+Operand::Operand(const Operand& other) : contents(other.contents) {}
+
+Operand& Operand::operator=(Operand other) noexcept {
+    swap(*this, other);
+
+    return *this;
+}
+
+// An operand is exactly one of: null (no contents), a variable, or a literal
+UBool Operand::isVariable() const {
+    return (contents.has_value() && std::holds_alternative<VariableName>(*contents));
+}
+UBool Operand::isLiteral() const {
+    return (contents.has_value() && std::holds_alternative<Literal>(*contents));
+}
+UBool Operand::isNull() const { return !contents.has_value(); }
+
+// Precondition: isLiteral()
+const Literal& Operand::asLiteral() const {
+    U_ASSERT(isLiteral());
+    return *(std::get_if<Literal>(&(*contents)));
+}
+
+// Precondition: isVariable()
+const VariableName& Operand::asVariable() const {
+    U_ASSERT(isVariable());
+    return *(std::get_if<VariableName>(&(*contents)));
+}
+
+Operand::~Operand() {}
+
+//---------------- Key
+
+Key& Key::operator=(Key other) noexcept {
+    swap(*this, other);
+    return *this;
+}
+
+bool Key::operator<(const Key& other) const {
+    // Arbitrarily treat * as greater than all concrete keys
+    if (isWildcard()) {
+        return false;
+    }
+    if (other.isWildcard()) {
+        return true;
+    }
+    return (asLiteral() < other.asLiteral());
+}
+
+// Two keys are equal if both are wildcards, or both are literals that compare equal
+bool Key::operator==(const Key& other) const {
+    if (isWildcard()) {
+        return other.isWildcard();
+    }
+    // A wildcard has no literal, so asLiteral() must not be called on `other` here
+    if (other.isWildcard()) {
+        return false;
+    }
+    return (asLiteral() == other.asLiteral());
+}
+
+// Precondition: !isWildcard()
+const Literal& Key::asLiteral() const {
+    U_ASSERT(!isWildcard());
+    return *contents;
+}
+
+Key::~Key() {}
+
+// ------------ Reserved
+
+// Copy constructor
+// If the copy cannot be allocated, copyArray() sets `len` to 0
+Reserved::Reserved(const Reserved& other) {
+    len = other.len;
+    parts.adoptInstead(copyArray(other.parts.getAlias(), len));
+}
+
+Reserved& Reserved::operator=(Reserved other) noexcept {
+    swap(*this, other);
+    return *this;
+}
+
+// Copies the parts out of `ps` (a vector of `Literal`s).
+// On any failure the object is left empty: `len` is 0 and there is no parts array;
+// an allocation failure is reported through `status`, as in SelectorKeys.
+Reserved::Reserved(const UVector& ps, UErrorCode& status) noexcept : len(ps.size()) {
+    if (U_FAILURE(status)) {
+        // Nothing was copied, so the length must not claim otherwise
+        len = 0;
+        return;
+    }
+    Literal* result = copyVectorToArray<Literal>(ps, len);
+    if (result == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        len = 0;
+        return;
+    }
+    parts.adoptInstead(result);
+}
+
+int32_t Reserved::numParts() const {
+    return len;
+}
+
+// Precondition: i < numParts()
+const Literal& Reserved::getPart(int32_t i) const {
+    U_ASSERT(i < numParts());
+    return parts[i];
+}
+
+Reserved::Builder::Builder(UErrorCode& status) {
+    parts = createUVector(status);
+}
+
+// Returns a Reserved holding copies of the parts added so far,
+// or a default-constructed one if `status` is a failure
+Reserved Reserved::Builder::build(UErrorCode& status) const noexcept {
+    if (U_FAILURE(status)) {
+        return {};
+    }
+    U_ASSERT(parts != nullptr);
+    return Reserved(*parts, status);
+}
+
+// Appends `part` to the sequence being built; does nothing if `status` is a failure
+Reserved::Builder& Reserved::Builder::add(Literal&& part, UErrorCode& status) noexcept {
+    U_ASSERT(parts != nullptr);
+    if (U_SUCCESS(status)) {
+        Literal* l = create(std::move(part), status);
+        parts->adoptElement(l, status);
+    }
+    return *this;
+}
+
+Reserved::Builder::~Builder() {
+    // Deleting a null pointer is a no-op, so no check is needed
+    delete parts;
+}
+
+Reserved::~Reserved() {
+    len = 0;
+}
+
+//------------------------ Operator
+
+OptionMap::OptionMap(const UVector& opts, UErrorCode& status) {
+    CHECK_ERROR(status);
+
+    len = opts.size();
+    Option* result = copyVectorToArray