createObjectsFromName(): be more tolerant about N/S vs North/South, absence of zone or height

We want the following matches to be possible:

user entry          official name
------------------  ----------------------
EGM96               EGM96 height
WGS84 UTM31 north   WGS 84 / UTM zone 31N
rouault · Nov 10, 2024 · 09e54a9 · 09e54a9
1 parent 9f2289c
commit 09e54a9
Show file tree

Hide file tree

Showing 7 changed files with 246 additions and 39 deletions.
diff --git a/include/proj/metadata.hpp b/include/proj/metadata.hpp
@@ -397,11 +397,15 @@ class PROJ_GCC_DLL Identifier : public util::BaseObject,
 
     PROJ_DLL static bool isEquivalentName(const char *a,
                                           const char *b) noexcept;
+    PROJ_DLL static bool
+    isEquivalentName(const char *a, const char *b,
+                     bool biggerDifferencesAllowed) noexcept;
 
     PROJ_PRIVATE :
         //! @cond Doxygen_Suppress
         PROJ_INTERNAL static std::string
-        canonicalizeName(const std::string &str);
+        canonicalizeName(const std::string &str,
+                         bool biggerDifferencesAllowed = true);
 
     PROJ_INTERNAL void _exportToWKT(io::WKTFormatter *formatter)
         const override; // throw(io::FormattingException)

diff --git a/scripts/reference_exported_symbols.txt b/scripts/reference_exported_symbols.txt
@@ -523,6 +523,7 @@ osgeo::proj::metadata::Identifier::description() const
 osgeo::proj::metadata::Identifier::~Identifier()
 osgeo::proj::metadata::Identifier::Identifier(osgeo::proj::metadata::Identifier const&)
 osgeo::proj::metadata::Identifier::isEquivalentName(char const*, char const*)
+osgeo::proj::metadata::Identifier::isEquivalentName(char const*, char const*, bool)
 osgeo::proj::metadata::Identifier::uri() const
 osgeo::proj::metadata::Identifier::version() const
 osgeo::proj::metadata::PositionalAccuracy::create(std::string const&)

diff --git a/src/apps/projinfo.cpp b/src/apps/projinfo.cpp
@@ -370,6 +370,7 @@ static BaseObjectNNPtr buildObject(
                         limitResultCount);
                     if (res.size() == 1) {
                         obj = res.front().as_nullable();
+                        break;
                     } else {
                         for (const auto &l_obj : res) {
                             if (Identifier::isEquivalentName(

diff --git a/src/iso19111/factory.cpp b/src/iso19111/factory.cpp
@@ -9171,7 +9171,8 @@ AuthorityFactory::createObjectsFromNameEx(
         auto sqlRes = d->run(sql, params);
         bool isFirst = true;
         bool firstIsDeprecated = false;
-        bool foundExactMatch = false;
+        size_t countExactMatch = 0;
+        size_t countExactMatchOnAlias = 0;
         std::size_t hashCodeFirstMatch = 0;
         for (const auto &row : sqlRes) {
             const auto &name = row[3];
@@ -9262,9 +9263,12 @@ AuthorityFactory::createObjectsFromNameEx(
                 throw std::runtime_error("Unsupported table_name");
             };
             const auto obj = getObject(table_name, code);
-            if (metadata::Identifier::canonicalizeName(obj->nameStr()) ==
-                canonicalizedSearchedName) {
-                foundExactMatch = true;
+            if (metadata::Identifier::isEquivalentName(
+                    obj->nameStr().c_str(), searchedName.c_str(), false)) {
+                countExactMatch++;
+            } else if (metadata::Identifier::isEquivalentName(
+                           name.c_str(), searchedName.c_str(), false)) {
+                countExactMatchOnAlias++;
             }
 
             const auto objPtr = obj.get();
@@ -9280,14 +9284,21 @@ AuthorityFactory::createObjectsFromNameEx(
             }
         }
 
-        // If we found a name that is an exact match, and all objects have the
-        // same type, and we are not in approximate mode, only keep the
-        // object(s) with the exact name match.
-        if (foundExactMatch && hashCodeFirstMatch != 0 && !approximateMatch) {
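+        // If we found at least one object that is an exact match (on its name
+        // or on one of its aliases), and all objects have the same type, and
+        // we are not in approximate mode, only keep the objects whose name
+        // matches. When no object name matches exactly, the comparison is
+        // relaxed and objects matching through an alias are kept as well.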
+        if ((countExactMatch + countExactMatchOnAlias) >= 1 &&
+            hashCodeFirstMatch != 0 && !approximateMatch) {
             std::list<PairObjectName> resTmp;
+            bool biggerDifferencesAllowed = (countExactMatch == 0);
             for (const auto &pair : res) {
-                if (metadata::Identifier::canonicalizeName(
-                        pair.first->nameStr()) == canonicalizedSearchedName) {
+                if (metadata::Identifier::isEquivalentName(
+                        pair.first->nameStr().c_str(), searchedName.c_str(),
+                        biggerDifferencesAllowed) ||
+                    (countExactMatch == 0 &&
+                     metadata::Identifier::isEquivalentName(
+                         pair.second.c_str(), searchedName.c_str(),
+                         biggerDifferencesAllowed))) {
                     resTmp.emplace_back(pair);
                 }
             }
@@ -9298,6 +9309,7 @@ AuthorityFactory::createObjectsFromNameEx(
     auto sortLambda = [](const PairObjectName &a, const PairObjectName &b) {
         const auto &aName = a.first->nameStr();
         const auto &bName = b.first->nameStr();
+
         if (aName.size() < bName.size()) {
             return true;
         }

diff --git a/src/iso19111/io.cpp b/src/iso19111/io.cpp
@@ -7966,20 +7966,23 @@ static BaseObjectNNPtr createFromUserInput(const std::string &text,
 
                 // If there's exactly only one object whose name is equivalent
                 // to the user input, return it.
-                IdentifiedObjectPtr identifiedObj;
-                for (const auto &obj : res) {
-                    if (Identifier::isEquivalentName(obj->nameStr().c_str(),
-                                                     objectName.c_str())) {
-                        if (identifiedObj == nullptr) {
-                            identifiedObj = obj.as_nullable();
-                        } else {
-                            identifiedObj = nullptr;
-                            break;
+                for (int pass = 0; pass <= 1; ++pass) {
+                    IdentifiedObjectPtr identifiedObj;
+                    for (const auto &obj : res) {
+                        if (Identifier::isEquivalentName(
+                                obj->nameStr().c_str(), objectName.c_str(),
+                                /* biggerDifferencesAllowed = */ pass == 1)) {
+                            if (identifiedObj == nullptr) {
+                                identifiedObj = obj.as_nullable();
+                            } else {
+                                identifiedObj = nullptr;
+                                break;
+                            }
                         }
                     }
-                }
-                if (identifiedObj) {
-                    return identifiedObj;
+                    if (identifiedObj) {
+                        return identifiedObj;
+                    }
                 }
 
                 std::string msg("several objects matching this name: ");

diff --git a/src/iso19111/metadata.cpp b/src/iso19111/metadata.cpp
@@ -1214,6 +1214,14 @@ static bool isIgnoredChar(char ch) {
 
 // ---------------------------------------------------------------------------
 
+//! @cond Doxygen_Suppress
+static char lower(char ch) {
+    return ch >= 'A' && ch <= 'Z' ? ch - 'A' + 'a' : ch;
+}
+//! @endcond
+
+// ---------------------------------------------------------------------------
+
 //! @cond Doxygen_Suppress
 static const struct utf8_to_lower {
     const char *utf8;
@@ -1249,21 +1257,94 @@ static const struct utf8_to_lower *get_ascii_replacement(const char *c_str) {
 // ---------------------------------------------------------------------------
 
 //! @cond Doxygen_Suppress
-std::string Identifier::canonicalizeName(const std::string &str) {
+
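+/** Checks if c_str starts with needle, comparing ASCII letters in a
+ * case-insensitive way.
+ *
+ * e.g. matchesLowerCase("JavaScript", "java") returns true, whereas
+ * matchesLowerCase("JavaScript", "script") returns false.
+ */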
+static bool matchesLowerCase(const char *c_str, const char *needle) {
+    size_t i = 0;
+    for (; c_str[i] && needle[i]; ++i) {
+        if (lower(c_str[i]) != lower(needle[i])) {
+            return false;
+        }
+    }
+    return needle[i] == 0;
+}
+//! @endcond
+
+// ---------------------------------------------------------------------------
+
+//! @cond Doxygen_Suppress
+
+static inline bool isdigit(char ch) { return ch >= '0' && ch <= '9'; }
+//! @endcond
+
+// ---------------------------------------------------------------------------
+
+//! @cond Doxygen_Suppress
+std::string Identifier::canonicalizeName(const std::string &str,
+                                         bool biggerDifferencesAllowed) {
     std::string res;
     const char *c_str = str.c_str();
     for (size_t i = 0; c_str[i] != 0; ++i) {
-        const auto ch = c_str[i];
+        const auto ch = lower(c_str[i]);
         if (ch == ' ' && c_str[i + 1] == '+' && c_str[i + 2] == ' ') {
             i += 2;
             continue;
         }
-        if (ch == '1' && !res.empty() &&
-            !(res.back() >= '0' && res.back() <= '9') && c_str[i + 1] == '9' &&
-            c_str[i + 2] >= '0' && c_str[i + 2] <= '9') {
+
+        // Canonicalize "19dd" (where d is a digit) as "dd"
+        if (ch == '1' && !res.empty() && !isdigit(res.back()) &&
+            c_str[i + 1] == '9' && isdigit(c_str[i + 2]) &&
+            isdigit(c_str[i + 3])) {
             ++i;
             continue;
         }
+
+        if (biggerDifferencesAllowed) {
+
+            const auto skipSubstring = [](char l_ch, const char *l_str,
+                                          size_t &idx, const char *substr) {
+                if (l_ch == substr[0] && idx > 0 &&
+                    isIgnoredChar(l_str[idx - 1]) &&
+                    matchesLowerCase(l_str + idx, substr)) {
+                    idx += strlen(substr) - 1;
+                    return true;
+                }
+                return false;
+            };
+
+            // Skip "zone" or "height" if preceding character is a space
+            if (skipSubstring(ch, c_str, i, "zone") ||
+                skipSubstring(ch, c_str, i, "height")) {
+                continue;
+            }
+
+            // Replace a substring by its first character if preceding character
+            // is a space or a digit
+            const auto replaceByFirstChar = [](char l_ch, const char *l_str,
+                                               size_t &idx, const char *substr,
+                                               std::string &l_res) {
+                if (l_ch == substr[0] && idx > 0 &&
+                    (isIgnoredChar(l_str[idx - 1]) ||
+                     isdigit(l_str[idx - 1])) &&
+                    matchesLowerCase(l_str + idx, substr)) {
+                    l_res.push_back(l_ch);
+                    idx += strlen(substr) - 1;
+                    return true;
+                }
+                return false;
+            };
+
+            // Replace "north" or "south" by its first character if preceding
+            // character is a space or a digit
+            if (replaceByFirstChar(ch, c_str, i, "north", res) ||
+                replaceByFirstChar(ch, c_str, i, "south", res)) {
+                continue;
+            }
+        }
+
         if (static_cast<unsigned char>(ch) > 127) {
             const auto *replacement = get_ascii_replacement(c_str + i);
             if (replacement) {
@@ -1273,7 +1354,7 @@ std::string Identifier::canonicalizeName(const std::string &str) {
             }
         }
         if (!isIgnoredChar(ch)) {
-            res.push_back(static_cast<char>(::tolower(ch)));
+            res.push_back(ch);
         }
     }
     return res;
@@ -1286,15 +1367,22 @@ std::string Identifier::canonicalizeName(const std::string &str) {
  *
  * Two names are equivalent by removing any space, underscore, dash, slash,
  * { or } character from them, and comparing in a case insensitive way.
+ *
+ * @param a first string
+ * @param b second string
+ * @param biggerDifferencesAllowed if true, "height" and "zone" words are
+ * ignored, and "north" is shortened as "n" and "south" as "s".
+ * @since 9.6
  */
-bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
+bool Identifier::isEquivalentName(const char *a, const char *b,
+                                  bool biggerDifferencesAllowed) noexcept {
     size_t i = 0;
     size_t j = 0;
     char lastValidA = 0;
     char lastValidB = 0;
     while (a[i] != 0 || b[j] != 0) {
-        char aCh = a[i];
-        char bCh = b[j];
+        char aCh = lower(a[i]);
+        char bCh = lower(b[j]);
         if (aCh == ' ' && a[i + 1] == '+' && a[i + 2] == ' ' && a[i + 3] != 0) {
             i += 3;
             continue;
@@ -1311,18 +1399,69 @@ bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
             ++j;
             continue;
         }
-        if (aCh == '1' && !(lastValidA >= '0' && lastValidA <= '9') &&
-            a[i + 1] == '9' && a[i + 2] >= '0' && a[i + 2] <= '9') {
+
+        // Canonicalize "19dd" (where d is a digit) as "dd"
+        if (aCh == '1' && !isdigit(lastValidA) && a[i + 1] == '9' &&
+            isdigit(a[i + 2]) && isdigit(a[i + 3])) {
             i += 2;
             lastValidA = '9';
             continue;
         }
-        if (bCh == '1' && !(lastValidB >= '0' && lastValidB <= '9') &&
-            b[j + 1] == '9' && b[j + 2] >= '0' && b[j + 2] <= '9') {
+        if (bCh == '1' && !isdigit(lastValidB) && b[j + 1] == '9' &&
+            isdigit(b[j + 2]) && isdigit(b[j + 3])) {
             j += 2;
             lastValidB = '9';
             continue;
         }
+
+        if (biggerDifferencesAllowed) {
+            // Skip a substring if preceding character is a space
+            const auto skipSubString = [](char ch, const char *str, size_t &idx,
+                                          const char *substr) {
+                if (ch == substr[0] && idx > 0 && isIgnoredChar(str[idx - 1]) &&
+                    matchesLowerCase(str + idx, substr)) {
+                    idx += strlen(substr);
+                    return true;
+                }
+                return false;
+            };
+
+            bool skip = false;
+            if (skipSubString(aCh, a, i, "zone"))
+                skip = true;
+            if (skipSubString(bCh, b, j, "zone"))
+                skip = true;
+            if (skip)
+                continue;
+
+            if (skipSubString(aCh, a, i, "height"))
+                skip = true;
+            if (skipSubString(bCh, b, j, "height"))
+                skip = true;
+            if (skip)
+                continue;
+
+            // Replace a substring by its first character if preceding character
+            // is a space or a digit
+            const auto replaceByFirstChar = [](char ch, const char *str,
+                                               size_t &idx,
+                                               const char *substr) {
+                if (ch == substr[0] && idx > 0 &&
+                    (isIgnoredChar(str[idx - 1]) || isdigit(str[idx - 1])) &&
+                    matchesLowerCase(str + idx, substr)) {
+                    idx += strlen(substr) - 1;
+                    return true;
+                }
+                return false;
+            };
+
+            if (!replaceByFirstChar(aCh, a, i, "north"))
+                replaceByFirstChar(aCh, a, i, "south");
+
+            if (!replaceByFirstChar(bCh, b, j, "north"))
+                replaceByFirstChar(bCh, b, j, "south");
+        }
+
         if (static_cast<unsigned char>(aCh) > 127) {
             const auto *replacement = get_ascii_replacement(a + i);
             if (replacement) {
@@ -1337,8 +1476,7 @@ bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
                 j += strlen(replacement->utf8) - 1;
             }
         }
-        if ((aCh == 0 && bCh != 0) || (aCh != 0 && bCh == 0) ||
-            ::tolower(aCh) != ::tolower(bCh)) {
+        if (aCh != bCh) {
             return false;
         }
         lastValidA = aCh;
@@ -1353,6 +1491,17 @@ bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
 
 // ---------------------------------------------------------------------------
 
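+/** \brief Returns whether two names are considered equivalent.
+ *
+ * Two names are equivalent by removing any space, underscore, dash, slash,
+ * { or } character from them, and comparing in a case insensitive way.
+ *
+ * This is the same as calling the three-argument overload with
+ * biggerDifferencesAllowed set to true: "height" and "zone" words are
+ * ignored, and "north" is shortened as "n" and "south" as "s".
+ */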
+bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
+    return isEquivalentName(a, b, /* biggerDifferencesAllowed = */ true);
+}
+
+// ---------------------------------------------------------------------------
+
 //! @cond Doxygen_Suppress
 struct PositionalAccuracy::Private {
     std::string value_{};