Add an iterator to tf to decode UTF-8 strings

PixarAnimationStudios · Nov 9, 2023 · 8d074d1 · 8d074d1
1 parent dcbc1ca
commit 8d074d1
Show file tree

Hide file tree

Showing 4 changed files with 621 additions and 0 deletions.
diff --git a/pxr/base/tf/CMakeLists.txt b/pxr/base/tf/CMakeLists.txt
@@ -83,6 +83,7 @@ pxr_library(tf
         type
         typeFunctions
         typeNotice
+        unicodeUtils
         warning
         weakBase
         weakPtr
@@ -383,6 +384,7 @@ pxr_build_test(testTf
         testenv/type.cpp
         testenv/typeMultipleInheritance.cpp
         testenv/typeInfoMap.cpp
+        testenv/unicodeUtils.cpp
         testenv/weakPtr.cpp
 )
 
@@ -646,6 +648,9 @@ pxr_register_test(TfTypeInfoMap
 pxr_register_test(TfType_MultipleInheritance
     COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfType_MultipleInheritance"
 )
+pxr_register_test(TfUnicodeUtils
+    COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfUnicodeUtils"
+)
 pxr_register_test(TfWeakPtr
     COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfWeakPtr"
 )

diff --git a/pxr/base/tf/testenv/unicodeUtils.cpp b/pxr/base/tf/testenv/unicodeUtils.cpp
@@ -0,0 +1,110 @@
+//
+// Copyright 2023 Pixar
+//
+// Licensed under the Apache License, Version 2.0 (the "Apache License")
+// with the following modification; you may not use this file except in
+// compliance with the Apache License and the following modification to it:
+// Section 6. Trademarks. is deleted and replaced with:
+//
+// 6. Trademarks. This License does not grant permission to use the trade
+//    names, trademarks, service marks, or product names of the Licensor
+//    and its affiliates, except as required to comply with Section 4(c) of
+//    the License and to reproduce the content of the NOTICE file.
+//
+// You may obtain a copy of the Apache License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the Apache License with the above modification is
+// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the Apache License for the specific
+// language governing permissions and limitations under the Apache License.
+//
+#include "pxr/pxr.h"
+#include "pxr/base/tf/diagnosticLite.h"
+#include "pxr/base/tf/regTest.h"
+#include "pxr/base/tf/unicodeUtils.h"
+
+#include <string_view>
+
+PXR_NAMESPACE_USING_DIRECTIVE
+
+// Exercises Utf8CodePointView and its iterator: an empty view, the first
+// decoded code point of several multi-byte strings, the number of
+// increments needed to reach the end of each view, and the relationship
+// between the code point iterator and its underlying string_view iterator.
+static bool
+TestUtf8CodePointView()
+{
+    // A default-constructed view holds no code points.
+    {
+        TF_AXIOM(TfUnicodeUtils::Utf8CodePointView{}.empty());
+    }
+
+    // Nine code points, the first of which is U+2148.
+    {
+        const std::string_view text{"ⅈ75_hgòð㤻"};
+        TfUnicodeUtils::Utf8CodePointView view{text};
+        auto cursor = std::cbegin(view);
+        TF_AXIOM(cursor.GetBase() == text.begin());
+        TF_AXIOM(*cursor == 0x2148);
+        std::advance(cursor, 9);
+        TF_AXIOM(cursor == std::cend(view));
+
+        // Every element of the view must decode successfully.
+        for (const uint32_t decoded : view) {
+            TF_AXIOM(decoded != TfUnicodeUtils::INVALID_CODE_POINT);
+        }
+    }
+
+    // Five code points, the first of which is U+393C.
+    {
+        const std::string_view text{"㤼01৪∫"};
+        TfUnicodeUtils::Utf8CodePointView view{text};
+        auto cursor = std::cbegin(view);
+        TF_AXIOM(cursor.GetBase() == text.begin());
+        TF_AXIOM(*cursor == 0x393C);
+        std::advance(cursor, 5);
+        TF_AXIOM(cursor == std::cend(view));
+
+        // Every element of the view must decode successfully.
+        for (const uint32_t decoded : view) {
+            TF_AXIOM(decoded != TfUnicodeUtils::INVALID_CODE_POINT);
+        }
+    }
+
+    // Locate the '-' by inspecting the underlying bytes, then confirm that
+    // an independently advanced iterator lands on the same position.
+    {
+        const std::string_view text{"㤻üaf-∫⁇…🔗"};
+        TfUnicodeUtils::Utf8CodePointView view{text};
+        auto fromStart = std::cbegin(view);
+        auto dashPos = std::cbegin(view);
+
+        // Hand-written search: the view's end is a sentinel, so the classic
+        // iterator-pair std::find_if is not used here (the C++20 ranges
+        // overload accepts sentinels).
+        while (dashPos != std::cend(view) &&
+               *(dashPos.GetBase()) != '-') {
+            ++dashPos;
+        }
+        TF_AXIOM(dashPos != std::cend(view));
+
+        // Four code points precede the '-'; the first is U+393B.
+        TF_AXIOM(*fromStart == 0x393B);
+        std::advance(fromStart, 4);
+        TF_AXIOM(fromStart == dashPos);
+        TF_AXIOM(fromStart.GetBase() == dashPos.GetBase());
+
+        // The '-' itself (U+002D) plus four more code points remain.
+        TF_AXIOM(*dashPos == 0x2D);
+        std::advance(dashPos, 5);
+        TF_AXIOM(dashPos == std::cend(view));
+
+        // Every element of the view must decode successfully.
+        for (const uint32_t decoded : view) {
+            TF_AXIOM(decoded != TfUnicodeUtils::INVALID_CODE_POINT);
+        }
+    }
+
+    return true;
+}
+
+// Test entry point for the TfUnicodeUtils suite; it passes exactly when the
+// Utf8CodePointView checks pass.
+static bool
+Test_TfUnicodeUtils()
+{
+    const bool viewChecksPassed = TestUtf8CodePointView();
+    return viewChecksPassed;
+}
+
+TF_ADD_REGTEST(TfUnicodeUtils);
diff --git a/pxr/base/tf/unicodeUtils.cpp b/pxr/base/tf/unicodeUtils.cpp
@@ -0,0 +1,191 @@
+//
+// Copyright 2023 Pixar
+//
+// Licensed under the Apache License, Version 2.0 (the "Apache License")
+// with the following modification; you may not use this file except in
+// compliance with the Apache License and the following modification to it:
+// Section 6. Trademarks. is deleted and replaced with:
+//
+// 6. Trademarks. This License does not grant permission to use the trade
+//    names, trademarks, service marks, or product names of the Licensor
+//    and its affiliates, except as required to comply with Section 4(c) of
+//    the License and to reproduce the content of the NOTICE file.
+//
+// You may obtain a copy of the Apache License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the Apache License with the above modification is
+// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the Apache License for the specific
+// language governing permissions and limitations under the Apache License.
+//
+
+#include "pxr/base/tf/diagnostic.h"
+#include "pxr/base/tf/unicodeUtils.h"
+
+PXR_NAMESPACE_OPEN_SCOPE
+
+namespace TfUnicodeUtils {
+
+// Decodes the UTF-8 sequence that starts at the current position (_it) and
+// returns the resulting code point.
+//
+// INVALID_CODE_POINT is returned when:
+//   - the encoding length reported by _GetEncodingLength() exceeds the
+//     number of bytes left before _end,
+//   - the encoding length is not 1, 2, 3, or 4,
+//   - the lead byte is not one accepted for its encoding length, or
+//   - any trailing byte falls outside the interval permitted for that
+//     lead byte.
+uint32_t Utf8CodePointIterator::_GetCodePoint() const
+{
+    const _EncodingLength numBytes = this->_GetEncodingLength();
+    if (std::distance(_it, _end) < numBytes) {
+        // decoding would read bytes past the end of the range
+        return INVALID_CODE_POINT;
+    }
+
+    // single byte sequences are returned as-is
+    if (numBytes == 1) {
+        return static_cast<uint32_t>(static_cast<unsigned char>(*_it));
+    }
+
+    // nothing to decode for any length other than 2, 3, or 4; bail out
+    // before touching the underlying bytes
+    if (numBytes != 2 && numBytes != 3 && numBytes != 4) {
+        return INVALID_CODE_POINT;
+    }
+
+    // gather the bytes of the sequence; the distance check above
+    // guarantees that all of them lie inside the range
+    unsigned char seq[4] = {0, 0, 0, 0};
+    auto cursor = _it;
+    for (int i = 0; i < numBytes; ++i, ++cursor) {
+        seq[i] = static_cast<unsigned char>(*cursor);
+    }
+
+    // true when value lies in the closed interval [low, high]
+    const auto inRange = [](const unsigned char value,
+                            const unsigned char low,
+                            const unsigned char high) {
+        return low <= value && value <= high;
+    };
+    // true when value is a generic trailing byte, 80..BF
+    const auto isTrailing = [&inRange](const unsigned char value) {
+        return inRange(value, 0x80, 0xbf);
+    };
+
+    const unsigned char lead = seq[0];
+
+    if (numBytes == 2) {
+        // lead byte must be in range C2..DF, second byte in range 80..BF
+        if (!inRange(lead, 0xc2, 0xdf) || !isTrailing(seq[1])) {
+            return INVALID_CODE_POINT;
+        }
+
+        // low 5 bits of the lead byte followed by the low 6 bits of the
+        // second byte
+        return ((lead & 0x1f) << 6) + (seq[1] & 0x3f);
+    }
+
+    // interval allowed for the second byte; it is narrowed below for the
+    // lead bytes that require it
+    unsigned char secondLow = 0x80;
+    unsigned char secondHigh = 0xbf;
+
+    if (numBytes == 3) {
+        if (lead == 0xe0) {
+            // second byte must be in range A0..BF
+            secondLow = 0xa0;
+        }
+        else if (lead == 0xed) {
+            // second byte must be in range 80..9F
+            secondHigh = 0x9f;
+        }
+        else if (!inRange(lead, 0xe1, 0xef)) {
+            // lead byte is not one of E0..EF
+            return INVALID_CODE_POINT;
+        }
+
+        // third byte must be in range 80..BF
+        if (!inRange(seq[1], secondLow, secondHigh) ||
+            !isTrailing(seq[2])) {
+            return INVALID_CODE_POINT;
+        }
+
+        // low 4 bits of the lead byte followed by the low 6 bits of the
+        // second and third bytes
+        return ((lead & 0xf) << 12) + ((seq[1] & 0x3f) << 6) +
+                (seq[2] & 0x3f);
+    }
+
+    // only the four byte case remains
+    if (lead == 0xf0) {
+        // second byte must be in range 90..BF
+        secondLow = 0x90;
+    }
+    else if (lead == 0xf4) {
+        // second byte must be in range 80..8F
+        secondHigh = 0x8f;
+    }
+    else if (!inRange(lead, 0xf1, 0xf3)) {
+        // lead byte is not one of F0..F4
+        return INVALID_CODE_POINT;
+    }
+
+    // third and fourth bytes must be in range 80..BF
+    if (!inRange(seq[1], secondLow, secondHigh) ||
+        !isTrailing(seq[2]) ||
+        !isTrailing(seq[3])) {
+        return INVALID_CODE_POINT;
+    }
+
+    // low 3 bits of the lead byte followed by the low 6 bits of the
+    // second, third, and fourth bytes
+    return ((lead & 0x7) << 18) + ((seq[1] & 0x3f) << 12) +
+           ((seq[2] & 0x3f) << 6) + (seq[3] & 0x3f);
+}
+} // end TfUnicodeUtils
+
+PXR_NAMESPACE_CLOSE_SCOPE