Add an iterator to tf to decode UTF-8 strings

PixarAnimationStudios · Nov 6, 2023 · cf438d6 · cf438d6
1 parent ceb3699
commit cf438d6
Show file tree

Hide file tree

Showing 4 changed files with 517 additions and 0 deletions.
diff --git a/pxr/base/tf/CMakeLists.txt b/pxr/base/tf/CMakeLists.txt
@@ -83,6 +83,7 @@ pxr_library(tf
         type
         typeFunctions
         typeNotice
+        unicodeUtils
         warning
         weakBase
         weakPtr
@@ -383,6 +384,7 @@ pxr_build_test(testTf
         testenv/type.cpp
         testenv/typeMultipleInheritance.cpp
         testenv/typeInfoMap.cpp
+        testenv/unicodeUtils.cpp
         testenv/weakPtr.cpp
 )
 
@@ -646,6 +648,9 @@ pxr_register_test(TfTypeInfoMap
 pxr_register_test(TfType_MultipleInheritance
     COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfType_MultipleInheritance"
 )
+pxr_register_test(TfUnicodeUtils
+    COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfUnicodeUtils"
+)
 pxr_register_test(TfWeakPtr
     COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfWeakPtr"
 )

diff --git a/pxr/base/tf/testenv/unicodeUtils.cpp b/pxr/base/tf/testenv/unicodeUtils.cpp
@@ -0,0 +1,75 @@
+//
+// Copyright 2023 Pixar
+//
+// Licensed under the Apache License, Version 2.0 (the "Apache License")
+// with the following modification; you may not use this file except in
+// compliance with the Apache License and the following modification to it:
+// Section 6. Trademarks. is deleted and replaced with:
+//
+// 6. Trademarks. This License does not grant permission to use the trade
+//    names, trademarks, service marks, or product names of the Licensor
+//    and its affiliates, except as required to comply with Section 4(c) of
+//    the License and to reproduce the content of the NOTICE file.
+//
+// You may obtain a copy of the Apache License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the Apache License with the above modification is
+// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the Apache License for the specific
+// language governing permissions and limitations under the Apache License.
+//
+#include "pxr/pxr.h"
+#include "pxr/base/tf/diagnosticLite.h"
+#include "pxr/base/tf/regTest.h"
+#include "pxr/base/tf/unicodeUtils.h"
+
+#include <string_view>
+
+PXR_NAMESPACE_USING_DIRECTIVE
+
+static bool
+TestUtf8Iterator()
+{
+    // Walk mixed-width UTF-8 text; check decoded values and end positions
+    const std::string_view s1 = "ⅈ75_hgòð㤻";
+    const std::string_view s2 = "㤼01৪∫";
+    const std::string_view s3 = "㤻üaf-∫⁇…🔗";
+    // i1/i2 cover whole strings; i3/i4 split s3 at its first "-"
+    TfUnicodeUtils::Utf8ConstIterator i1(s1);
+    TfUnicodeUtils::Utf8ConstIterator i2(s2);
+    TfUnicodeUtils::Utf8ConstIterator i3(s3.begin(),
+                                         s3.begin() + s3.find("-"));
+    TfUnicodeUtils::Utf8ConstIterator i4(s3.begin() + s3.find("-"), s3.end());
+    // s1 is expected to start at U+2148 (8520) and end after 9 code points
+    TF_AXIOM(i1.GetBase() == s1.begin());
+    TF_AXIOM(*i1 == 8520);
+    std::advance(i1, 9);
+    TF_AXIOM(i1.GetBase() == s1.end());
+    // s2 is expected to start at U+393C (14652) and end after 5 code points
+    TF_AXIOM(*i2 == 14652);
+    std::advance(i2, 5);
+    TF_AXIOM(i2.GetBase() == s2.end());
+
+    // i3 covers the 4 code points before the "-", ending where i4 begins
+    TF_AXIOM(*i3 == 14651);
+    std::advance(i3, 4);
+    TF_AXIOM(i3.GetBase() == i4.GetBase());
+
+    // i4 starts on the "-" itself (45) and has 5 code points up to s3.end()
+    TF_AXIOM(*i4 == 45);
+    std::advance(i4, 5);
+    TF_AXIOM(i4.GetBase() == s3.end());
+
+    return true;
+}
+// Test entry point; its result is whatever TestUtf8Iterator() returns.
+static bool
+Test_TfUnicodeUtils()
+{
+    return TestUtf8Iterator();
+}
+// NOTE(review): presumably registers Test_TfUnicodeUtils; confirm in regTest.h
+TF_ADD_REGTEST(TfUnicodeUtils);
diff --git a/pxr/base/tf/unicodeUtils.cpp b/pxr/base/tf/unicodeUtils.cpp
@@ -0,0 +1,208 @@
+//
+// Copyright 2023 Pixar
+//
+// Licensed under the Apache License, Version 2.0 (the "Apache License")
+// with the following modification; you may not use this file except in
+// compliance with the Apache License and the following modification to it:
+// Section 6. Trademarks. is deleted and replaced with:
+//
+// 6. Trademarks. This License does not grant permission to use the trade
+//    names, trademarks, service marks, or product names of the Licensor
+//    and its affiliates, except as required to comply with Section 4(c) of
+//    the License and to reproduce the content of the NOTICE file.
+//
+// You may obtain a copy of the Apache License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the Apache License with the above modification is
+// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the Apache License for the specific
+// language governing permissions and limitations under the Apache License.
+//
+
+#include "pxr/base/tf/diagnostic.h"
+#include "pxr/base/tf/unicodeUtils.h"
+
+PXR_NAMESPACE_OPEN_SCOPE
+
+namespace TfUnicodeUtils {
+// Returns the code point encoded at _it, or nullopt if it cannot be decoded.
+std::optional<uint32_t> Utf8ConstIterator::_GetCodePoint() const
+{
+    auto begin = _it;
+    if (begin >= _end)
+    {
+        // nothing is left to decode at or past _end, so there is no code point
+        return std::nullopt;
+    }
+
+    // the encoding length (in bytes) selects the decoding branch below
+    _EncodingLength encodingLength = this->_GetEncodingLength();
+    if (encodingLength == 1)
+    {
+        return static_cast<uint32_t>(static_cast<unsigned char>(*begin));
+    }
+    else if (encodingLength == 2)
+    {
+        if (std::distance(begin, _end) < encodingLength)
+        {
+            return std::nullopt; // sequence is cut off by _end
+        }
+
+        unsigned char byte1 = static_cast<unsigned char>(*begin);
+        unsigned char byte2 = static_cast<unsigned char>(*(++begin));
+
+        // valid only if byte1 is in C2..DF and byte2 is in 80..BF
+        if (byte1 < static_cast<unsigned char>('\xc2') ||
+            byte1 > static_cast<unsigned char>('\xdf'))
+        {
+            return std::nullopt;
+        }
+        if (byte2 < static_cast<unsigned char>('\x80') ||
+            byte2 > static_cast<unsigned char>('\xbf'))
+        {
+            return std::nullopt;
+        }
+
+        // the code point is the low 5 bits of byte1 followed by
+        // the low 6 bits of byte2
+        return ((byte1 & 0x1f) << 6) + (byte2 & 0x3f);
+    }
+    else if (encodingLength == 3)
+    {
+        if (std::distance(begin, _end) < encodingLength)
+        {
+            return std::nullopt; // sequence is cut off by _end
+        }
+
+        unsigned char byte1 = static_cast<unsigned char>(*begin);
+        unsigned char byte2 = static_cast<unsigned char>(*(++begin));
+        unsigned char byte3 = static_cast<unsigned char>(*(++begin));
+
+        // byte2's accepted range depends on byte1; byte3 is always 80..BF
+        if (byte1 == static_cast<unsigned char>('\xe0'))
+        {
+            // byte2 must be in range A0..BF (80..9F would decode below U+0800)
+            // byte3 must be in range 80..BF
+            if (byte2 < static_cast<unsigned char>('\xa0') ||
+                byte2 > static_cast<unsigned char>('\xbf') ||
+                byte3 < static_cast<unsigned char>('\x80') ||
+                byte3 > static_cast<unsigned char>('\xbf'))
+            {
+                return std::nullopt;
+            }
+        }
+        else if ((byte1 >= static_cast<unsigned char>('\xe1') &&
+                    byte1 <= static_cast<unsigned char>('\xec')) ||
+                    byte1 == static_cast<unsigned char>('\xee') ||
+                    byte1 == static_cast<unsigned char>('\xef'))
+        {
+            // byte2 must be in range 80..BF
+            // byte3 must be in range 80..BF
+            if (byte2 < static_cast<unsigned char>('\x80') ||
+                byte2 > static_cast<unsigned char>('\xbf') ||
+                byte3 < static_cast<unsigned char>('\x80') ||
+                byte3 > static_cast<unsigned char>('\xbf'))
+            {
+                return std::nullopt;
+            }
+        }
+        else if (byte1 == static_cast<unsigned char>('\xed'))
+        {
+            // byte2 must be in range 80..9F (A0..BF would give U+D800..U+DFFF)
+            // byte3 must be in range 80..BF
+            if (byte2 < static_cast<unsigned char>('\x80') ||
+                byte2 > static_cast<unsigned char>('\x9f') ||
+                byte3 < static_cast<unsigned char>('\x80') ||
+                byte3 > static_cast<unsigned char>('\xbf'))
+            {
+                return std::nullopt;
+            }
+        }
+        else
+        {
+            // byte1 is outside E0..EF
+            return std::nullopt;
+        }
+
+        // code point is constructed from the low 4 bits of byte1
+        // and the low 6 bits of bytes 2 and 3
+        return ((byte1 & 0xf) << 12) + ((byte2 & 0x3f) << 6) +
+                (byte3 & 0x3f);
+    }
+    else if (encodingLength == 4)
+    {
+        if (std::distance(begin, _end) < encodingLength)
+        {
+            return std::nullopt; // sequence is cut off by _end
+        }
+
+        unsigned char byte1 = static_cast<unsigned char>(*begin);
+        unsigned char byte2 = static_cast<unsigned char>(*(++begin));
+        unsigned char byte3 = static_cast<unsigned char>(*(++begin));
+        unsigned char byte4 = static_cast<unsigned char>(*(++begin));
+        // byte2's accepted range depends on byte1; bytes 3 and 4 are 80..BF
+        if (byte1 == static_cast<unsigned char>('\xf0'))
+        {
+            // byte2 must be in range 90..BF (80..8F would decode below U+10000)
+            // byte3 must be in range 80..BF
+            // byte4 must be in range 80..BF
+            if (byte2 < static_cast<unsigned char>('\x90') ||
+                byte2 > static_cast<unsigned char>('\xbf') ||
+                byte3 < static_cast<unsigned char>('\x80') ||
+                byte3 > static_cast<unsigned char>('\xbf') ||
+                byte4 < static_cast<unsigned char>('\x80') ||
+                byte4 > static_cast<unsigned char>('\xbf'))
+            {
+                return std::nullopt;
+            }
+        }
+        else if (byte1 >= static_cast<unsigned char>('\xf1') &&
+                 byte1 <= static_cast<unsigned char>('\xf3'))
+        {
+            // byte2 must be in range 80..BF
+            // byte3 must be in range 80..BF
+            // byte4 must be in range 80..BF
+            if (byte2 < static_cast<unsigned char>('\x80') ||
+                byte2 > static_cast<unsigned char>('\xbf') ||
+                byte3 < static_cast<unsigned char>('\x80') ||
+                byte3 > static_cast<unsigned char>('\xbf') ||
+                byte4 < static_cast<unsigned char>('\x80') ||
+                byte4 > static_cast<unsigned char>('\xbf'))
+            {
+                return std::nullopt;
+            }
+        }
+        else if (byte1 == static_cast<unsigned char>('\xf4'))
+        {
+            // byte2 must be in range 80..8F (90..BF would decode above U+10FFFF)
+            // byte3 must be in range 80..BF
+            // byte4 must be in range 80..BF
+            if (byte2 < static_cast<unsigned char>('\x80') ||
+                byte2 > static_cast<unsigned char>('\x8f') ||
+                byte3 < static_cast<unsigned char>('\x80') ||
+                byte3 > static_cast<unsigned char>('\xbf') ||
+                byte4 < static_cast<unsigned char>('\x80') ||
+                byte4 > static_cast<unsigned char>('\xbf'))
+            {
+                return std::nullopt;
+            }
+        }
+        else
+        {
+            // byte1 is outside F0..F4
+            return std::nullopt;
+        }
+
+        // code point is constructed from the low 3 bits of byte 1
+        // and the low 6 bits of bytes 2, 3, and 4
+        return ((byte1 & 0x7) << 18) + ((byte2 & 0x3f) << 12) +
+               ((byte3 & 0x3f) << 6) + (byte4 & 0x3f);
+    }
+    return std::nullopt; // encodingLength was not 1, 2, 3, or 4
+}
+} // end TfUnicodeUtils
+
+PXR_NAMESPACE_CLOSE_SCOPE