Skip to content

Commit

Permalink
Add an iterator to tf to decode UTF-8 strings
Browse files Browse the repository at this point in the history
  • Loading branch information
erslavin authored and nvmkuruc committed Nov 6, 2023
1 parent ceb3699 commit cf438d6
Show file tree
Hide file tree
Showing 4 changed files with 517 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pxr/base/tf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ pxr_library(tf
type
typeFunctions
typeNotice
unicodeUtils
warning
weakBase
weakPtr
Expand Down Expand Up @@ -383,6 +384,7 @@ pxr_build_test(testTf
testenv/type.cpp
testenv/typeMultipleInheritance.cpp
testenv/typeInfoMap.cpp
testenv/unicodeUtils.cpp
testenv/weakPtr.cpp
)

Expand Down Expand Up @@ -646,6 +648,9 @@ pxr_register_test(TfTypeInfoMap
pxr_register_test(TfType_MultipleInheritance
COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfType_MultipleInheritance"
)
pxr_register_test(TfUnicodeUtils
COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfUnicodeUtils"
)
pxr_register_test(TfWeakPtr
COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfWeakPtr"
)
Expand Down
75 changes: 75 additions & 0 deletions pxr/base/tf/testenv/unicodeUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//
// Copyright 2023 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
// names, trademarks, service marks, or product names of the Licensor
// and its affiliates, except as required to comply with Section 4(c) of
// the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.
//
#include "pxr/pxr.h"
#include "pxr/base/tf/diagnosticLite.h"
#include "pxr/base/tf/regTest.h"
#include "pxr/base/tf/unicodeUtils.h"

#include <string_view>

PXR_NAMESPACE_USING_DIRECTIVE

static bool
TestUtf8Iterator()
{
// Exercise the iterator converting from UTF-8 char to code point
const std::string_view s1 = "ⅈ75_hgòð㤻";
const std::string_view s2 = "㤼01৪∫";
const std::string_view s3 = "㤻üaf-∫⁇…🔗";

TfUnicodeUtils::Utf8ConstIterator i1(s1);
TfUnicodeUtils::Utf8ConstIterator i2(s2);
TfUnicodeUtils::Utf8ConstIterator i3(s3.begin(),
s3.begin() + s3.find("-"));
TfUnicodeUtils::Utf8ConstIterator i4(s3.begin() + s3.find("-"), s3.end());

TF_AXIOM(i1.GetBase() == s1.begin());
TF_AXIOM(*i1 == 8520);
std::advance(i1, 9);
TF_AXIOM(i1.GetBase() == s1.end());

TF_AXIOM(*i2 == 14652);
std::advance(i2, 5);
TF_AXIOM(i2.GetBase() == s2.end());

// i3 should contain all characters before the "-"
TF_AXIOM(*i3 == 14651);
std::advance(i3, 4);
TF_AXIOM(i3.GetBase() == i4.GetBase());

// i4 should include the "-" character
TF_AXIOM(*i4 == 45);
std::advance(i4, 5);
TF_AXIOM(i4.GetBase() == s3.end());

return true;
}

// Entry point for the TfUnicodeUtils regression test; succeeds when the
// UTF-8 iterator checks succeed.
static bool
Test_TfUnicodeUtils()
{
    const bool iteratorChecksPassed = TestUtf8Iterator();
    return iteratorChecksPassed;
}

// Registers this test under the name "TfUnicodeUtils".
// NOTE(review): presumably the macro binds that name to Test_TfUnicodeUtils()
// above so the test driver can run it by name -- confirm against regTest.h.
TF_ADD_REGTEST(TfUnicodeUtils);
208 changes: 208 additions & 0 deletions pxr/base/tf/unicodeUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
//
// Copyright 2023 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
// names, trademarks, service marks, or product names of the Licensor
// and its affiliates, except as required to comply with Section 4(c) of
// the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.
//

#include "pxr/base/tf/diagnostic.h"
#include "pxr/base/tf/unicodeUtils.h"

PXR_NAMESPACE_OPEN_SCOPE

namespace TfUnicodeUtils {

std::optional<uint32_t> Utf8ConstIterator::_GetCodePoint() const
{
auto begin = _it;
if (begin >= _end)
{
// error condition, we can't advance and the code point is invalid
return std::nullopt;
}

// determine what encoding length the character is
_EncodingLength encodingLength = this->_GetEncodingLength();
if (encodingLength == 1)
{
return static_cast<uint32_t>(static_cast<unsigned char>(*begin));
}
else if (encodingLength == 2)
{
if (std::distance(begin, _end) < encodingLength)
{
return std::nullopt;
}

unsigned char byte1 = static_cast<unsigned char>(*begin);
unsigned char byte2 = static_cast<unsigned char>(*(++begin));

// ensure the ranges we expect, or it's not a valid character
if (byte1 < static_cast<unsigned char>('\xc2') ||
byte1 > static_cast<unsigned char>('\xdf'))
{
return std::nullopt;
}
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\xbf'))
{
return std::nullopt;
}

// the code point is constructed from the last 5 bits of byte1
// and the last 6 bits of byte2
return ((byte1 & 0x1f) << 6) + (byte2 & 0x3f);
}
else if (encodingLength == 3)
{
if (std::distance(begin, _end) < encodingLength)
{
return std::nullopt;
}

unsigned char byte1 = static_cast<unsigned char>(*begin);
unsigned char byte2 = static_cast<unsigned char>(*(++begin));
unsigned char byte3 = static_cast<unsigned char>(*(++begin));

// ensure the ranges we expect, or it's not a valid character
if (byte1 == static_cast<unsigned char>('\xe0'))
{
// byte2 must be in range A0..BF
// byte3 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\xa0') ||
byte2 > static_cast<unsigned char>('\xbf') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return std::nullopt;
}
}
else if ((byte1 >= static_cast<unsigned char>('\xe1') &&
byte1 <= static_cast<unsigned char>('\xec')) ||
byte1 == static_cast<unsigned char>('\xee') ||
byte1 == static_cast<unsigned char>('\xef'))
{
// byte2 must be in range 80..BF
// byte3 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\xbf') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return std::nullopt;
}
}
else if (byte1 == static_cast<unsigned char>('\xed'))
{
// byte2 must be in range 80..9F
// byte3 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\x9f') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return std::nullopt;
}
}
else
{
// byte 1 invalid
return std::nullopt;
}

// code point is constructed from the last 4 bits of byte1
// and the last 6 bits of bytes 2 and 3
return ((byte1 & 0xf) << 12) + ((byte2 & 0x3f) << 6) +
(byte3 & 0x3f);
}
else if (encodingLength == 4)
{
if (std::distance(begin, _end) < encodingLength)
{
return std::nullopt;
}

unsigned char byte1 = static_cast<unsigned char>(*begin);
unsigned char byte2 = static_cast<unsigned char>(*(++begin));
unsigned char byte3 = static_cast<unsigned char>(*(++begin));
unsigned char byte4 = static_cast<unsigned char>(*(++begin));

if (byte1 == static_cast<unsigned char>('\xf0'))
{
// byte2 must be in range 90..BF
// byte3 must be in range 80..BF
// byte4 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x90') ||
byte2 > static_cast<unsigned char>('\xbf') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf') ||
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return std::nullopt;
}
}
else if (byte1 >= static_cast<unsigned char>('\xf1') &&
byte1 <= static_cast<unsigned char>('\xf3'))
{
// byte2 must be in range 80..BF
// byte3 must be in range 80..BF
// byte4 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\xbf') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf') ||
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return std::nullopt;
}
}
else if (byte1 == static_cast<unsigned char>('\xf4'))
{
// byte2 must be in range 80..8F
// byte3 must be in range 80..BF
// byte4 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\x8f') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf') ||
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return std::nullopt;
}
}
else
{
// byte 1 is invalid
return std::nullopt;
}

// code point is constructed from the last 3 bits of byte 1
// and the last 6 bits of bytes 2, 3, and 4
return ((byte1 & 0x7) << 18) + ((byte2 & 0x3f) << 12) +
((byte3 & 0x3f) << 6) + (byte4 & 0x3f);
}
return std::nullopt;
}
} // end TfUnicodeUtils

PXR_NAMESPACE_CLOSE_SCOPE
Loading

0 comments on commit cf438d6

Please sign in to comment.