-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add an iterator to tf to decode UTF-8 strings
- Loading branch information
Showing
4 changed files
with
517 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
// | ||
// Copyright 2023 Pixar | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "Apache License") | ||
// with the following modification; you may not use this file except in | ||
// compliance with the Apache License and the following modification to it: | ||
// Section 6. Trademarks. is deleted and replaced with: | ||
// | ||
// 6. Trademarks. This License does not grant permission to use the trade | ||
// names, trademarks, service marks, or product names of the Licensor | ||
// and its affiliates, except as required to comply with Section 4(c) of | ||
// the License and to reproduce the content of the NOTICE file. | ||
// | ||
// You may obtain a copy of the Apache License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the Apache License with the above modification is | ||
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the Apache License for the specific | ||
// language governing permissions and limitations under the Apache License. | ||
// | ||
#include "pxr/pxr.h" | ||
#include "pxr/base/tf/diagnosticLite.h" | ||
#include "pxr/base/tf/regTest.h" | ||
#include "pxr/base/tf/unicodeUtils.h" | ||
|
||
#include <string_view> | ||
|
||
PXR_NAMESPACE_USING_DIRECTIVE | ||
|
||
static bool | ||
TestUtf8Iterator() | ||
{ | ||
// Exercise the iterator converting from UTF-8 char to code point | ||
const std::string_view s1 = "ⅈ75_hgòð㤻"; | ||
const std::string_view s2 = "㤼01৪∫"; | ||
const std::string_view s3 = "㤻üaf-∫⁇…🔗"; | ||
|
||
TfUnicodeUtils::Utf8ConstIterator i1(s1); | ||
TfUnicodeUtils::Utf8ConstIterator i2(s2); | ||
TfUnicodeUtils::Utf8ConstIterator i3(s3.begin(), | ||
s3.begin() + s3.find("-")); | ||
TfUnicodeUtils::Utf8ConstIterator i4(s3.begin() + s3.find("-"), s3.end()); | ||
|
||
TF_AXIOM(i1.GetBase() == s1.begin()); | ||
TF_AXIOM(*i1 == 8520); | ||
std::advance(i1, 9); | ||
TF_AXIOM(i1.GetBase() == s1.end()); | ||
|
||
TF_AXIOM(*i2 == 14652); | ||
std::advance(i2, 5); | ||
TF_AXIOM(i2.GetBase() == s2.end()); | ||
|
||
// i3 should contain all characters before the "-" | ||
TF_AXIOM(*i3 == 14651); | ||
std::advance(i3, 4); | ||
TF_AXIOM(i3.GetBase() == i4.GetBase()); | ||
|
||
// i4 should include the "-" character | ||
TF_AXIOM(*i4 == 45); | ||
std::advance(i4, 5); | ||
TF_AXIOM(i4.GetBase() == s3.end()); | ||
|
||
return true; | ||
} | ||
|
||
static bool | ||
Test_TfUnicodeUtils() | ||
{ | ||
return TestUtf8Iterator(); | ||
} | ||
|
||
TF_ADD_REGTEST(TfUnicodeUtils); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
// | ||
// Copyright 2023 Pixar | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "Apache License") | ||
// with the following modification; you may not use this file except in | ||
// compliance with the Apache License and the following modification to it: | ||
// Section 6. Trademarks. is deleted and replaced with: | ||
// | ||
// 6. Trademarks. This License does not grant permission to use the trade | ||
// names, trademarks, service marks, or product names of the Licensor | ||
// and its affiliates, except as required to comply with Section 4(c) of | ||
// the License and to reproduce the content of the NOTICE file. | ||
// | ||
// You may obtain a copy of the Apache License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the Apache License with the above modification is | ||
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the Apache License for the specific | ||
// language governing permissions and limitations under the Apache License. | ||
// | ||
|
||
#include "pxr/base/tf/diagnostic.h" | ||
#include "pxr/base/tf/unicodeUtils.h" | ||
|
||
PXR_NAMESPACE_OPEN_SCOPE | ||
|
||
namespace TfUnicodeUtils { | ||
|
||
std::optional<uint32_t> Utf8ConstIterator::_GetCodePoint() const | ||
{ | ||
auto begin = _it; | ||
if (begin >= _end) | ||
{ | ||
// error condition, we can't advance and the code point is invalid | ||
return std::nullopt; | ||
} | ||
|
||
// determine what encoding length the character is | ||
_EncodingLength encodingLength = this->_GetEncodingLength(); | ||
if (encodingLength == 1) | ||
{ | ||
return static_cast<uint32_t>(static_cast<unsigned char>(*begin)); | ||
} | ||
else if (encodingLength == 2) | ||
{ | ||
if (std::distance(begin, _end) < encodingLength) | ||
{ | ||
return std::nullopt; | ||
} | ||
|
||
unsigned char byte1 = static_cast<unsigned char>(*begin); | ||
unsigned char byte2 = static_cast<unsigned char>(*(++begin)); | ||
|
||
// ensure the ranges we expect, or it's not a valid character | ||
if (byte1 < static_cast<unsigned char>('\xc2') || | ||
byte1 > static_cast<unsigned char>('\xdf')) | ||
{ | ||
return std::nullopt; | ||
} | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return std::nullopt; | ||
} | ||
|
||
// the code point is constructed from the last 5 bits of byte1 | ||
// and the last 6 bits of byte2 | ||
return ((byte1 & 0x1f) << 6) + (byte2 & 0x3f); | ||
} | ||
else if (encodingLength == 3) | ||
{ | ||
if (std::distance(begin, _end) < encodingLength) | ||
{ | ||
return std::nullopt; | ||
} | ||
|
||
unsigned char byte1 = static_cast<unsigned char>(*begin); | ||
unsigned char byte2 = static_cast<unsigned char>(*(++begin)); | ||
unsigned char byte3 = static_cast<unsigned char>(*(++begin)); | ||
|
||
// ensure the ranges we expect, or it's not a valid character | ||
if (byte1 == static_cast<unsigned char>('\xe0')) | ||
{ | ||
// byte2 must be in range A0..BF | ||
// byte3 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\xa0') || | ||
byte2 > static_cast<unsigned char>('\xbf') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return std::nullopt; | ||
} | ||
} | ||
else if ((byte1 >= static_cast<unsigned char>('\xe1') && | ||
byte1 <= static_cast<unsigned char>('\xec')) || | ||
byte1 == static_cast<unsigned char>('\xee') || | ||
byte1 == static_cast<unsigned char>('\xef')) | ||
{ | ||
// byte2 must be in range 80..BF | ||
// byte3 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\xbf') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return std::nullopt; | ||
} | ||
} | ||
else if (byte1 == static_cast<unsigned char>('\xed')) | ||
{ | ||
// byte2 must be in range 80..9F | ||
// byte3 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\x9f') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return std::nullopt; | ||
} | ||
} | ||
else | ||
{ | ||
// byte 1 invalid | ||
return std::nullopt; | ||
} | ||
|
||
// code point is constructed from the last 4 bits of byte1 | ||
// and the last 6 bits of bytes 2 and 3 | ||
return ((byte1 & 0xf) << 12) + ((byte2 & 0x3f) << 6) + | ||
(byte3 & 0x3f); | ||
} | ||
else if (encodingLength == 4) | ||
{ | ||
if (std::distance(begin, _end) < encodingLength) | ||
{ | ||
return std::nullopt; | ||
} | ||
|
||
unsigned char byte1 = static_cast<unsigned char>(*begin); | ||
unsigned char byte2 = static_cast<unsigned char>(*(++begin)); | ||
unsigned char byte3 = static_cast<unsigned char>(*(++begin)); | ||
unsigned char byte4 = static_cast<unsigned char>(*(++begin)); | ||
|
||
if (byte1 == static_cast<unsigned char>('\xf0')) | ||
{ | ||
// byte2 must be in range 90..BF | ||
// byte3 must be in range 80..BF | ||
// byte4 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x90') || | ||
byte2 > static_cast<unsigned char>('\xbf') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf') || | ||
byte4 < static_cast<unsigned char>('\x80') || | ||
byte4 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return std::nullopt; | ||
} | ||
} | ||
else if (byte1 >= static_cast<unsigned char>('\xf1') && | ||
byte1 <= static_cast<unsigned char>('\xf3')) | ||
{ | ||
// byte2 must be in range 80..BF | ||
// byte3 must be in range 80..BF | ||
// byte4 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\xbf') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf') || | ||
byte4 < static_cast<unsigned char>('\x80') || | ||
byte4 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return std::nullopt; | ||
} | ||
} | ||
else if (byte1 == static_cast<unsigned char>('\xf4')) | ||
{ | ||
// byte2 must be in range 80..8F | ||
// byte3 must be in range 80..BF | ||
// byte4 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\x8f') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf') || | ||
byte4 < static_cast<unsigned char>('\x80') || | ||
byte4 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return std::nullopt; | ||
} | ||
} | ||
else | ||
{ | ||
// byte 1 is invalid | ||
return std::nullopt; | ||
} | ||
|
||
// code point is constructed from the last 3 bits of byte 1 | ||
// and the last 6 bits of bytes 2, 3, and 4 | ||
return ((byte1 & 0x7) << 18) + ((byte2 & 0x3f) << 12) + | ||
((byte3 & 0x3f) << 6) + (byte4 & 0x3f); | ||
} | ||
return std::nullopt; | ||
} | ||
} // end TfUnicodeUtils | ||
|
||
PXR_NAMESPACE_CLOSE_SCOPE |
Oops, something went wrong.