Skip to content

Commit

Permalink
Support for UTF-8 chars in Identifiers
Browse files Browse the repository at this point in the history
- Modified path parser to accept UTF-8 characters for Identifiers
- Modified identifier validity rules to accept valid UTF-8 identifiers
  (XID_Start followed by XID_Continue)
- Added UTF-8 utility functions to tf
- Added tests for UTF-8 based paths
- Added reference UnicodeDatabase.txt for character classes
  • Loading branch information
erslavin committed Sep 13, 2023
1 parent 7235163 commit 58a29e2
Show file tree
Hide file tree
Showing 14 changed files with 35,781 additions and 113 deletions.
3 changes: 3 additions & 0 deletions pxr/base/tf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ pxr_library(tf
type
typeFunctions
typeNotice
unicodeUtilsImpl
warning
weakBase
weakPtr
Expand Down Expand Up @@ -146,6 +147,7 @@ pxr_library(tf
staticTokens.h
typeInfoMap.h
type_Impl.h
unicodeUtils.h

PYTHON_PUBLIC_HEADERS
py3Compat.h
Expand All @@ -157,6 +159,7 @@ pxr_library(tf
atomicRenameUtil
debugCodes
noticeRegistry
unicodeCharacterClasses

PYTHON_PRIVATE_CLASSES
pyErrorInternal
Expand Down
29 changes: 1 addition & 28 deletions pxr/base/tf/stringUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1170,34 +1170,7 @@ TfStringCatPaths( const string &prefix, const string &suffix )
std::string
TfMakeValidIdentifier(const std::string &in)
{
// NOTE(review): this is a diff view rendered without +/- markers. Per the
// change summary for this file (1 addition, 28 deletions), the ASCII-only
// code from `std::string result;` through `return result;` is the removed
// implementation, and the final `return TfUnicodeUtils::...` line is its
// replacement.
std::string result;

// An empty input yields the single-character identifier "_".
if (in.empty()) {
result.push_back('_');
return result;
}

result.reserve(in.size());
char const *p = in.c_str();
// First character: only an ASCII letter or '_' is kept; anything else
// (including a digit) is replaced by '_'.
if (!(('a' <= *p && *p <= 'z') ||
('A' <= *p && *p <= 'Z') ||
*p == '_')) {
result.push_back('_');
} else {
result.push_back(*p);
}

// Remaining characters, up to the NUL terminator: ASCII letters, digits and
// '_' are kept; every other byte is replaced by '_'.
for (++p; *p; ++p) {
if (!(('a' <= *p && *p <= 'z') ||
('A' <= *p && *p <= 'Z') ||
('0' <= *p && *p <= '9') ||
*p == '_')) {
result.push_back('_');
} else {
result.push_back(*p);
}
}
return result;
// Added line: delegate to the UTF-8 aware implementation.
return TfUnicodeUtils::MakeValidUTF8Identifier(in);
}

std::string
Expand Down
14 changes: 2 additions & 12 deletions pxr/base/tf/stringUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "pxr/base/arch/inttypes.h"
#include "pxr/base/tf/api.h"
#include "pxr/base/tf/enum.h"
#include "pxr/base/tf/unicodeUtils.h"

#include <cstdarg>
#include <cstring>
Expand Down Expand Up @@ -695,18 +696,7 @@ std::string TfStringCatPaths( const std::string &prefix,
inline bool
TfIsValidIdentifier(std::string const &identifier)
{
// NOTE(review): this is a diff view rendered without +/- markers. Per the
// change summary for this file (2 additions, 12 deletions), the ASCII-only
// code from `char const *p` through `return x == 0;` is the removed
// implementation, and the final `return TfUnicodeUtils::...` line is its
// replacement.
char const *p = identifier.c_str();
// The range checks rely on unsigned wraparound: a value below the lower
// bound wraps to a large number, so one comparison covers both bounds.
auto letter = [](unsigned c) { return ((c-'A') < 26) || ((c-'a') < 26); };
auto number = [](unsigned c) { return (c-'0') < 10; };
auto under = [](unsigned c) { return c == '_'; };
unsigned x = *p;
// Reject the empty string and identifiers that start with a digit.
if (!x || number(x)) {
return false;
}
// Consume characters while they are letters, digits or '_'. The first
// iteration re-reads the first character because p has not advanced yet.
while (letter(x) || number(x) || under(x)) {
x = *p++;
};
// Valid only if the scan stopped at the NUL terminator.
return x == 0;
// Added line: delegate to the UTF-8 aware check over the whole string.
return TfUnicodeUtils::IsValidUTF8Identifier(identifier.begin(), identifier.end());
}

/// Produce a valid identifier (see TfIsValidIdentifier) from \p in by
Expand Down
34,626 changes: 34,626 additions & 0 deletions pxr/base/tf/unicode/UnicodeDatabase.txt

Large diffs are not rendered by default.

193 changes: 193 additions & 0 deletions pxr/base/tf/unicode/tfGenCharacterClasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
#!/usr/bin/env python
#
# Copyright 2016 Pixar
#
# Licensed under the Apache License, Version 2.0 (the "Apache License")
# with the following modification; you may not use this file except in
# compliance with the Apache License and the following modification to it:
# Section 6. Trademarks. is deleted and replaced with:
#
# 6. Trademarks. This License does not grant permission to use the trade
# names, trademarks, service marks, or product names of the Licensor
# and its affiliates, except as required to comply with Section 4(c) of
# the License and to reproduce the content of the NOTICE file.
#
# You may obtain a copy of the Apache License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the Apache License with the above modification is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the Apache License for the specific
# language governing permissions and limitations under the Apache License.
#
# A script for generating the code point tables for the XID_Start and XID_Continue
# character classes. This takes a source UnicodeDatabase.txt from the Unicode standard
# and generates a C++ source file that populates unordered sets (individual code points)
# and vectors of ranges (contiguous blocks of code points) with the appropriate values.

import os

from argparse import ArgumentParser
from ctypes import c_uint

# Name of the Unicode database file that is looked up inside --srcDir.
UNICODE_DATABASE_FILE = "UnicodeDatabase.txt"
# NOTE(review): not referenced by the visible part of this script - confirm
# whether it is still needed.
SPECIAL_CASES_FILE = "SpecialCasing.txt"
# Name of the generated C++ source file that is written into --destDir.
CPP_FILE_NAME = "unicodeCharacterClasses.cpp"
# Character class codes (column 2 of the database) whose code points are
# recorded as XID_Start.
XID_START_CLASS = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
# Character class codes (column 2 of the database) whose code points are
# recorded as XID_Continue.
XID_CONTINUE_CLASS = ["Nd", "Mn", "Mc", "Pc"]

# License banner written verbatim at the top of the generated cpp file.
CPP_FILE_HEADER = """
//
// Copyright 2016 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
// names, trademarks, service marks, or product names of the Licensor
// and its affiliates, except as required to comply with Section 4(c) of
// the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.
//
"""

# Include directives written to the generated cpp file right after the
# license banner. <cstdint> is listed explicitly because the generated
# definitions use uint32_t and should not depend on a transitive include.
INCLUDE_HEADERS = """
#include "pxr/pxr.h"
#include "pxr/base/tf/unicodeUtilsImpl.h"
#include <cstdint>
#include <unordered_set>
#include <vector>
#include <utility>
"""

# Accumulators filled while the database is parsed and read back when the
# cpp file is written. Code points are stored as decimal strings.
# Individual XID_Start / XID_Continue code points:
xid_start_class = []
xid_continue_class = []
# (first, last) code point pairs taken from the database's First/Last range
# entries:
xid_start_range_pairs = []
xid_continue_range_pairs = []

def _write_cpp_files(destination_directory : str):
    """
    Writes the C++ code file that will initialize character class
    sets with the values read by this script.

    The generated file contains, in order: the license banner, the include
    directives, and (inside the pxr namespace scope macros) the definitions
    of the two code point sets and the two code point range vectors in
    TfUnicodeUtils::Impl.

    Args:
        destination_directory: A string defining the path at which the
            generated cpp file will be written to. If the specified
            directory does not exist, it will be created (including any
            missing parent directories).
    """
    # makedirs instead of exists() + mkdir(): this also creates missing parent
    # directories and does not fail if the directory appears between the
    # check and the creation
    os.makedirs(destination_directory, exist_ok=True)

    def _format_ranges(range_pairs):
        # renders [(a, b), (c, d)] as "{a,b},{c,d}"; an empty list gives ""
        return ','.join('{' + first + ',' + last + '}' for first, last in range_pairs)

    generated_cpp_file_name = os.path.join(destination_directory, CPP_FILE_NAME)
    with open(generated_cpp_file_name, 'w') as generated_cpp_file:
        # write the header comment
        generated_cpp_file.write(CPP_FILE_HEADER)

        # write includes
        generated_cpp_file.write(INCLUDE_HEADERS)

        # open the namespace
        generated_cpp_file.write("PXR_NAMESPACE_OPEN_SCOPE\n\n")

        # generate the sets of individual code points
        generated_cpp_file.write("std::unordered_set<uint32_t> TfUnicodeUtils::Impl::xidStartClass = {")
        generated_cpp_file.write(','.join(xid_start_class))
        generated_cpp_file.write("};\n\n")
        generated_cpp_file.write("std::unordered_set<uint32_t> TfUnicodeUtils::Impl::xidContinueClass = {")
        generated_cpp_file.write(','.join(xid_continue_class))
        generated_cpp_file.write("};\n\n")

        # generate the vectors of (first, last) code point ranges
        generated_cpp_file.write("std::vector<std::pair<uint32_t, uint32_t>> TfUnicodeUtils::Impl::xidStartRangeClass = {")
        generated_cpp_file.write(_format_ranges(xid_start_range_pairs))
        generated_cpp_file.write("};\n\n")
        generated_cpp_file.write("std::vector<std::pair<uint32_t, uint32_t>> TfUnicodeUtils::Impl::xidContinueRangeClass = {")
        generated_cpp_file.write(_format_ranges(xid_continue_range_pairs))
        generated_cpp_file.write("};\n\n")

        # close the namespace
        generated_cpp_file.write("PXR_NAMESPACE_CLOSE_SCOPE\n")

def _parseArguments():
"""
Parses the arguments sent to the script.
Returns:
An object containing the parsed arguments as accessible fields.
"""
parser = ArgumentParser(description='Generate character class sets for Unicode characters.')
parser.add_argument('--srcDir', required=False, default=os.getcwd(),
help='The source directory where the UnicodeDatabase.txt file exists.')
parser.add_argument('--destDir', required=False, default=os.getcwd(),
help='The destination directory where the processed cpp file will be written to.')

return parser.parse_args()

if __name__ == '__main__':
    # parse the command line arguments (--srcDir / --destDir)
    arguments = _parseArguments()

    # parse the UnicodeDatabase.txt file
    # each line represents a single code point with information about the character
    # represented by that code point
    # for now we are only interested in the code point itself and
    # the character class, which reside in columns 0 and 2 respectively
    unicode_database_file_name = os.path.join(arguments.srcDir, UNICODE_DATABASE_FILE)
    if not os.path.exists(unicode_database_file_name):
        raise RuntimeError(f"Error in script: Could not find 'UnicodeDatabase.txt' at path {arguments.srcDir}!")

    # decimal string of the code point of the most recent ", First" entry,
    # or None while no range is open
    range_start = None
    with open(unicode_database_file_name, 'r') as unicode_database_file:
        for line in unicode_database_file:
            # tolerate blank lines and '#' comment lines instead of failing
            # on int('') when converting the code point column
            stripped_line = line.strip()
            if not stripped_line or stripped_line.startswith('#'):
                continue

            # split the line into its ';' separated columns
            tokens = stripped_line.split(';')
            code_point = int(tokens[0], 16)
            character_name = tokens[1]
            character_class = tokens[2]

            if '<' in character_name:
                # this is an indication that the character is a group of characters
                # that fall in a range of code points that all have the same character class
                # with more specific properties given elsewhere
                # we don't need those, but we do need to account for ranges
                # the first part of the range always comes before the last part in the
                # UnicodeDatabase.txt file, so we can track it very simply
                if ', First' in character_name:
                    # it's the first character in the range
                    range_start = str(code_point)
                elif ', Last' in character_name:
                    # it's the last character in the range
                    if character_class in XID_START_CLASS:
                        target_ranges = xid_start_range_pairs
                    elif character_class in XID_CONTINUE_CLASS:
                        target_ranges = xid_continue_range_pairs
                    else:
                        target_ranges = None

                    if target_ranges is not None:
                        if range_start is None:
                            # fail here with a clear message rather than with a
                            # TypeError later while writing the cpp file
                            raise RuntimeError(
                                f"Error in script: range end {tokens[0]} has no matching range start!")
                        target_ranges.append((range_start, str(code_point)))

                    range_start = None
            else:
                if character_class in XID_START_CLASS:
                    xid_start_class.append(str(code_point))
                elif character_class in XID_CONTINUE_CLASS:
                    xid_continue_class.append(str(code_point))

                if code_point == 95:
                    # special case is underscore, which we will add to the XID_Start class because
                    # C++ / Python allow it specifically (it's a separate if because it's part of
                    # the "Pc" class, meaning it is considered XID_Continue)
                    xid_start_class.append(str(code_point))

    _write_cpp_files(arguments.destDir)
43 changes: 43 additions & 0 deletions pxr/base/tf/unicodeCharacterClasses.cpp

Large diffs are not rendered by default.

47 changes: 47 additions & 0 deletions pxr/base/tf/unicodeCharacterClasses.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
//
// Copyright 2023 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
// names, trademarks, service marks, or product names of the Licensor
// and its affiliates, except as required to comply with Section 4(c) of
// the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.
//
#ifndef PXR_BASE_TF_UNICODE_CHARACTER_CLASSES_H
#define PXR_BASE_TF_UNICODE_CHARACTER_CLASSES_H

#include "pxr/pxr.h"

#include <cstdint>
#include <unordered_set>
#include <utility>
#include <vector>

PXR_NAMESPACE_OPEN_SCOPE

namespace TfUnicodeUtils {
namespace Impl {

// Code point tables for the XID_Start and XID_Continue character classes.
// The contents are generated from the source UnicodeDatabase.txt file
// (see unicode/tfGenCharacterClasses.py).

// XID_Start: individually listed code points, followed by the
// (first, last) code point pairs of the database's range entries.
extern std::unordered_set<uint32_t> xidStartClass;
extern std::vector<std::pair<uint32_t, uint32_t>> xidStartRangeClass;

// XID_Continue: individually listed code points, followed by the
// (first, last) code point pairs of the database's range entries.
extern std::unordered_set<uint32_t> xidContinueClass;
extern std::vector<std::pair<uint32_t, uint32_t>> xidContinueRangeClass;

} // namespace Impl
} // namespace TfUnicodeUtils

PXR_NAMESPACE_CLOSE_SCOPE

#endif // PXR_BASE_TF_UNICODE_CHARACTER_CLASSES_H
Loading

0 comments on commit 58a29e2

Please sign in to comment.