Support for UTF-8 chars in Identifiers

- Modified path parser to accept UTF-8 characters for identifiers
- Modified identifier validity rules to accept valid UTF-8 identifiers (XID_Start followed by XID_Continue)
- Added UTF-8 utility functions to tf
- Added tests for UTF-8 based paths
- Added reference UnicodeDatabase.txt for character classes
PixarAnimationStudios · Sep 13, 2023 · 58a29e2 · 58a29e2
1 parent 7235163
commit 58a29e2
Show file tree

Hide file tree

Showing 14 changed files with 35,781 additions and 113 deletions.
diff --git a/pxr/base/tf/CMakeLists.txt b/pxr/base/tf/CMakeLists.txt
@@ -83,6 +83,7 @@ pxr_library(tf
         type
         typeFunctions
         typeNotice
+        unicodeUtilsImpl
         warning
         weakBase
         weakPtr
@@ -146,6 +147,7 @@ pxr_library(tf
         staticTokens.h
         typeInfoMap.h
         type_Impl.h
+        unicodeUtils.h
 
     PYTHON_PUBLIC_HEADERS
         py3Compat.h
@@ -157,6 +159,7 @@ pxr_library(tf
         atomicRenameUtil
         debugCodes
         noticeRegistry
+        unicodeCharacterClasses
 
     PYTHON_PRIVATE_CLASSES
         pyErrorInternal

diff --git a/pxr/base/tf/stringUtils.cpp b/pxr/base/tf/stringUtils.cpp
@@ -1170,34 +1170,7 @@ TfStringCatPaths( const string &prefix, const string &suffix )
 std::string
 TfMakeValidIdentifier(const std::string &in)
 {
-    std::string result;
-
-    if (in.empty()) {
-        result.push_back('_');
-        return result;
-    }
-
-    result.reserve(in.size());
-    char const *p = in.c_str();
-    if (!(('a' <= *p && *p <= 'z') || 
-          ('A' <= *p && *p <= 'Z') || 
-          *p == '_')) {
-        result.push_back('_');
-    } else {
-        result.push_back(*p);
-    }
-
-    for (++p; *p; ++p) {
-        if (!(('a' <= *p && *p <= 'z') ||    
-              ('A' <= *p && *p <= 'Z') ||  
-              ('0' <= *p && *p <= '9') ||  
-              *p == '_')) {
-            result.push_back('_');
-        } else {
-            result.push_back(*p);
-        }
-    }
-    return result;
+    return TfUnicodeUtils::MakeValidUTF8Identifier(in);
 }
 
 std::string

diff --git a/pxr/base/tf/stringUtils.h b/pxr/base/tf/stringUtils.h
@@ -34,6 +34,7 @@
 #include "pxr/base/arch/inttypes.h"
 #include "pxr/base/tf/api.h"
 #include "pxr/base/tf/enum.h"
+#include "pxr/base/tf/unicodeUtils.h"
 
 #include <cstdarg>
 #include <cstring>
@@ -695,18 +696,7 @@ std::string TfStringCatPaths( const std::string &prefix,
 inline bool
 TfIsValidIdentifier(std::string const &identifier)
 {
-    char const *p = identifier.c_str();
-    auto letter = [](unsigned c) { return ((c-'A') < 26) || ((c-'a') < 26); };
-    auto number = [](unsigned c) { return (c-'0') < 10; };
-    auto under = [](unsigned c) { return c == '_'; };
-    unsigned x = *p;
-    if (!x || number(x)) {
-        return false;
-    }
-    while (letter(x) || number(x) || under(x)) {
-        x = *p++;
-    };
-    return x == 0;
+    return TfUnicodeUtils::IsValidUTF8Identifier(identifier.begin(), identifier.end());
 }
 
 /// Produce a valid identifier (see TfIsValidIdentifier) from \p in by

diff --git a/pxr/base/tf/unicode/UnicodeDatabase.txt b/pxr/base/tf/unicode/UnicodeDatabase.txt
diff --git a/pxr/base/tf/unicode/tfGenCharacterClasses.py b/pxr/base/tf/unicode/tfGenCharacterClasses.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+#
+# Copyright 2016 Pixar
+#
+# Licensed under the Apache License, Version 2.0 (the "Apache License")
+# with the following modification; you may not use this file except in
+# compliance with the Apache License and the following modification to it:
+# Section 6. Trademarks. is deleted and replaced with:
+#
+# 6. Trademarks. This License does not grant permission to use the trade
+#    names, trademarks, service marks, or product names of the Licensor
+#    and its affiliates, except as required to comply with Section 4(c) of
+#    the License and to reproduce the content of the NOTICE file.
+#
+# You may obtain a copy of the Apache License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the Apache License with the above modification is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the Apache License for the specific
+# language governing permissions and limitations under the Apache License.
+#
+# A script for generating the character class sets for XID_Start and XID_Continue
+# character classes.  This takes a source UnicodeDatabase.txt from the Unicode standard
+# and generates C++ source files that populate unordered sets with the appropriate
+# code points.
+
+import os
+
+from argparse import ArgumentParser
+from ctypes import c_uint
+
+UNICODE_DATABASE_FILE = "UnicodeDatabase.txt"
+SPECIAL_CASES_FILE = "SpecialCasing.txt"
+CPP_FILE_NAME = "unicodeCharacterClasses.cpp"
+XID_START_CLASS = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
+XID_CONTINUE_CLASS = ["Nd", "Mn", "Mc", "Pc"]
+
+CPP_FILE_HEADER = """
+//
+// Copyright 2016 Pixar
+//
+// Licensed under the Apache License, Version 2.0 (the "Apache License")
+// with the following modification; you may not use this file except in
+// compliance with the Apache License and the following modification to it:
+// Section 6. Trademarks. is deleted and replaced with:
+//
+// 6. Trademarks. This License does not grant permission to use the trade
+//    names, trademarks, service marks, or product names of the Licensor
+//    and its affiliates, except as required to comply with Section 4(c) of
+//    the License and to reproduce the content of the NOTICE file.
+//
+// You may obtain a copy of the Apache License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the Apache License with the above modification is
+// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the Apache License for the specific
+// language governing permissions and limitations under the Apache License.
+//
+"""
+
+INCLUDE_HEADERS = """
+#include "pxr/pxr.h"
+#include "pxr/base/tf/unicodeUtilsImpl.h"
+
+#include <unordered_set>
+#include <vector>
+#include <utility>
+
+"""
+
+xid_start_class = []
+xid_continue_class = []
+xid_start_range_pairs = []
+xid_continue_range_pairs = []
+
+def _write_cpp_files(destination_directory : str):
+    """
+    Writes the C++ code file that will initialize character class
+    sets with the values read by this script.
+
+    The file is named by CPP_FILE_NAME and defines four containers in the
+    TfUnicodeUtils::Impl namespace: xidStartClass and xidContinueClass
+    (individual code points) and xidStartRangeClass and xidContinueRangeClass
+    (pairs of first / last code points), populated from the module-level
+    lists xid_start_class, xid_continue_class, xid_start_range_pairs and
+    xid_continue_range_pairs respectively.
+
+    Args:
+        destination_directory: A string defining the path at which the generated cpp file will be written to.
+                               If the specified directory does not exist, it will be created
+                               (along with any missing parent directories).
+    """
+    # makedirs rather than mkdir so that a nested destination whose parent
+    # directories are also missing can be created; exist_ok makes the call
+    # a no-op when the directory is already there
+    os.makedirs(destination_directory, exist_ok=True)
+
+    def _format_range_pairs(range_pairs):
+        # each (first, last) pair of decimal strings becomes a brace
+        # initializer for a std::pair, e.g. {13312,19903}
+        # an empty list yields an empty string, i.e. an empty initializer list
+        return ','.join('{' + first + ',' + last + '}' for first, last in range_pairs)
+
+    generated_cpp_file_name = os.path.join(destination_directory, CPP_FILE_NAME)
+    with open(generated_cpp_file_name, 'w') as generated_cpp_file:
+        # write the header comment
+        generated_cpp_file.write(CPP_FILE_HEADER)
+
+        # write includes
+        generated_cpp_file.write(INCLUDE_HEADERS)
+
+        # open the namespace
+        generated_cpp_file.write("PXR_NAMESPACE_OPEN_SCOPE\n\n")
+
+        # generate the sets of individual code points
+        generated_cpp_file.write("std::unordered_set<uint32_t> TfUnicodeUtils::Impl::xidStartClass = {")
+        generated_cpp_file.write(','.join(xid_start_class))
+        generated_cpp_file.write("};\n\n")
+        generated_cpp_file.write("std::unordered_set<uint32_t> TfUnicodeUtils::Impl::xidContinueClass = {")
+        generated_cpp_file.write(','.join(xid_continue_class))
+        generated_cpp_file.write("};\n\n")
+
+        # generate the vectors of code point ranges
+        generated_cpp_file.write("std::vector<std::pair<uint32_t, uint32_t>> TfUnicodeUtils::Impl::xidStartRangeClass = {")
+        generated_cpp_file.write(_format_range_pairs(xid_start_range_pairs))
+        generated_cpp_file.write("};\n\n")
+        generated_cpp_file.write("std::vector<std::pair<uint32_t, uint32_t>> TfUnicodeUtils::Impl::xidContinueRangeClass = {")
+        generated_cpp_file.write(_format_range_pairs(xid_continue_range_pairs))
+        generated_cpp_file.write("};\n\n")
+
+        # close the namespace
+        generated_cpp_file.write("PXR_NAMESPACE_CLOSE_SCOPE\n")
+
+def _parseArguments():
+    """
+    Builds the command line parser for this script and runs it.
+
+    Two optional arguments are recognized, --srcDir and --destDir; each
+    falls back to the current working directory when it is not given.
+
+    Returns:
+        The parsed arguments, exposing srcDir and destDir as fields.
+    """
+    # both options share the same fallback location
+    current_directory = os.getcwd()
+
+    argument_parser = ArgumentParser(
+        description='Generate character class sets for Unicode characters.')
+    argument_parser.add_argument(
+        '--srcDir',
+        required=False,
+        default=current_directory,
+        help='The source directory where the UnicodeDatabase.txt file exists.')
+    argument_parser.add_argument(
+        '--destDir',
+        required=False,
+        default=current_directory,
+        help='The destination directory where the processed cpp file will be written to.')
+
+    parsed_arguments = argument_parser.parse_args()
+    return parsed_arguments
+
+if __name__ == '__main__':
+    # parse the command line arguments
+    arguments = _parseArguments()
+
+    # locate the UnicodeDatabase.txt file
+    # each line represents a single code point with information about the character
+    # represented by that code point
+    # for now we are only interested in the code point itself, the character name
+    # (used to detect ranges) and the character class, which reside in
+    # columns 0, 1 and 2 respectively
+    unicode_database_file_name = os.path.join(arguments.srcDir, UNICODE_DATABASE_FILE)
+    if not os.path.exists(unicode_database_file_name):
+        raise RuntimeError(f"Error in script: Could not find 'UnicodeDatabase.txt' at path {arguments.srcDir}!")
+
+    # code point (as a decimal string) of the most recent ', First' range entry
+    # that has not yet been closed by its ', Last' entry, or None if there is none
+    first_pair = None
+    with open(unicode_database_file_name, 'r') as unicode_database_file:
+        for line in unicode_database_file:
+            # tolerate blank lines and '#' comment lines instead of failing
+            # to parse an empty code point field
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+
+            # split the line into its ';' separated columns
+            tokens = line.split(';')
+            if len(tokens) < 3:
+                raise RuntimeError(f"Error in script: Malformed line in '{UNICODE_DATABASE_FILE}': {line}")
+
+            code_point = int(tokens[0], 16)
+            character_name = tokens[1]
+            character_class = tokens[2]
+            if '<' in character_name:
+                # this is an indication that the character is a group of characters
+                # that fall in a range of code points that all have the same character class
+                # with more specific properties given elsewhere
+                # we don't need those, but we do need to account for ranges
+                # the first part of the range is expected to appear before the last part,
+                # so we only need to remember the most recent first entry
+                # (names containing '<' that are neither ', First' nor ', Last' are ignored)
+                if ', First' in character_name:
+                    # it's the first character in the range
+                    first_pair = str(code_point)
+                elif ', Last' in character_name:
+                    # it's the second character in the range
+                    if character_class in XID_START_CLASS:
+                        range_pairs = xid_start_range_pairs
+                    elif character_class in XID_CONTINUE_CLASS:
+                        range_pairs = xid_continue_range_pairs
+                    else:
+                        # the range is of no interest to either class
+                        range_pairs = None
+
+                    if range_pairs is not None:
+                        if first_pair is None:
+                            # fail here with a clear message rather than much later
+                            # when the pair is formatted for the generated file
+                            raise RuntimeError(f"Error in script: Range end without a matching range start in '{UNICODE_DATABASE_FILE}': {line}")
+                        range_pairs.append((first_pair, str(code_point)))
+
+                    first_pair = None
+            else:
+                if character_class in XID_START_CLASS:
+                    xid_start_class.append(str(code_point))
+                elif character_class in XID_CONTINUE_CLASS:
+                    xid_continue_class.append(str(code_point))
+
+            if code_point == 95:
+                # special case is underscore, which we will add to the XID_Start class because
+                # C++ / Python allow it specifically (it's a separate if because it's part of
+                # the "Pc" class, meaning it is considered XID_Continue)
+                xid_start_class.append(str(code_point))
+
+    _write_cpp_files(arguments.destDir)
diff --git a/pxr/base/tf/unicodeCharacterClasses.cpp b/pxr/base/tf/unicodeCharacterClasses.cpp
diff --git a/pxr/base/tf/unicodeCharacterClasses.h b/pxr/base/tf/unicodeCharacterClasses.h
@@ -0,0 +1,47 @@
+//
+// Copyright 2023 Pixar
+//
+// Licensed under the Apache License, Version 2.0 (the "Apache License")
+// with the following modification; you may not use this file except in
+// compliance with the Apache License and the following modification to it:
+// Section 6. Trademarks. is deleted and replaced with:
+//
+// 6. Trademarks. This License does not grant permission to use the trade
+//    names, trademarks, service marks, or product names of the Licensor
+//    and its affiliates, except as required to comply with Section 4(c) of
+//    the License and to reproduce the content of the NOTICE file.
+//
+// You may obtain a copy of the Apache License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the Apache License with the above modification is
+// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the Apache License for the specific
+// language governing permissions and limitations under the Apache License.
+//
+#ifndef PXR_BASE_TF_UNICODE_CHARACTER_CLASSES_H
+#define PXR_BASE_TF_UNICODE_CHARACTER_CLASSES_H
+
+#include "pxr/pxr.h"
+
+// <cstdint> provides uint32_t and <utility> provides std::pair; include them
+// directly so this header does not rely on transitive includes.
+#include <cstdint>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+PXR_NAMESPACE_OPEN_SCOPE
+
+namespace TfUnicodeUtils {
+namespace Impl {
+    // these data structures hold information generated from the source
+    // UnicodeDatabase.txt file (see unicode/tfGenCharacterClasses.py)
+
+    // code points listed individually in the database whose character class
+    // the generator script assigns to XID_Start (the script also adds the
+    // underscore, code point 95, to this set) and to XID_Continue
+    extern std::unordered_set<uint32_t> xidStartClass;
+    extern std::unordered_set<uint32_t> xidContinueClass;
+
+    // (first, last) code point pairs taken from the database's
+    // ", First" / ", Last" range entries for the same two groups of classes
+    extern std::vector<std::pair<uint32_t, uint32_t>> xidStartRangeClass;
+    extern std::vector<std::pair<uint32_t, uint32_t>> xidContinueRangeClass;
+}
+}
+
+PXR_NAMESPACE_CLOSE_SCOPE
+
+#endif // PXR_BASE_TF_UNICODE_CHARACTER_CLASSES_H