From c10c452b8310e659c1e30db4b9e24045c91b5baf Mon Sep 17 00:00:00 2001 From: Luigi Pinca Date: Tue, 3 Jan 2023 21:15:36 +0100 Subject: [PATCH] [major] Use the is_utf8 library (#109) Use the is_utf8 [1] library to greatly improve performance. [1]: https://github.com/simdutf/is_utf8 --- .gitignore | 1 + LICENSE | 27 +- binding.gyp | 7 +- deps/is_utf8/CMakeLists.txt | 106 + deps/is_utf8/LICENSE-APACHE | 204 + deps/is_utf8/LICENSE-BOOST | 23 + deps/is_utf8/LICENSE-MIT | 18 + deps/is_utf8/README.md | 59 + deps/is_utf8/benchmarks/CMakeLists.txt | 13 + deps/is_utf8/benchmarks/bench.cpp | 251 + deps/is_utf8/cmake/add_cpp_test.cmake | 59 + deps/is_utf8/cmake/import.cmake | 48 + deps/is_utf8/cmake/is_utf8-config.cmake.in | 2 + deps/is_utf8/include/is_utf8.h | 10 + deps/is_utf8/src/CMakeLists.txt | 38 + deps/is_utf8/src/is_utf8.cpp | 6740 ++++++++++++++++++++ deps/is_utf8/tests/CMakeLists.txt | 8 + deps/is_utf8/tests/unit.cpp | 338 + package.json | 3 + src/validation.c | 109 - src/validation.cc | 40 + 21 files changed, 7989 insertions(+), 115 deletions(-) create mode 100644 deps/is_utf8/CMakeLists.txt create mode 100644 deps/is_utf8/LICENSE-APACHE create mode 100644 deps/is_utf8/LICENSE-BOOST create mode 100644 deps/is_utf8/LICENSE-MIT create mode 100644 deps/is_utf8/README.md create mode 100644 deps/is_utf8/benchmarks/CMakeLists.txt create mode 100644 deps/is_utf8/benchmarks/bench.cpp create mode 100644 deps/is_utf8/cmake/add_cpp_test.cmake create mode 100644 deps/is_utf8/cmake/import.cmake create mode 100644 deps/is_utf8/cmake/is_utf8-config.cmake.in create mode 100644 deps/is_utf8/include/is_utf8.h create mode 100644 deps/is_utf8/src/CMakeLists.txt create mode 100644 deps/is_utf8/src/is_utf8.cpp create mode 100644 deps/is_utf8/tests/CMakeLists.txt create mode 100644 deps/is_utf8/tests/unit.cpp delete mode 100644 src/validation.c create mode 100644 src/validation.cc diff --git a/.gitignore b/.gitignore index 5f2ba3e..d6e2211 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 
+1,4 @@ +deps/is_utf8/.github/ node_modules/ prebuilds/ build/ diff --git a/LICENSE b/LICENSE index 0b37d7b..2e1dc33 100644 --- a/LICENSE +++ b/LICENSE @@ -23,10 +23,29 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -This license applies to parts originating from -https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c: +This license applies to all parts of utf-8-validate that are not externally +maintained libraries. + +The externally maintained is_utf8 library used by utf-8-validate, located at +deps/is_utf8, is licensed as follows: """ -Markus Kuhn -- 2005-03-30 -License: http://www.cl.cam.ac.uk/~mgk25/short-license.html +Copyright 2022 The is_utf8 authors + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" diff --git a/binding.gyp b/binding.gyp index 30edf27..d873b5f 100644 --- a/binding.gyp +++ b/binding.gyp @@ -2,8 +2,11 @@ 'targets': [ { 'target_name': 'validation', - 'sources': ['src/validation.c'], - 'cflags': ['-std=c99'], + 'sources': [ + 'src/validation.cc', + 'deps/is_utf8/src/is_utf8.cpp' + ], + 'cflags_cc': ['-std=gnu++11'], 'conditions': [ ["OS=='mac'", { 'xcode_settings': { diff --git a/deps/is_utf8/CMakeLists.txt b/deps/is_utf8/CMakeLists.txt new file mode 100644 index 0000000..39b8411 --- /dev/null +++ b/deps/is_utf8/CMakeLists.txt @@ -0,0 +1,106 @@ +cmake_minimum_required(VERSION 3.15) + +project(is_utf8 + DESCRIPTION "Fast UTF-8 Validation" + LANGUAGES CXX + VERSION 1.2.1 +) + +include(GNUInstallDirs) +include(CTest) + +option(IS_UTF8_SANITIZE "Sanitize addresses" OFF) + +if (NOT CMAKE_BUILD_TYPE) + message(STATUS "No build type selected, default to Release") + if(IS_UTF8_SANITIZE) + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build." FORCE) + else() + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) + endif() +endif() + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_MACOSX_RPATH OFF) + +set(IS_UTF8_LIB_VERSION "1.2.1" CACHE STRING "is_utf8 library version") +set(IS_UTF8_LIB_SOVERSION "1" CACHE STRING "is_utf8 library soversion") + +set(IS_UTF8_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) +add_subdirectory(src) + +if (BUILD_TESTING) + message(STATUS "The tests are enabled.") + add_subdirectory(tests) +else() + message(STATUS "The tests are disabled.") +endif(BUILD_TESTING) + + +add_subdirectory(benchmarks) +# ---- Install rules ---- +add_library(is_utf8::is_utf8 ALIAS is_utf8) + +set_target_properties( + is_utf8 PROPERTIES + VERSION "${IS_UTF8_LIB_VERSION}" + SOVERSION "${IS_UTF8_LIB_SOVERSION}" + WINDOWS_EXPORT_ALL_SYMBOLS YES +) + +include(CMakePackageConfigHelpers) +include(GNUInstallDirs) + +install( + FILES include/is_utf8.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + COMPONENT is_utf8_Development +) + +install( + TARGETS is_utf8 + EXPORT is_utf8Targets + RUNTIME COMPONENT is_utf8_Runtime + LIBRARY COMPONENT is_utf8_Runtime + NAMELINK_COMPONENT is_utf8_Development + ARCHIVE COMPONENT is_utf8_Development + INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" +) + +configure_file(cmake/is_utf8-config.cmake.in is_utf8-config.cmake @ONLY) + +write_basic_package_version_file( + is_utf8-config-version.cmake + COMPATIBILITY SameMinorVersion +) + +set( + IS_UTF8_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/is_utf8" + CACHE STRING "CMake package config location relative to the install prefix" +) +mark_as_advanced(IS_UTF8_INSTALL_CMAKEDIR) + +install( + FILES + "${PROJECT_BINARY_DIR}/is_utf8-config.cmake" + "${PROJECT_BINARY_DIR}/is_utf8-config-version.cmake" + DESTINATION "${IS_UTF8_INSTALL_CMAKEDIR}" + COMPONENT is_utf8_Development +) + +# +# CPack +# +if(is_top_project) + set(CPACK_PACKAGE_VENDOR "Daniel Lemire") + set(CPACK_PACKAGE_CONTACT "lemire@gmail.com") + set(CPACK_RESOURCE_FILE_LICENSE 
"${PROJECT_SOURCE_DIR}/LICENSE-MIT") + set(CPACK_RPM_PACKAGE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE-MIT") + set(CPACK_RESOURCE_FILE_README "${PROJECT_SOURCE_DIR}/README.md") + set(CPACK_SOURCE_GENERATOR "TGZ;ZIP") + include(CPack) +endif() + +# ---- diff --git a/deps/is_utf8/LICENSE-APACHE b/deps/is_utf8/LICENSE-APACHE new file mode 100644 index 0000000..db01662 --- /dev/null +++ b/deps/is_utf8/LICENSE-APACHE @@ -0,0 +1,204 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +NOTE: This software may be used as part of software released under the +GNU General Public License (GPL) version 2. diff --git a/deps/is_utf8/LICENSE-BOOST b/deps/is_utf8/LICENSE-BOOST new file mode 100644 index 0000000..36b7cd9 --- /dev/null +++ b/deps/is_utf8/LICENSE-BOOST @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/deps/is_utf8/LICENSE-MIT b/deps/is_utf8/LICENSE-MIT new file mode 100644 index 0000000..5efe84c --- /dev/null +++ b/deps/is_utf8/LICENSE-MIT @@ -0,0 +1,18 @@ +Copyright 2022 The is_utf8 authors + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/deps/is_utf8/README.md b/deps/is_utf8/README.md new file mode 100644 index 0000000..acc6895 --- /dev/null +++ b/deps/is_utf8/README.md @@ -0,0 +1,59 @@ +# is_utf8 + +Most strings online are in unicode using the UTF-8 encoding. Validating strings +quickly before accepting them is important. + + + + +## How to use is_utf8 + +This is a simple one-source file library to validate UTF-8 strings at high speeds using SIMD instructions. It works on all platforms (ARM, x64). 
+ +Build and link `is_utf8.cpp` with your project. Code usage: + +```C++ + #include "is_utf8.h" + + char * mystring = ... + bool is_it_valid = is_utf8(mystring, thestringlength); +``` + +It should be able to validate strings using less than 1 cycle per input byte. + +## Reference + +- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice & Experience 51 (5), 2021 + +### Want more? + +If you want a wide range of fast Unicode functions for production use, you can rely on the simdutf library. It is as simple as the following: + +```C++ +#include "simdutf.cpp" +#include "simdutf.h" + +int main(int argc, char *argv[]) { + const char *source = "1234"; + // 4 == strlen(source) + bool validutf8 = simdutf::validate_utf8(source, 4); + if (validutf8) { + std::cout << "valid UTF-8" << std::endl; + } else { + std::cerr << "invalid UTF-8" << std::endl; + return EXIT_FAILURE; + } +} +``` + +See https://github.com/simdutf/ + + +## License + +This library is distributed under the terms of any of the following +licenses, at your option: + +* Apache License (Version 2.0) [LICENSE-APACHE](LICENSE-APACHE), +* Boost Software License [LICENSE-BOOST](LICENSE-BOOST), or +* MIT License [LICENSE-MIT](LICENSE-MIT). 
diff --git a/deps/is_utf8/benchmarks/CMakeLists.txt b/deps/is_utf8/benchmarks/CMakeLists.txt new file mode 100644 index 0000000..0bceec7 --- /dev/null +++ b/deps/is_utf8/benchmarks/CMakeLists.txt @@ -0,0 +1,13 @@ + +include(${PROJECT_SOURCE_DIR}/cmake/import.cmake) +set_off(SIMDUTF_TOOLS) +set_off(SIMDUTF_BENCHMARKS) + +import_dependency(simdutf simdutf/simdutf v2.0.9) +set(BUILD_TESTING OFF) +add_dependency(simdutf) +unset(BUILD_TESTING) + +add_executable(bench bench.cpp) +target_link_libraries(bench PRIVATE is_utf8) +target_link_libraries(bench PRIVATE simdutf) diff --git a/deps/is_utf8/benchmarks/bench.cpp b/deps/is_utf8/benchmarks/bench.cpp new file mode 100644 index 0000000..4492894 --- /dev/null +++ b/deps/is_utf8/benchmarks/bench.cpp @@ -0,0 +1,251 @@ +#include "is_utf8.h" +#include "simdutf.h" +#include +#include +#include +#include +#include +#include +#include +#include + +uint64_t nano() { + return std::chrono::duration_cast<::std::chrono::nanoseconds>( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +} + +#ifdef _MSC_VER +#define never_inline __declspec(noinline) +#else +#define never_inline __attribute__((noinline)) +#endif + +// generate a string having at least length N +// can exceed by up to 3 chars, returns the actual length +size_t populate_utf8(char *data, size_t N) { + size_t i = 0; + for (; i < N;) { + int w = rand() & 0xFF; + if (w < 0x80) { + data[i++] = 0x20; // w; + } else if (w < 0xE0) { + data[i++] = 0xC2 + rand() % (0xDF - 0xC2 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } else if (w == 0xE0) { + data[i++] = w; + data[i++] = 0xA0 + rand() % (0xBF - 0xA0 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } else if (w <= 0xEC) { + data[i++] = w; + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } else if (w == 0xED) { + data[i++] = w; + data[i++] = 0x80 + rand() % (0x9F - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } else if (w <= 
0xEF) { + data[i++] = w; + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } else if (w < 0xF0) { + data[i++] = 0xF1 + rand() % (0xF3 - 0xF1 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } else if (w == 0xF0) { + data[i++] = w; + data[i++] = 0x90 + rand() % (0xBF - 0x90 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } else if (w <= 0xF3) { + data[i++] = 0xF1 + rand() % (0xF3 - 0xF1 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } else if (w == 0xF4) { + data[i++] = w; + data[i++] = 0x80 + rand() % (0x8F - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1); + } + } + data[i] = '\0'; + return i; +} + +// copied in part from Guava +static never_inline bool basic_validate_utf8(const char *b, size_t length) { + const unsigned char *bytes = (const unsigned char *)b; + for (size_t index = 0;;) { + unsigned char byte1; + + do { // fast ASCII Path + if (index >= length) { + return true; + } + byte1 = bytes[index++]; + } while (byte1 < 0x80); + if (byte1 < 0xE0) { + // Two-byte form. + if (index == length) { + return false; + } + if (byte1 < 0xC2 || bytes[index++] > 0xBF) { + return false; + } + } else if (byte1 < 0xF0) { + // Three-byte form. + if (index + 1 >= length) { + return false; + } + unsigned char byte2 = bytes[index++]; + if (byte2 > 0xBF + // Overlong? 5 most significant bits must not all be zero. + || (byte1 == 0xE0 && byte2 < 0xA0) + // Check for illegal surrogate codepoints. + || (byte1 == 0xED && 0xA0 <= byte2) + // Third byte trailing-byte test. + || bytes[index++] > 0xBF) { + return false; + } + } else { + + // Four-byte form. 
+ if (index + 2 >= length) { + return false; + } + int byte2 = bytes[index++]; + if (byte2 > 0xBF + // Check that 1 <= plane <= 16. Tricky optimized form of: + // if (byte1 > (byte) 0xF4 + // || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 + // || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) + || (((byte1 << 28) + (byte2 - 0x90)) >> 30) != 0 + // Third byte trailing-byte test + || bytes[index++] > 0xBF + // Fourth byte trailing-byte test + || bytes[index++] > 0xBF) { + return false; + } + } + } +} + + +bool zerobuffer_bench(size_t N) { + printf("zero buffer \n"); + printf("string size = %zu \n", N); + char *input = new char[N]{}; + volatile bool isgood{true}; + + { + uint64_t start = nano(); + uint64_t finish = start; + size_t count{0}; + uint64_t threshold = 500000000; + for (; finish - start < threshold;) { + count++; + isgood &= basic_validate_utf8(input, N); + finish = nano(); + } + double t = (N * count) / double(finish - start); + + printf("basic_validate_utf8 %f GB/s\n", t); + } + + { + uint64_t start = nano(); + uint64_t finish = start; + size_t count{0}; + uint64_t threshold = 500000000; + for (; finish - start < threshold;) { + count++; + isgood &= simdutf::validate_utf8(input, N); + finish = nano(); + } + double t = (N * count) / double(finish - start); + + printf("simdutf %f GB/s\n", t); + } + + { + uint64_t start = nano(); + uint64_t finish = start; + size_t count{0}; + uint64_t threshold = 500000000; + for (; finish - start < threshold;) { + count++; + isgood &= is_utf8(input, N); + finish = nano(); + } + double t = (N * count) / double(finish - start); + + printf("is_utf8 %f GB/s\n", t); + } + delete[] input; + printf("\n"); + return isgood; +} + +bool bench(size_t N) { + printf("random UTF-8\n"); + printf("string size = %zu \n", N); + char *input = new char[N]; + populate_utf8(input, N); + volatile bool isgood{true}; + + { + uint64_t start = nano(); + uint64_t finish = start; + size_t count{0}; + uint64_t threshold = 500000000; + for (; finish - 
start < threshold;) { + count++; + isgood &= basic_validate_utf8(input, N); + finish = nano(); + } + double t = (N * count) / double(finish - start); + + printf("basic_validate_utf8 %f GB/s\n", t); + } + + { + uint64_t start = nano(); + uint64_t finish = start; + size_t count{0}; + uint64_t threshold = 500000000; + for (; finish - start < threshold;) { + count++; + isgood &= simdutf::validate_utf8(input, N); + finish = nano(); + } + double t = (N * count) / double(finish - start); + + printf("simdutf %f GB/s\n", t); + } + + { + uint64_t start = nano(); + uint64_t finish = start; + size_t count{0}; + uint64_t threshold = 500000000; + for (; finish - start < threshold;) { + count++; + isgood &= is_utf8(input, N); + finish = nano(); + } + double t = (N * count) / double(finish - start); + + printf("is_utf8 %f GB/s\n", t); + } + delete[] input; + printf("\n"); + return isgood; +} + +int main() { + return (bench(40096) & bench(100000) & bench(50000)) + & (zerobuffer_bench(40096) & zerobuffer_bench(100000) & zerobuffer_bench(50000)) + ? EXIT_SUCCESS : EXIT_FAILURE; +} \ No newline at end of file diff --git a/deps/is_utf8/cmake/add_cpp_test.cmake b/deps/is_utf8/cmake/add_cpp_test.cmake new file mode 100644 index 0000000..c4ce6a5 --- /dev/null +++ b/deps/is_utf8/cmake/add_cpp_test.cmake @@ -0,0 +1,59 @@ +# Helper so we don't have to repeat ourselves so much +# Usage: add_cpp_test(testname [COMPILE_ONLY] [SOURCES a.cpp b.cpp ...] [LABELS acceptance per_implementation ...]) +# SOURCES defaults to testname.cpp if not specified. 
+function(add_cpp_test TEST_NAME) + # Parse arguments + cmake_parse_arguments(PARSE_ARGV 1 ARGS "COMPILE_ONLY;LIBRARY;WILL_FAIL" "" "SOURCES;LABELS;DEPENDENCY_OF") + if (NOT ARGS_SOURCES) + list(APPEND ARGS_SOURCES ${TEST_NAME}.cpp) + endif() + if (ARGS_COMPILE_ONLY) + list(APPEND ${ARGS_LABELS} compile_only) + endif() + if (IS_UTF8_SANITIZE) + add_compile_options(-fsanitize=address -fno-omit-frame-pointer -fno-sanitize-recover=all) + add_compile_definitions(ASAN_OPTIONS=detect_leaks=1) + endif() + # Add the compile target + if (ARGS_LIBRARY) + add_library(${TEST_NAME} STATIC ${ARGS_SOURCES}) + else(ARGS_LIBRARY) + add_executable(${TEST_NAME} ${ARGS_SOURCES}) + endif(ARGS_LIBRARY) + + # Add test + if (ARGS_COMPILE_ONLY OR ARGS_LIBRARY) + add_test( + NAME ${TEST_NAME} + COMMAND ${CMAKE_COMMAND} --build . --target ${TEST_NAME} --config $ + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + ) + set_target_properties(${TEST_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE EXCLUDE_FROM_DEFAULT_BUILD TRUE) + else() + add_test(${TEST_NAME} ${TEST_NAME}) + + # Add to