diff --git a/.gitignore b/.gitignore
index 5f2ba3e..d6e2211 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+deps/is_utf8/.github/
node_modules/
prebuilds/
build/
diff --git a/LICENSE b/LICENSE
index 0b37d7b..2e1dc33 100644
--- a/LICENSE
+++ b/LICENSE
@@ -23,10 +23,29 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
-This license applies to parts originating from
-https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c:
+This license applies to all parts of utf-8-validate that are not externally
+maintained libraries.
+
+The externally maintained is_utf8 library used by utf-8-validate, located at
+deps/is_utf8, is licensed as follows:
"""
-Markus Kuhn -- 2005-03-30
-License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+Copyright 2022 The is_utf8 authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
diff --git a/binding.gyp b/binding.gyp
index 30edf27..d873b5f 100644
--- a/binding.gyp
+++ b/binding.gyp
@@ -2,8 +2,11 @@
'targets': [
{
'target_name': 'validation',
- 'sources': ['src/validation.c'],
- 'cflags': ['-std=c99'],
+ 'sources': [
+ 'src/validation.cc',
+ 'deps/is_utf8/src/is_utf8.cpp'
+ ],
+ 'cflags_cc': ['-std=gnu++11'],
'conditions': [
["OS=='mac'", {
'xcode_settings': {
diff --git a/deps/is_utf8/CMakeLists.txt b/deps/is_utf8/CMakeLists.txt
new file mode 100644
index 0000000..39b8411
--- /dev/null
+++ b/deps/is_utf8/CMakeLists.txt
@@ -0,0 +1,106 @@
+cmake_minimum_required(VERSION 3.15)
+
+project(is_utf8
+  DESCRIPTION "Fast UTF-8 Validation"
+  LANGUAGES CXX
+  VERSION 1.2.1
+)
+
+include(GNUInstallDirs)
+include(CTest)
+include(CMakePackageConfigHelpers)
+
+# TRUE only when is_utf8 is the root of the build, not a subproject.
+string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}" is_top_project)
+
+option(IS_UTF8_SANITIZE "Sanitize addresses" OFF)
+option(IS_UTF8_BENCHMARKS "Build the benchmarks (they depend on simdutf)" ON)
+
+# Default the build type on single-config generators only: Debug when sanitizing, else Release.
+get_property(is_utf8_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(NOT is_utf8_multi_config AND NOT CMAKE_BUILD_TYPE)
+  set(is_utf8_default_build_type Release)
+  if(IS_UTF8_SANITIZE)
+    set(is_utf8_default_build_type Debug)
+  endif()
+  message(STATUS "No build type selected, default to ${is_utf8_default_build_type}")
+  set(CMAKE_BUILD_TYPE "${is_utf8_default_build_type}" CACHE STRING "Choose the type of build." FORCE)
+endif()
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_MACOSX_RPATH OFF)
+
+set(IS_UTF8_LIB_VERSION "${PROJECT_VERSION}" CACHE STRING "is_utf8 library version")
+set(IS_UTF8_LIB_SOVERSION "1" CACHE STRING "is_utf8 library soversion")
+
+set(IS_UTF8_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
+add_subdirectory(src)
+
+if(BUILD_TESTING)
+  message(STATUS "The tests are enabled.")
+  add_subdirectory(tests)
+else()
+  message(STATUS "The tests are disabled.")
+endif()
+
+if(IS_UTF8_BENCHMARKS)
+  add_subdirectory(benchmarks)
+endif()
+
+# ---- Install rules ----
+add_library(is_utf8::is_utf8 ALIAS is_utf8)
+
+set_target_properties(
+  is_utf8 PROPERTIES
+  VERSION "${IS_UTF8_LIB_VERSION}"
+  SOVERSION "${IS_UTF8_LIB_SOVERSION}"
+  WINDOWS_EXPORT_ALL_SYMBOLS YES
+)
+
+install(
+  FILES include/is_utf8.h
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+  COMPONENT is_utf8_Development
+)
+
+# NOTE(review): the is_utf8Targets export set is named here but no
+# install(EXPORT ...) rule is visible in this file -- confirm it is not needed.
+install(
+  TARGETS is_utf8
+  EXPORT is_utf8Targets
+  RUNTIME COMPONENT is_utf8_Runtime
+  LIBRARY COMPONENT is_utf8_Runtime
+  NAMELINK_COMPONENT is_utf8_Development
+  ARCHIVE COMPONENT is_utf8_Development
+  INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+)
+
+configure_file(cmake/is_utf8-config.cmake.in is_utf8-config.cmake @ONLY)
+
+write_basic_package_version_file(is_utf8-config-version.cmake COMPATIBILITY SameMinorVersion)
+
+set(IS_UTF8_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/is_utf8"
+  CACHE STRING "CMake package config location relative to the install prefix")
+mark_as_advanced(IS_UTF8_INSTALL_CMAKEDIR)
+
+install(
+  FILES
+  "${PROJECT_BINARY_DIR}/is_utf8-config.cmake"
+  "${PROJECT_BINARY_DIR}/is_utf8-config-version.cmake"
+  DESTINATION "${IS_UTF8_INSTALL_CMAKEDIR}"
+  COMPONENT is_utf8_Development
+)
+
+# ---- CPack (only when is_utf8 is the top-level project) ----
+if(is_top_project)
+  set(CPACK_PACKAGE_VENDOR "Daniel Lemire")
+  set(CPACK_PACKAGE_CONTACT "lemire@gmail.com")
+  set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE-MIT")
+  # The RPM License tag takes a license name, not a file path.
+  set(CPACK_RPM_PACKAGE_LICENSE "Apache-2.0 OR BSL-1.0 OR MIT")
+  set(CPACK_RESOURCE_FILE_README "${PROJECT_SOURCE_DIR}/README.md")
+  set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")
+  include(CPack)
+endif()
diff --git a/deps/is_utf8/LICENSE-APACHE b/deps/is_utf8/LICENSE-APACHE
new file mode 100644
index 0000000..db01662
--- /dev/null
+++ b/deps/is_utf8/LICENSE-APACHE
@@ -0,0 +1,204 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+NOTE: This software may be used as part of software released under the
+GNU General Public License (GPL) version 2.
diff --git a/deps/is_utf8/LICENSE-BOOST b/deps/is_utf8/LICENSE-BOOST
new file mode 100644
index 0000000..36b7cd9
--- /dev/null
+++ b/deps/is_utf8/LICENSE-BOOST
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/deps/is_utf8/LICENSE-MIT b/deps/is_utf8/LICENSE-MIT
new file mode 100644
index 0000000..5efe84c
--- /dev/null
+++ b/deps/is_utf8/LICENSE-MIT
@@ -0,0 +1,18 @@
+Copyright 2022 The is_utf8 authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/deps/is_utf8/README.md b/deps/is_utf8/README.md
new file mode 100644
index 0000000..acc6895
--- /dev/null
+++ b/deps/is_utf8/README.md
@@ -0,0 +1,59 @@
+# is_utf8
+
+Most strings online are in Unicode using the UTF-8 encoding. Validating strings
+quickly before accepting them is important.
+
+
+
+
+## How to use is_utf8
+
+This is a simple one-source file library to validate UTF-8 strings at high speeds using SIMD instructions. It works on all platforms (ARM, x64).
+
+Build and link `is_utf8.cpp` with your project. Code usage:
+
+```C++
+ #include "is_utf8.h"
+
+ char * mystring = ...
+ bool is_it_valid = is_utf8(mystring, thestringlength);
+```
+
+It should be able to validate strings using less than 1 cycle per input byte.
+
+## Reference
+
+- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice & Experience 51 (5), 2021
+
+### Want more?
+
+If you want a wide range of fast Unicode functions for production use, you can rely on the simdutf library. It is as simple as the following:
+
+```C++
+#include "simdutf.cpp"
+#include "simdutf.h"
+
+int main(int argc, char *argv[]) {
+ const char *source = "1234";
+ // 4 == strlen(source)
+ bool validutf8 = simdutf::validate_utf8(source, 4);
+ if (validutf8) {
+ std::cout << "valid UTF-8" << std::endl;
+ } else {
+ std::cerr << "invalid UTF-8" << std::endl;
+ return EXIT_FAILURE;
+ }
+}
+```
+
+See https://github.com/simdutf/
+
+
+## License
+
+This library is distributed under the terms of any of the following
+licenses, at your option:
+
+* Apache License (Version 2.0) [LICENSE-APACHE](LICENSE-APACHE),
+* Boost Software License [LICENSE-BOOST](LICENSE-BOOST), or
+* MIT License [LICENSE-MIT](LICENSE-MIT).
diff --git a/deps/is_utf8/benchmarks/CMakeLists.txt b/deps/is_utf8/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..0bceec7
--- /dev/null
+++ b/deps/is_utf8/benchmarks/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Benchmark executable that compares is_utf8 against simdutf (see bench.cpp).
+include(${PROJECT_SOURCE_DIR}/cmake/import.cmake)
+set_off(SIMDUTF_TOOLS)
+set_off(SIMDUTF_BENCHMARKS)
+
+import_dependency(simdutf simdutf/simdutf v2.0.9)
+# BUILD_TESTING is held OFF only while simdutf is added (presumably to skip its tests).
+set(BUILD_TESTING OFF)
+add_dependency(simdutf)
+unset(BUILD_TESTING)
+
+add_executable(bench bench.cpp)
+target_link_libraries(bench PRIVATE is_utf8 simdutf)
diff --git a/deps/is_utf8/benchmarks/bench.cpp b/deps/is_utf8/benchmarks/bench.cpp
new file mode 100644
index 0000000..4492894
--- /dev/null
+++ b/deps/is_utf8/benchmarks/bench.cpp
@@ -0,0 +1,251 @@
+#include "is_utf8.h"
+#include "simdutf.h"
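+// Standard library headers used below:
+//   <chrono> steady_clock timing, <cstddef> size_t, <cstdint> uint64_t,
+//   <cstdio> printf, <cstdlib> rand and EXIT_SUCCESS / EXIT_FAILURE.
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>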
+
+// Current std::chrono::steady_clock reading, in nanoseconds.
+uint64_t nano() {
+  const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();
+}
+
+#ifdef _MSC_VER
+#define never_inline __declspec(noinline)
+#else
+#define never_inline __attribute__((noinline))
+#endif
+
+// Fills `data` with randomly generated, well-formed UTF-8 and NUL-terminates it.
+// Whole sequences are appended until at least N bytes are written, so the
+// result can exceed N by up to 3 bytes: `data` needs room for N + 4 bytes
+// (including the terminator). Returns the length written, excluding the NUL.
+size_t populate_utf8(char *data, size_t N) {
+  size_t i = 0;
+  while (i < N) {
+    // `w` only selects the kind of sequence to emit; draws above 0xF4 match
+    // no branch, write nothing and are simply retried.
+    int w = rand() & 0xFF;
+    if (w < 0x80) {
+      data[i++] = 0x20; // always a space, not w itself
+    } else if (w < 0xE0) {
+      data[i++] = 0xC2 + rand() % (0xDF - 0xC2 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+    } else if (w == 0xE0) {
+      data[i++] = w;
+      data[i++] = 0xA0 + rand() % (0xBF - 0xA0 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+    } else if (w <= 0xEC) {
+      data[i++] = w;
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+    } else if (w == 0xED) {
+      data[i++] = w; // second byte capped at 0x9F: stays below the surrogates
+      data[i++] = 0x80 + rand() % (0x9F - 0x80 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+    } else if (w <= 0xEF) {
+      data[i++] = w;
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+    } else if (w == 0xF0) {
+      // 0xF0 lead: second byte starts at 0x90 (lower values would be overlong).
+      data[i++] = w;
+      data[i++] = 0x90 + rand() % (0xBF - 0x90 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+    } else if (w <= 0xF3) {
+      data[i++] = 0xF1 + rand() % (0xF3 - 0xF1 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+    } else if (w == 0xF4) {
+      data[i++] = w;
+      data[i++] = 0x80 + rand() % (0x8F - 0x80 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+      data[i++] = 0x80 + rand() % (0xBF - 0x80 + 1);
+    }
+  }
+  data[i] = '\0';
+  return i;
+}
+
+// Scalar UTF-8 validator (copied in part from Guava), the benchmark baseline.
+// Returns true when the `length` bytes starting at `b` are well-formed UTF-8.
+static never_inline bool basic_validate_utf8(const char *b, size_t length) {
+  const unsigned char *bytes = (const unsigned char *)b;
+  // Continuation bytes are 10xxxxxx; with unsigned bytes both bounds matter.
+  const auto is_continuation = [](unsigned char c) { return (c & 0xC0) == 0x80; };
+  for (size_t index = 0;;) {
+    unsigned char byte1;
+    do { // fast ASCII Path
+      if (index >= length) {
+        return true;
+      }
+      byte1 = bytes[index++];
+    } while (byte1 < 0x80);
+    if (byte1 < 0xE0) {
+      // Two-byte form.
+      if (index == length) {
+        return false;
+      }
+      if (byte1 < 0xC2 || !is_continuation(bytes[index++])) {
+        return false;
+      }
+    } else if (byte1 < 0xF0) {
+      // Three-byte form.
+      if (index + 1 >= length) {
+        return false;
+      }
+      unsigned char byte2 = bytes[index++];
+      if (!is_continuation(byte2)
+          // Overlong? 5 most significant bits must not all be zero.
+          || (byte1 == 0xE0 && byte2 < 0xA0)
+          // Check for illegal surrogate codepoints.
+          || (byte1 == 0xED && 0xA0 <= byte2)
+          // Third byte trailing-byte test.
+          || !is_continuation(bytes[index++])) {
+        return false;
+      }
+    } else {
+      // Four-byte form.
+      if (index + 2 >= length) {
+        return false;
+      }
+      unsigned char byte2 = bytes[index++];
+      if (!is_continuation(byte2)
+          // 1 <= plane <= 16, spelled out: Guava's shift trick overflows int here.
+          || byte1 > 0xF4
+          || (byte1 == 0xF0 && byte2 < 0x90)
+          || (byte1 == 0xF4 && byte2 > 0x8F)
+          // Third byte trailing-byte test
+          || !is_continuation(bytes[index++])
+          // Fourth byte trailing-byte test
+          || !is_continuation(bytes[index++])) {
+        return false;
+      }
+    }
+  }
+}
+
+
+// Times the three validators (scalar baseline, simdutf, is_utf8) on an
+// all-zero buffer of N bytes and prints the throughput of each.
+// Returns true if every call reported the buffer as valid.
+bool zerobuffer_bench(size_t N) {
+  printf("zero buffer \n");
+  printf("string size = %zu \n", N);
+  char *input = new char[N]{}; // value-initialized, i.e. all zero bytes
+  volatile bool isgood{true};
+  const uint64_t budget_ns = 500000000; // minimum run time per validator
+
+  { // scalar baseline
+    const uint64_t begin = nano();
+    uint64_t now = begin;
+    size_t rounds = 0;
+    do {
+      ++rounds;
+      isgood &= basic_validate_utf8(input, N);
+      now = nano();
+    } while (now - begin < budget_ns);
+    // bytes per nanosecond reads directly as GB/s
+    const double rate = (N * rounds) / double(now - begin);
+    printf("basic_validate_utf8 %f GB/s\n", rate);
+  }
+
+  { // simdutf
+    const uint64_t begin = nano();
+    uint64_t now = begin;
+    size_t rounds = 0;
+    do {
+      ++rounds;
+      isgood &= simdutf::validate_utf8(input, N);
+      now = nano();
+    } while (now - begin < budget_ns);
+    const double rate = (N * rounds) / double(now - begin);
+    printf("simdutf %f GB/s\n", rate);
+  }
+
+  { // is_utf8
+    const uint64_t begin = nano();
+    uint64_t now = begin;
+    size_t rounds = 0;
+    do {
+      ++rounds;
+      isgood &= is_utf8(input, N);
+      now = nano();
+    } while (now - begin < budget_ns);
+    const double rate = (N * rounds) / double(now - begin);
+    printf("is_utf8 %f GB/s\n", rate);
+  }
+
+  delete[] input;
+  printf("\n");
+  return isgood;
+}
+
+// Times the scalar baseline, simdutf and is_utf8 on random, well-formed UTF-8
+// of roughly N bytes, printing each throughput. Returns true if all accepted it.
+bool bench(size_t N) {
+  printf("random UTF-8\n");
+  printf("string size = %zu \n", N);
+  // populate_utf8 may write N + 3 bytes plus a NUL terminator: allocate N + 4.
+  char *input = new char[N + 4];
+  // Validate the length actually generated; cutting at N could split the
+  // final multi-byte sequence and make valid input look invalid.
+  const size_t len = populate_utf8(input, N);
+  volatile bool isgood{true};
+  const uint64_t threshold = 500000000; // minimum run time per validator, ns
+
+  {
+    uint64_t start = nano();
+    uint64_t finish = start;
+    size_t count{0};
+    for (; finish - start < threshold;) {
+      count++;
+      isgood &= basic_validate_utf8(input, len);
+      finish = nano();
+    }
+    double t = (len * count) / double(finish - start);
+    printf("basic_validate_utf8 %f GB/s\n", t);
+  }
+
+  {
+    uint64_t start = nano();
+    uint64_t finish = start;
+    size_t count{0};
+    for (; finish - start < threshold;) {
+      count++;
+      isgood &= simdutf::validate_utf8(input, len);
+      finish = nano();
+    }
+    double t = (len * count) / double(finish - start);
+    printf("simdutf %f GB/s\n", t);
+  }
+
+  {
+    uint64_t start = nano();
+    uint64_t finish = start;
+    size_t count{0};
+    for (; finish - start < threshold;) {
+      count++;
+      isgood &= is_utf8(input, len);
+      finish = nano();
+    }
+    double t = (len * count) / double(finish - start);
+    printf("is_utf8 %f GB/s\n", t);
+  }
+  delete[] input;
+  printf("\n");
+  return isgood;
+}
+
+int main() { // non-short-circuit '&' so that all six benchmarks always run
+  const bool random_ok = bench(40096) & bench(100000) & bench(50000);
+  const bool zero_ok = zerobuffer_bench(40096) & zerobuffer_bench(100000) & zerobuffer_bench(50000);
+  return (random_ok & zero_ok) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
\ No newline at end of file
diff --git a/deps/is_utf8/cmake/add_cpp_test.cmake b/deps/is_utf8/cmake/add_cpp_test.cmake
new file mode 100644
index 0000000..c4ce6a5
--- /dev/null
+++ b/deps/is_utf8/cmake/add_cpp_test.cmake
@@ -0,0 +1,59 @@
+# Helper so we don't have to repeat ourselves so much
+# Usage: add_cpp_test(testname [COMPILE_ONLY] [SOURCES a.cpp b.cpp ...] [LABELS acceptance per_implementation ...])
+# SOURCES defaults to testname.cpp if not specified.
+function(add_cpp_test TEST_NAME)
+ # Parse arguments
+ cmake_parse_arguments(PARSE_ARGV 1 ARGS "COMPILE_ONLY;LIBRARY;WILL_FAIL" "" "SOURCES;LABELS;DEPENDENCY_OF")
+ if (NOT ARGS_SOURCES)
+ list(APPEND ARGS_SOURCES ${TEST_NAME}.cpp)
+ endif()
+ if (ARGS_COMPILE_ONLY)
+ list(APPEND ${ARGS_LABELS} compile_only)
+ endif()
+ if (IS_UTF8_SANITIZE)
+ add_compile_options(-fsanitize=address -fno-omit-frame-pointer -fno-sanitize-recover=all)
+ add_compile_definitions(ASAN_OPTIONS=detect_leaks=1)
+ endif()
+ # Add the compile target
+ if (ARGS_LIBRARY)
+ add_library(${TEST_NAME} STATIC ${ARGS_SOURCES})
+ else(ARGS_LIBRARY)
+ add_executable(${TEST_NAME} ${ARGS_SOURCES})
+ endif(ARGS_LIBRARY)
+
+ # Add test
+ if (ARGS_COMPILE_ONLY OR ARGS_LIBRARY)
+ add_test(
+ NAME ${TEST_NAME}
+ COMMAND ${CMAKE_COMMAND} --build . --target ${TEST_NAME} --config $
+ WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+ )
+ set_target_properties(${TEST_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE EXCLUDE_FROM_DEFAULT_BUILD TRUE)
+ else()
+ add_test(${TEST_NAME} ${TEST_NAME})
+
+ # Add to