From 3ea0d36000e2314baba7e9fc6a97f08670a6f7e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 1 May 2024 17:52:55 +0200 Subject: [PATCH 01/10] Server: add tests for batch size, different seeds (#6950) --- .../server/tests/features/results.feature | 88 +++++++---- examples/server/tests/features/steps/steps.py | 148 ++++++++++++------ 2 files changed, 156 insertions(+), 80 deletions(-) diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature index f17120f7b9fbc..aa0b8d0c648b4 100644 --- a/examples/server/tests/features/results.feature +++ b/examples/server/tests/features/results.feature @@ -7,44 +7,16 @@ Feature: Results And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models And a model file test-model-00001-of-00003.gguf And 128 as batch size - And 256 KV cache size + And 1024 KV cache size And 128 max tokens to predict + And continuous batching - Scenario Outline: Multi users completion + Scenario Outline: consistent results with same seed Given slots - And continuous batching Then the server is starting Then the server is healthy - Given 42 as seed - And a prompt: - """ - Write a very long story about AI. - """ - - Given 42 as seed - And a prompt: - """ - Write a very long story about AI. - """ - - Given 42 as seed - And a prompt: - """ - Write a very long story about AI. - """ - - Given 42 as seed - And a prompt: - """ - Write a very long story about AI. - """ - - Given 42 as seed - And a prompt: - """ - Write a very long story about AI. - """ + Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42 Given concurrent completion requests Then the server is busy @@ -55,3 +27,55 @@ Feature: Results | n_slots | | 1 | | 2 | + + Scenario Outline: different results with different seed + Given slots + Then the server is starting + Then the server is healthy + + Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42 + Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43 + Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44 + Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45 + + Given concurrent completion requests + Then the server is busy + Then the server is idle + And all slots are idle + Then all predictions are different + Examples: + | n_slots | + | 1 | + | 2 | + + Scenario Outline: consistent results with same seed and varying batch size + Given 4 slots + And temperature + # And 0 as draft + Then the server is starting + Then the server is healthy + + Given 1 prompts "Write a very long story about AI." with seed 42 + And concurrent completion requests + # Then the server is busy # Not all slots will be utilized. + Then the server is idle + And all slots are idle + + Given prompts "Write a very long story about AI." with seed 42 + And concurrent completion requests + # Then the server is busy # Not all slots will be utilized. + Then the server is idle + And all slots are idle + + Then all predictions are equal + Examples: + | n_parallel | temp | + | 1 | 0.0 | + | 2 | 0.0 | + | 4 | 0.0 | + | 1 | 1.0 | + # FIXME: These tests fail on master. The problem seems to be the unified KV cache. + # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227 + # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 . + # | 2 | 1.0 | + # | 4 | 1.0 | diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index f71e0d706cca9..b8dbef21d1b76 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -65,6 +65,7 @@ def step_server_config(context, server_fqdn, server_port): context.server_seed = None context.user_api_key = None context.response_format = None + context.temperature = None context.tasks_result = [] context.concurrent_tasks = [] @@ -232,15 +233,17 @@ async def step_all_slots_status(context, expected_slot_status_string): @async_run_until_complete async def step_request_completion(context, api_error): expect_api_error = api_error == 'raised' + seeds = await completions_seed(context, num_seeds=1) completion = await request_completion(context.prompts.pop(), + seeds[0] if seeds is not None else seeds, context.base_url, debug=context.debug, n_predict=context.n_predict, cache_prompt=context.cache_prompt, id_slot=context.id_slot, - seed=await completions_seed(context), expect_api_error=expect_api_error, - user_api_key=context.user_api_key) + user_api_key=context.user_api_key, + temperature=context.temperature) context.tasks_result.append(completion) if context.debug: print(f"Completion response: {completion}") @@ -269,6 +272,15 @@ async def step_predictions_equal(context): context.tasks_result = [] +@step('all predictions are different') +@async_run_until_complete +async def step_predictions_equal(context): + n_completions = await gather_tasks_results(context) + assert n_completions >= 2, "need at least 2 completions" + assert_all_predictions_different(context.tasks_result) + context.tasks_result = [] + + @step('the completion is truncated') def step_assert_completion_truncated(context): step_assert_completion_truncated(context, '') @@ -311,6 +323,11 @@ def step_response_format(context, response_format): context.response_format = json.loads(response_format) +@step('{temperature:f} temperature') +def step_temperature(context, temperature): + context.temperature = temperature + + @step('streaming is {enable_streaming}') def step_streaming(context, enable_streaming): context.enable_streaming = enable_streaming == 'enabled' @@ -353,7 +370,10 @@ def step_n_ubatch(context, n_ubatch): @step('{seed:d} as seed') def step_seed(context, seed): - context.seed = seed + if context.seed is None: + context.seed = [seed] + else: + context.seed.append(seed) @step('a prefix prompt') @@ -413,7 +433,9 @@ async def step_oai_chat_completions(context, api_error): if context.debug: print(f"Submitting OAI compatible completions request...") expect_api_error = api_error == 'raised' + seeds = await completions_seed(context, num_seeds=1), completion = await oai_chat_completions(context.prompts.pop(), + seeds[0] if seeds is not None else seeds, context.system_prompt, context.base_url, '/v1/chat', @@ -429,8 +451,6 @@ async def step_oai_chat_completions(context, api_error): response_format=context.response_format if hasattr(context, 'response_format') else None, - seed=await completions_seed(context), - user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None, @@ -457,20 +477,31 @@ def step_a_prompt_prompt(context, prompt): context.n_prompts = len(context.prompts) +@step('{num_prompts:d} prompts {prompt} with seed {seed:d}') +def step_many_prompts(context, num_prompts, prompt, seed): + if context.seed is None: + context.seed = [] + for _ in range(num_prompts): + context.seed.append(seed) + context.prompts.append(prompt) + context.n_prompts = len(context.prompts) + + @step('concurrent completion requests') @async_run_until_complete() async def step_concurrent_completion_requests(context): - await concurrent_requests(context, - request_completion, - # prompt is inserted automatically - context.base_url, - debug=context.debug, - prompt_prefix=context.prompt_prefix, - prompt_suffix=context.prompt_suffix, - n_predict=context.n_predict if hasattr(context, 'n_predict') else None, - seed=await completions_seed(context), - user_api_key=context.user_api_key if hasattr(context, - 'user_api_key') else None) + await concurrent_requests( + context, + request_completion, + # prompt is inserted automatically + context.base_url, + debug=context.debug, + prompt_prefix=context.prompt_prefix, + prompt_suffix=context.prompt_suffix, + n_predict=context.n_predict if hasattr(context, 'n_predict') else None, + user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None, + temperature=context.temperature, + ) @step('concurrent OAI completions requests') @@ -490,7 +521,6 @@ async def step_oai_chat_completions(context): if hasattr(context, 'enable_streaming') else None, response_format=context.response_format if hasattr(context, 'response_format') else None, - seed=await completions_seed(context), user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None) @@ -512,10 +542,6 @@ async def step_oai_chat_completions(context): if hasattr(context, 'enable_streaming') else None, response_format=context.response_format if hasattr(context, 'response_format') else None, - seed=context.seed - if hasattr(context, 'seed') else - context.server_seed - if hasattr(context, 'server_seed') else None, user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None) @@ -544,7 +570,7 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None): @async_run_until_complete async def step_compute_embedding(context): context.n_prompts = 1 - context.embeddings = await request_embedding(context_text(context), base_url=context.base_url) + context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url) @step('all embeddings are the same') @@ -585,7 +611,7 @@ def step_assert_embeddings(context): @async_run_until_complete async def step_oai_compute_embeddings(context): context.n_prompts = 1 - context.embeddings = await request_oai_embeddings(context_text(context), + context.embeddings = await request_oai_embeddings(context_text(context), None, base_url=context.base_url, user_api_key=context.user_api_key, model=context.model) @@ -594,7 +620,7 @@ async def step_oai_compute_embeddings(context): @step('an OAI compatible embeddings computation request for multiple inputs') @async_run_until_complete async def step_oai_compute_embeddings_multiple_inputs(context): - context.embeddings = await request_oai_embeddings(context.prompts, + context.embeddings = await request_oai_embeddings(context.prompts, None, base_url=context.base_url, user_api_key=context.user_api_key, model=context.model) @@ -740,8 +766,9 @@ async def concurrent_requests(context, f_completion, *args, **kwargs): if context.debug: print(f"starting {context.n_prompts} concurrent completion requests...") assert context.n_prompts > 0 + seeds = await completions_seed(context) for prompt_no in range(context.n_prompts): - shifted_args = [context.prompts.pop(), *args] + shifted_args = [context.prompts.pop(), seeds[prompt_no], *args] context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) await asyncio.sleep(0.1) @@ -781,6 +808,7 @@ def step_server_responds_with_status_code(context, status_code): async def request_completion(prompt, + seed, base_url, debug=False, prompt_prefix=None, @@ -788,9 +816,9 @@ async def request_completion(prompt, n_predict=None, cache_prompt=False, id_slot=None, - seed=None, expect_api_error=None, - user_api_key=None): + user_api_key=None, + temperature=None): if debug: print(f"Sending completion request: {prompt}") origin = "my.super.domain" @@ -811,7 +839,8 @@ async def request_completion(prompt, "n_predict": n_predict if n_predict is not None else -1, "cache_prompt": cache_prompt, "id_slot": id_slot, - "seed": seed if seed is not None else 42 + "seed": seed if seed is not None else 42, + "temperature": temperature if temperature is not None else "0.8f", }, headers=headers, timeout=3600) as response: @@ -824,6 +853,7 @@ async def request_completion(prompt, async def oai_chat_completions(user_prompt, + seed, system_prompt, base_url, base_path, @@ -833,7 +863,6 @@ async def oai_chat_completions(user_prompt, n_predict=None, enable_streaming=None, response_format=None, - seed=None, user_api_key=None, expect_api_error=None): if debug: @@ -952,7 +981,7 @@ async def oai_chat_completions(user_prompt, return completion_response -async def request_embedding(content, base_url=None): +async def request_embedding(content, seed, base_url=None): async with aiohttp.ClientSession() as session: async with session.post(f'{base_url}/embedding', json={ @@ -963,7 +992,7 @@ async def request_embedding(content, base_url=None): return [response_json['embedding']] -async def request_oai_embeddings(input, +async def request_oai_embeddings(input, seed, base_url=None, user_api_key=None, model=None, async_client=False): # openai client always expects an api_key @@ -1036,21 +1065,31 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re f' {n_predicted} <> {expected_predicted_n}') def assert_all_predictions_equal(completion_responses): - content_0 = completion_responses[0]['content'] - if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': - print(f"content 0: {content_0}") - - i = 1 - for response in completion_responses[1:]: - content = response['content'] - - if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': - print(f"content {i}: {content}") - - assert content == content_0, "contents not equal" - - i += 1 + for i, response_i in enumerate(completion_responses): + content_i = response_i['content'] + print(f"content {i}: {content_i}") + for i, response_i in enumerate(completion_responses): + content_i = response_i['content'] + for j, response_j in enumerate(completion_responses): + if i == j: + continue + content_j = response_j['content'] + assert content_i == content_j, "contents not equal" + + +def assert_all_predictions_different(completion_responses): + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + for i, response_i in enumerate(completion_responses): + content_i = response_i['content'] + print(f"content {i}: {content_i}") + for i, response_i in enumerate(completion_responses): + content_i = response_i['content'] + for j, response_j in enumerate(completion_responses): + if i == j: + continue + content_j = response_j['content'] + assert content_i != content_j, "contents not different" async def gather_tasks_results(context): @@ -1145,9 +1184,22 @@ def assert_slots_status(slots, expected_slots): f" = {expected[key]} != {slot[key]}") -async def completions_seed(context): - return context.seed if hasattr(context, 'seed') and context.seed is not None \ - else context.server_seed if hasattr(context, 'server_seed') else None +async def completions_seed(context, num_seeds=None): + if hasattr(context, "seed") and context.seed is not None: + assert len(context.seed) == context.n_prompts + if num_seeds is None: + num_seeds = context.n_prompts + assert num_seeds <= context.n_prompts + seeds = context.seed[:num_seeds] + context.seed = context.seed[num_seeds:] if num_seeds < context.n_prompts else None + return seeds + + if hasattr(context, "server_seed") and context.server_seed is not None: + if num_seeds is None: + return [context.server_seed] * context.n_prompts + else: + return [context.server_seed] * num_seeds + return None def context_text(context): From 8d608a81b7bd170f700648f8214e6f3279d4d715 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Thu, 2 May 2024 04:27:41 +0900 Subject: [PATCH 02/10] main : fix off by one error for context shift (#6921) --- examples/main/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 5c693657c8993..eabbc2db38286 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -544,7 +544,7 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (n_past + (int) embd.size() + std::max(0, guidance_offset) >= n_ctx) { if (params.n_predict == -2) { LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; From b0d943de179ad5dbd83d51f327fb566066f4ccda Mon Sep 17 00:00:00 2001 From: Andrew Downing Date: Wed, 1 May 2024 17:31:30 -0400 Subject: [PATCH 03/10] Update LOG_IMPL and LOG_TEE_IMPL (#7029) ROCm clang defines _MSC_VER which results in the wrong implementation of LOG_IMPL and LOG_TEE_IMPL being compiled. This fixes https://github.com/ggerganov/llama.cpp/issues/6972 --- common/log.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/log.h b/common/log.h index 2b2f0e4551d7a..6934c57b2225c 100644 --- a/common/log.h +++ b/common/log.h @@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: // INTERNAL, DO NOT USE // USE LOG() INSTEAD // -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) +#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) #define LOG_IMPL(str, ...) \ do { \ if (LOG_TARGET != nullptr) \ @@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: // INTERNAL, DO NOT USE // USE LOG_TEE() INSTEAD // -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) +#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) #define LOG_TEE_IMPL(str, ...) \ do { \ if (LOG_TARGET != nullptr) \ From 6ecf3189e00a1e8e737a78b6d10e1d7006e050a2 Mon Sep 17 00:00:00 2001 From: alwqx Date: Thu, 2 May 2024 23:56:41 +0800 Subject: [PATCH 04/10] chore: fix typo in llama.cpp (#7032) Co-authored-by: Jared Van Bortel --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 18d6297ce1dfd..18b49ec20909e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2359,7 +2359,7 @@ static bool llama_kv_cache_init( cache.recurrent = model.arch == LLM_ARCH_MAMBA; cache.v_trans = !cparams.flash_attn; - // TODO: support mixed reccurent Transformer architectues + // TODO: support mixed recurrent Transformer architectures // NOTE: (!a || b) is a logical implication (a -> b) GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s()); GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s()); From 60325fa56f61c228464c9f065db3aa6a61f2156e Mon Sep 17 00:00:00 2001 From: Bartowski Date: Thu, 2 May 2024 19:49:09 -0400 Subject: [PATCH 05/10] Remove .attention from skipped tensors to match more accurately (#7051) --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 2f146d7302a78..612aea173644b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1427,7 +1427,7 @@ def write_tensors(self): experts = dict() for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype From 433def286e98751bf17db75dce53847d075c0be5 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 3 May 2024 15:24:30 +0200 Subject: [PATCH 06/10] llama : rename ctx to user_data in progress_callback (#7045) * llama : rename ctx to user_data in progress_callback This commit renames the `ctx` parameter to `user_data` in the `llama_progress_callback` typedef. The motivation for this is that other callbacks use `user_data` or `data`, and using `ctx` in this case might be confusing as it could be confused with `llama_context`. --------- Signed-off-by: Daniel Bevenius --- llama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.h b/llama.h index 059d78f115c6d..790339af93083 100644 --- a/llama.h +++ b/llama.h @@ -171,7 +171,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef bool (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void * user_data); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences From a2ac89d6efb41b535778bfeaecaae8fe295b6ed3 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 4 May 2024 05:36:41 +1000 Subject: [PATCH 07/10] convert.py : add python logging instead of print() (#6511) * convert.py: add python logging instead of print() * convert.py: verbose flag takes priority over dump flag log suppression * convert.py: named instance logging * convert.py: use explicit logger id string * convert.py: convert extra print() to named logger * convert.py: sys.stderr.write --> logger.error * *.py: Convert all python scripts to use logging module * requirements.txt: remove extra line * flake8: update flake8 ignore and exclude to match ci settings * gh-actions: add flake8-no-print to flake8 lint step * pre-commit: add flake8-no-print to flake8 and also update pre-commit version * convert-hf-to-gguf.py: print() to logger conversion * *.py: logging basiconfig refactor to use conditional expression * *.py: removed commented out logging * fixup! *.py: logging basiconfig refactor to use conditional expression * constant.py: logger.error then exit should be a raise exception instead * *.py: Convert logger error and sys.exit() into a raise exception (for atypical error) * gguf-convert-endian.py: refactor convert_byteorder() to use tqdm progressbar * verify-checksum-model.py: This is the result of the program, it should be printed to stdout. * compare-llama-bench.py: add blank line for readability during missing repo response * reader.py: read_gguf_file() use print() over logging * convert.py: warning goes to stderr and won't hurt the dump output * gguf-dump.py: dump_metadata() should print to stdout * convert-hf-to-gguf.py: print --> logger.debug or ValueError() * verify-checksum-models.py: use print() for printing table * *.py: refactor logging.basicConfig() * gguf-py/gguf/*.py: use __name__ as logger name Since they will be imported and not run directly. * python-lint.yml: use .flake8 file instead * constants.py: logger no longer required * convert-hf-to-gguf.py: add additional logging * convert-hf-to-gguf.py: print() --> logger * *.py: fix flake8 warnings * revert changes to convert-hf-to-gguf.py for get_name() * convert-hf-to-gguf-update.py: use triple quoted f-string instead * *.py: accidentally corrected the wrong line * *.py: add compilade warning suggestions and style fixes --- .flake8 | 3 +- .github/workflows/python-lint.yml | 3 +- .pre-commit-config.yaml | 5 +- convert-hf-to-gguf-update.py | 157 +++++++------- convert-hf-to-gguf.py | 271 +++++++++++-------------- convert-llama-ggml-to-gguf.py | 51 ++--- convert-lora-to-ggml.py | 31 +-- convert-persimmon-to-gguf.py | 28 +-- convert.py | 60 +++--- ggml_vk_generate_shaders.py | 12 +- gguf-py/examples/reader.py | 18 +- gguf-py/gguf/constants.py | 4 +- gguf-py/gguf/gguf_reader.py | 4 +- gguf-py/gguf/gguf_writer.py | 5 +- gguf-py/gguf/vocab.py | 42 ++-- gguf-py/scripts/gguf-convert-endian.py | 94 +++++---- gguf-py/scripts/gguf-dump.py | 29 ++- gguf-py/scripts/gguf-set-metadata.py | 31 +-- scripts/compare-llama-bench.py | 40 ++-- scripts/run-with-preset.py | 14 +- scripts/verify-checksum-models.py | 13 +- tests/test-tokenizer-0-bpe.py | 41 ++-- tests/test-tokenizer-0-spm.py | 66 +++--- 23 files changed, 538 insertions(+), 484 deletions(-) diff --git a/.flake8 b/.flake8 index 18fba2c1574a6..608eb8eed374c 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,4 @@ [flake8] max-line-length = 125 -ignore = W503 +ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503 +exclude = examples/*,examples/*/**,*/**/__init__.py diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index 5be17f1576ebb..a8d46f31dd4f5 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -20,5 +20,4 @@ jobs: - name: flake8 Lint uses: py-actions/flake8@v2 with: - ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" - exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py" + plugins: "flake8-no-print" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 65796fe2e423b..91d7916285081 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,13 +3,14 @@ exclude: prompts/.*.txt repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - id: check-added-large-files - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.0.0 hooks: - id: flake8 + additional_dependencies: [flake8-no-print] diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index b019c1e3dc59f..09772f668e2b9 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -21,6 +21,7 @@ # TODO: automate the update of convert-hf-to-gguf.py # +import logging import os import requests import sys @@ -28,12 +29,17 @@ from hashlib import sha256 from enum import IntEnum, auto +from transformers import AutoTokenizer + +logger = logging.getLogger("convert-hf-to-gguf-update") + class TOKENIZER_TYPE(IntEnum): SPM = auto() BPE = auto() WPM = auto() + # TODO: this string has to exercise as much pre-tokenizer functionality as possible # will be updated with time - contributions welcome chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' @@ -41,36 +47,38 @@ class TOKENIZER_TYPE(IntEnum): if len(sys.argv) == 2: token = sys.argv[1] else: - print("Usage: python convert-hf-to-gguf-update.py ") + logger.info("Usage: python convert-hf-to-gguf-update.py ") sys.exit(1) # TODO: add models here, base models preferred models = [ - { "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, - { "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, - { "name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, - { "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, - { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, - { "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, - { "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, - { "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, - { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, - { "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, - ] + {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, + {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, + {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, + {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, + {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, + {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, + {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, + {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, + {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, + {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, +] # make directory "models/tokenizers" if it doesn't exist if not os.path.exists("models/tokenizers"): os.makedirs("models/tokenizers") + def download_file_with_auth(url, token, save_path): headers = {"Authorization": f"Bearer {token}"} response = requests.get(url, headers=headers) if response.status_code == 200: with open(save_path, 'wb') as f: f.write(response.content) - print(f"File {save_path} downloaded successfully") + logger.info(f"File {save_path} downloaded successfully") else: - print(f"Failed to download file. Status code: {response.status_code}") + logger.info(f"Failed to download file. Status code: {response.status_code}") + # download the tokenizer models for model in models: @@ -81,10 +89,10 @@ def download_file_with_auth(url, token, save_path): if not os.path.exists(f"models/tokenizers/{name}"): os.makedirs(f"models/tokenizers/{name}") else: - print(f"Directory models/tokenizers/{name} already exists - skipping") + logger.info(f"Directory models/tokenizers/{name} already exists - skipping") continue - print(f"Downloading {name} to models/tokenizers/{name}") + logger.info(f"Downloading {name} to models/tokenizers/{name}") url = f"{repo}/raw/main/config.json" save_path = f"models/tokenizers/{name}/config.json" @@ -115,76 +123,76 @@ def download_file_with_auth(url, token, save_path): continue # create the tokenizer - from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() - print(f"model: {name}") - print(f"tokt: {tokt}") - print(f"repo: {model['repo']}") - print(f"chktok: {chktok}") - print(f"chkhsh: {chkhsh}") + logger.info(f"model: {name}") + logger.info(f"tokt: {tokt}") + logger.info(f"repo: {model['repo']}") + logger.info(f"chktok: {chktok}") + logger.info(f"chkhsh: {chkhsh}") # print the "pre_tokenizer" content from the tokenizer.json with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: cfg = json.load(f) pre_tokenizer = cfg["pre_tokenizer"] - print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) + logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) - print(f"\n") + logger.info("") src_ifs += f" if chkhsh == \"{chkhsh}\":\n" src_ifs += f" # ref: {model['repo']}\n" src_ifs += f" res = \"{name}\"\n" -src_func = "" -src_func += " def get_vocab_base_pre(self, tokenizer) -> str:\n" -src_func += " # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n" -src_func += " # is specific for the BPE pre-tokenizer used by the model\n" -src_func += " # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n" -src_func += " # use in llama.cpp to implement the same pre-tokenizer\n" -src_func += "\n" -src_func += f" chktxt = {repr(chktxt)}\n" -src_func += "\n" -src_func += " chktok = tokenizer.encode(chktxt)\n" -src_func += " chkhsh = sha256(str(chktok).encode()).hexdigest()\n" -src_func += "\n" -src_func += " print(f\"chktok: {chktok}\")\n" -src_func += " print(f\"chkhsh: {chkhsh}\")\n" -src_func += "\n" -src_func += " res = None\n" -src_func += "\n" -src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n" -src_func += " # or pull the latest version of the model from Huggingface\n" -src_func += " # don't edit the hashes manually!\n" -src_func += f"{src_ifs}\n" -src_func += " if res is None:\n" -src_func += " print(\"\\n\")\n" -src_func += " print(\"**************************************************************************************\")\n" -src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n" -src_func += " print(\"** There are 2 possible reasons for this:\")\n" -src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n" -src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n" -src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n" -src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n" -src_func += " print(\"**\")\n" -src_func += " print(f\"** chkhsh: {chkhsh}\")\n" -src_func += " print(\"**************************************************************************************\")\n" -src_func += " print(\"\\n\")\n" -src_func += " raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n" -src_func += "\n" -src_func += " print(f\"tokenizer.ggml.pre: {res}\")\n" -src_func += " print(f\"chkhsh: {chkhsh}\")\n" -src_func += "\n" -src_func += " return res\n" - -print(src_func) - -print("\n") -print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!") -print("\n") +src_func = f""" + def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer + + chktxt = {repr(chktxt)} + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + print(f"chktok: {{chktok}}") + print(f"chkhsh: {{chkhsh}}") + + res = None + + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! +{src_ifs} + if res is None: + print("\\n") + print("**************************************************************************************") + print("** WARNING: The BPE pre-tokenizer was not recognized!") + print("** There are 2 possible reasons for this:") + print("** - the model has not been added to convert-hf-to-gguf-update.py yet") + print("** - the pre-tokenization config has changed upstream") + print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + print("**") + print(f"** chkhsh: {{chkhsh}}") + print("**************************************************************************************") + print("\\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + print(f"tokenizer.ggml.pre: {{repr(res)}}") + print(f"chkhsh: {{chkhsh}}") + + return res +""" + +print(src_func) # noqa: NP100 + +logger.info("\n") +logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!") +logger.info("\n") # generate tests for each tokenizer model @@ -250,7 +258,6 @@ def download_file_with_auth(url, token, save_path): tokt = model["tokt"] # create the tokenizer - from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f: @@ -265,15 +272,15 @@ def download_file_with_auth(url, token, save_path): f.write(f" {r}") f.write("\n") - print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*") + logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*") # generate commands for creating vocab files -print("\nRun the following commands to generate the vocab files for testing:\n") +logger.info("\nRun the following commands to generate the vocab files for testing:\n") for model in models: name = model["name"] - print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") + logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") -print("\n") +logger.info("\n") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 612aea173644b..5293262885158 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2,6 +2,7 @@ from __future__ import annotations +import logging import argparse import contextlib import json @@ -26,6 +27,8 @@ from convert import LlamaHfVocab, permute +logger = logging.getLogger("hf-to-gguf") + ###### MODEL DEFINITIONS ###### @@ -76,7 +79,7 @@ def set_vocab(self): def get_tensors(self) -> Iterator[tuple[str, Tensor]]: for part_name in self.part_names: - print(f"gguf: loading model part '{part_name}'") + logger.info(f"gguf: loading model part '{part_name}'") ctx: ContextManager[Any] if self.is_safetensors: from safetensors import safe_open @@ -95,42 +98,42 @@ def set_gguf_parameters(self): if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) - print(f"gguf: context length = {n_ctx}") + logger.info(f"gguf: context length = {n_ctx}") n_embd = self.find_hparam(["hidden_size", "n_embd"]) self.gguf_writer.add_embedding_length(n_embd) - print(f"gguf: embedding length = {n_embd}") + logger.info(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) - print(f"gguf: feed forward length = {n_ff}") + logger.info(f"gguf: feed forward length = {n_ff}") n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_head_count(n_head) - print(f"gguf: head count = {n_head}") + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) - print(f"gguf: key-value head count = {n_head_kv}") + logger.info(f"gguf: key-value head count = {n_head_kv}") if (rope_theta := self.hparams.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) - print(f"gguf: rope theta = {rope_theta}") + logger.info(f"gguf: rope theta = {rope_theta}") if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - print(f"gguf: rms norm epsilon = {f_rms_eps}") + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) - print(f"gguf: layer norm epsilon = {f_norm_eps}") + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") if (n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) - print(f"gguf: expert count = {n_experts}") + logger.info(f"gguf: expert count = {n_experts}") if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) - print(f"gguf: experts used count = {n_experts_used}") + logger.info(f"gguf: experts used count = {n_experts_used}") self.gguf_writer.add_file_type(self.ftype) - print(f"gguf: file type = {self.ftype}") + logger.info(f"gguf: file type = {self.ftype}") def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) @@ -151,8 +154,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -169,7 +171,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -274,8 +276,8 @@ def get_vocab_base_pre(self, tokenizer) -> str: chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() - print(f"chktok: {chktok}") - print(f"chkhsh: {chkhsh}") + logger.debug(f"chktok: {chktok}") + logger.debug(f"chkhsh: {chkhsh}") res = None @@ -308,22 +310,22 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = "gpt-2" if res is None: - print("\n") - print("**************************************************************************************") - print("** WARNING: The BPE pre-tokenizer was not recognized!") - print("** There are 2 possible reasons for this:") - print("** - the model has not been added to convert-hf-to-gguf-update.py yet") - print("** - the pre-tokenization config has changed upstream") - print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") - print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") - print("**") - print(f"** chkhsh: {chkhsh}") - print("**************************************************************************************") - print("\n") + logger.warning("\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {chkhsh}") + logger.warning("**************************************************************************************") + logger.warning("\n") raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - print(f"tokenizer.ggml.pre: {res}") - print(f"chkhsh: {chkhsh}") + logger.debug(f"tokenizer.ggml.pre: {res}") + logger.debug(f"chkhsh: {chkhsh}") return res @@ -439,9 +441,7 @@ def _set_vocab_sentencepiece(self): if vocab_size > len(tokens): pad_count = vocab_size - len(tokens) - print( - f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]" - ) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") for i in range(1, pad_count + 1): tokens.append(f"[PAD{i}]") scores.append(-1000.0) @@ -553,7 +553,7 @@ def write_tensors(self): ), axis=0, ) - print("re-format attention.linear_qkv.weight") + logger.info("re-format attention.linear_qkv.weight") elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) data = np.concatenate( @@ -564,13 +564,12 @@ def write_tensors(self): ), axis=0, ) - print("re-format attention.linear_qkv.bias") + logger.info("re-format attention.linear_qkv.bias") # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -587,13 +586,13 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.info(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) if not has_lm_head and name == "word_embeddings.weight": self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") @Model.register("MPTForCausalLM") @@ -653,8 +652,7 @@ def write_tensors(self): else: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -671,7 +669,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -697,8 +695,7 @@ def set_gguf_parameters(self): elif "model_max_length" in self.hparams: ctx_length = self.hparams["model_max_length"] else: - print("gguf: can not find ctx length parameter.") - sys.exit() + raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_name(self.dir_model.name) @@ -736,8 +733,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -754,7 +750,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -779,8 +775,7 @@ def set_gguf_parameters(self): elif "model_max_length" in self.hparams: ctx_length = self.hparams["model_max_length"] else: - print("gguf: can not find ctx length parameter.") - sys.exit() + raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_source_hf_repo(hf_repo) @@ -809,7 +804,7 @@ def write_tensors(self): for i in range(block_count): if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: - print(f"Unpacking and permuting layer {i}") + logger.info(f"Unpacking and permuting layer {i}") model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ self._reverse_hf_permute_part(w, 0, head_count, head_count) model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ @@ -834,8 +829,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -852,7 +846,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: @@ -937,8 +931,7 @@ def set_gguf_parameters(self): elif "model_max_length" in self.hparams: ctx_length = self.hparams["model_max_length"] else: - print("gguf: can not find ctx length parameter.") - sys.exit() + raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_source_hf_repo(hf_repo) @@ -987,8 +980,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1005,7 +997,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: @@ -1092,8 +1084,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1110,7 +1101,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1197,8 +1188,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1215,7 +1205,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1264,10 +1254,9 @@ def write_tensors(self): data = data_torch.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1332,8 +1321,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1350,7 +1338,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1366,8 +1354,7 @@ def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, l merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")): data = data.astype(np.float32) @@ -1375,7 +1362,7 @@ def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, l if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1480,10 +1467,9 @@ def write_tensors(self): new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) continue @@ -1491,8 +1477,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1509,7 +1494,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1584,10 +1569,9 @@ def write_tensors(self): new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) continue @@ -1595,8 +1579,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1613,7 +1596,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1646,7 +1629,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(1e-5) self.gguf_writer.add_file_type(self.ftype) - print(f"gguf: file type = {self.ftype}") + logger.info(f"gguf: file type = {self.ftype}") def write_tensors(self): block_count = self.hparams.get("n_layers") @@ -1689,8 +1672,7 @@ def write_tensors(self): # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1698,8 +1680,7 @@ def write_tensors(self): # Most of the codebase that takes in 1D tensors only handles F32 tensors # and most of the outputs tensors are F32. if data_dtype != np.float32 and n_dims == 1: - print(f"Can not map tensor {name!r}: all 1D tensors must be F32") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}: all 1D tensors must be F32") # if f32 desired, convert any float16 to float32 if self.ftype == 0 and data_dtype == np.float16: @@ -1709,7 +1690,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1771,8 +1752,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1789,7 +1769,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1855,8 +1835,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1873,7 +1852,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -1950,10 +1929,9 @@ def write_tensors(self): new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") - print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) continue @@ -1961,8 +1939,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -1979,7 +1956,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.debug(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2024,8 +2001,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2042,13 +2018,13 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) # note: GPT2 output is tied to (same as) wte in original model if new_name == "token_embd.weight": - print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor("output.weight", data) @@ -2087,8 +2063,7 @@ def set_vocab(self): tokenizer_path = self.dir_model / 'tokenizer.model' if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) + raise ValueError(f'Error: Missing {tokenizer_path}') tokenizer = SentencePieceProcessor(str(tokenizer_path)) @@ -2126,7 +2101,7 @@ def set_vocab(self): for key in added_tokens_json: token_id = added_tokens_json[key] if (token_id >= vocab_size): - print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue tokens[token_id] = key.encode("utf-8") @@ -2208,8 +2183,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") # shuffle for broadcasting of gqa in ggml_mul_mat if new_name.endswith("attn_q.weight"): @@ -2240,7 +2214,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2286,8 +2260,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2304,13 +2277,13 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) if not has_lm_head and name == "transformer.wte.weight": self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + logger.info(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") @Model.register("InternLM2ForCausalLM") @@ -2332,7 +2305,7 @@ def set_vocab(self): toktypes: list[int] = [] if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) sentencepiece_model = model.ModelProto() @@ -2349,7 +2322,7 @@ def set_vocab(self): if text == b"\x00": # (TODO): fixme # Hack here and replace the \x00 characters. - print(f"InternLM2 convert token '{text}' to '🐉'!") + logger.debug(f"InternLM2 convert token '{text}' to '🐉'!") text = "🐉" toktype = SentencePieceTokenTypes.NORMAL @@ -2390,7 +2363,7 @@ def set_vocab(self): # TODO: this is a hack, should be fixed # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ + logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ in chat mode so that the conversation can end normally.") special_vocab.add_to_gguf(self.gguf_writer) @@ -2435,8 +2408,7 @@ def post_write_tensors(self, tensor_map, name, data_torch): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2453,7 +2425,7 @@ def post_write_tensors(self, tensor_map, name, data_torch): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) def write_tensors(self): @@ -2564,8 +2536,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") # convert any unsupported data types to float32 if data_torch.dtype not in (torch.float16, torch.float32): @@ -2585,7 +2556,7 @@ def write_tensors(self): # if f32 desired, convert any float16 to float32 new_dtype = np.float32 - print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") if data.dtype != new_dtype: data = data.astype(new_dtype) @@ -2664,7 +2635,7 @@ def write_tensors(self): # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": - print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") continue old_dtype = data_torch.dtype @@ -2681,8 +2652,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2693,7 +2663,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2721,7 +2691,7 @@ def set_vocab(self): else: # Use the GPT-NeoX tokenizer when no tokenizer files are present tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") neox_reader = gguf.GGUFReader(tokenizer_path, "r") field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) @@ -2793,17 +2763,16 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") if name.endswith(".A_log"): - print("A_log --> A ==> " + new_name) + logger.debug("A_log --> A ==> " + new_name) data_torch = -torch.exp(data_torch) # assuming token_embd.weight is seen before output.weight if tok_embd is not None and new_name == output_name: if torch.equal(tok_embd, data_torch): - print(f"{output_name} is equivalent to {tok_embd_name}, omitting") + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") continue if new_name == tok_embd_name: tok_embd = data_torch @@ -2826,7 +2795,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2885,8 +2854,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + raise ValueError(f"Can not map tensor {name!r}") n_dims = len(data.shape) data_dtype = data.dtype @@ -2903,7 +2871,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2: data = data.astype(np.float16) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + logger.info(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -2936,6 +2904,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)") parser.add_argument("--model-name", type=str, default=None, help="name of the model") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") return parser.parse_args() @@ -2943,6 +2912,8 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + dir_model = args.model if args.awq_path: @@ -2951,15 +2922,15 @@ def main() -> None: tmp_model_path = args.model / "weighted_model" dir_model = tmp_model_path if tmp_model_path.is_dir(): - print(f"{tmp_model_path} exists as a weighted model.") + logger.info(f"{tmp_model_path} exists as a weighted model.") else: tmp_model_path.mkdir(parents=True, exist_ok=True) - print("Saving new weighted model ...") + logger.info("Saving new weighted model ...") add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - print(f"Saved weighted model at {tmp_model_path}.") + logger.info(f"Saved weighted model at {tmp_model_path}.") if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file=sys.stderr) + logger.error(f'Error: {args.model} is not a directory') sys.exit(1) ftype_map = { @@ -2973,7 +2944,7 @@ def main() -> None: # output in the same directory as the model by default fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' - print(f"Loading model: {dir_model.name}") + logger.info(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) @@ -2981,20 +2952,20 @@ def main() -> None: model_class = Model.from_model_architecture(hparams["architectures"][0]) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file) - print("Set model parameters") + logger.info("Set model parameters") model_instance.set_gguf_parameters() - print("Set model tokenizer") + logger.info("Set model tokenizer") model_instance.set_vocab() if args.vocab_only: - print(f"Exporting model vocab to '{fname_out}'") + logger.info(f"Exporting model vocab to '{fname_out}'") model_instance.write_vocab() else: - print(f"Exporting model to '{fname_out}'") + logger.info(f"Exporting model to '{fname_out}'") model_instance.write() - print(f"Model successfully exported to '{fname_out}'") + logger.info(f"Model successfully exported to '{fname_out}'") if __name__ == '__main__': diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index 5354b748b259f..9349de3b3b498 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import struct @@ -14,6 +15,8 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +logger = logging.getLogger("ggml-to-gguf") + class GGMLFormat(IntEnum): GGML = 0 @@ -125,7 +128,6 @@ def load(self, data, offset): self.start_offset = offset self.len_bytes = n_bytes offset += n_bytes - # print(n_dims, name_len, dtype, self.dims, self.name, pad) return offset - orig_offset @@ -175,7 +177,7 @@ def load(self, data, offset): offset += self.validate_header(data, offset) hp = Hyperparameters() offset += hp.load(data, offset) - print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') + logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') self.validate_conversion(hp.ftype) vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML) offset += vocab.load(data, offset, hp.n_vocab) @@ -215,12 +217,12 @@ def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override if float(hp.n_head) / float(x) == gqa: n_kv_head = x assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" - print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') + logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') self.n_kv_head = n_kv_head self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) def save(self): - print('* Preparing to save GGUF file') + logger.info('* Preparing to save GGUF file') gguf_writer = gguf.GGUFWriter( self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], @@ -230,11 +232,11 @@ def save(self): if self.special_vocab is not None: self.special_vocab.add_to_gguf(gguf_writer) self.add_tensors(gguf_writer) - print(" gguf: write header") + logger.info(" gguf: write header") gguf_writer.write_header_to_file() - print(" gguf: write metadata") + logger.info(" gguf: write metadata") gguf_writer.write_kv_data_to_file() - print(" gguf: write tensors") + logger.info(" gguf: write tensors") gguf_writer.write_tensors_to_file() gguf_writer.close() @@ -250,7 +252,7 @@ def add_params(self, gguf_writer): name = cfg.name if cfg.name is not None else cfg.input.name except UnicodeDecodeError: name = None - print('* Adding model parameters and KV items') + logger.info('* Adding model parameters and KV items') if name is not None: gguf_writer.add_name(name) gguf_writer.add_description(desc) @@ -287,7 +289,7 @@ def add_vocab(self, gguf_writer): toktypes = [] if self.vocab_override is not None: vo = self.vocab_override - print('* Adding vocab item(s)') + logger.info('* Adding vocab item(s)') for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): tokens.append(vbytes) scores.append(score) @@ -299,7 +301,7 @@ def add_vocab(self, gguf_writer): if len(toktypes) > 0: gguf_writer.add_token_types(toktypes) return - print(f'* Adding {hp.n_vocab} vocab item(s)') + logger.info(f'* Adding {hp.n_vocab} vocab item(s)') assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): tt = 1 # Normal @@ -334,7 +336,7 @@ def add_vocab(self, gguf_writer): def add_tensors(self, gguf_writer): tensor_map = self.name_map data = self.data - print(f'* Adding {len(self.model.tensors)} tensor(s)') + logger.info(f'* Adding {len(self.model.tensors)} tensor(s)') for tensor in self.model.tensors: name = str(tensor.name, 'UTF-8') mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) @@ -344,7 +346,6 @@ def add_tensors(self, gguf_writer): temp = tempdims[1] tempdims[1] = tempdims[0] tempdims[0] = temp - # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}') gguf_writer.add_tensor( mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], @@ -401,33 +402,35 @@ def handle_args(): help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") parser.add_argument("--vocabtype", default="spm,hfft", help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") return parser.parse_args() def main(): cfg = handle_args() - print(f'* Using config: {cfg}') - print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n') + logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO) + logger.info(f'* Using config: {cfg}') + logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===') if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): - print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') + logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() - print('* Scanning GGML input file') + logger.info('* Scanning GGML input file') offset = model.load(data, 0) # noqa - print(f'* GGML model hyperparameters: {model.hyperparameters}') + logger.info(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None special_vocab = None if cfg.model_metadata_dir is not None: (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters) - print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') - print(f'* Overriding params: {params_override}') - print(f'* Overriding vocab: {vocab_override}') - print(f'* Special vocab: {special_vocab}') + logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') + logger.info(f'* Overriding params: {params_override}') + logger.info(f'* Overriding vocab: {vocab_override}') + logger.info(f'* Special vocab: {special_vocab}') else: - print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') + logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') if model.file_format == GGMLFormat.GGML: - print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') + logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') converter = GGMLToGGUF( model, data, cfg, params_override = params_override, @@ -435,7 +438,7 @@ def main(): special_vocab = special_vocab ) converter.save() - print(f'* Successful completion. Output saved to: {cfg.output}') + logger.info(f'* Successful completion. Output saved to: {cfg.output}') if __name__ == '__main__': diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 9a9936dec8b0a..39536feb954b4 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import json import os import struct @@ -15,6 +16,8 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) import gguf +logger = logging.getLogger("lora-to-gguf") + NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} @@ -48,11 +51,9 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty if __name__ == '__main__': if len(sys.argv) < 2: - print(f"Usage: python {sys.argv[0]} [arch]") - print( - "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" - ) - print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") + logger.info(f"Usage: python {sys.argv[0]} [arch]") + logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'") + logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") sys.exit(1) input_json = os.path.join(sys.argv[1], "adapter_config.json") @@ -70,7 +71,7 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" if arch_name not in gguf.MODEL_ARCH_NAMES.values(): - print(f"Error: unsupported architecture {arch_name}") + logger.error(f"Error: unsupported architecture {arch_name}") sys.exit(1) arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] @@ -80,21 +81,21 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty params = json.load(f) if params["peft_type"] != "LORA": - print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") + logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") sys.exit(1) if params["fan_in_fan_out"] is True: - print("Error: param fan_in_fan_out is not supported") + logger.error("Error: param fan_in_fan_out is not supported") sys.exit(1) if params["bias"] is not None and params["bias"] != "none": - print("Error: param bias is not supported") + logger.error("Error: param bias is not supported") sys.exit(1) # TODO: these seem to be layers that have been trained but without lora. # doesn't seem widely used but eventually should be supported if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: - print("Error: param modules_to_save is not supported") + logger.error("Error: param modules_to_save is not supported") sys.exit(1) with open(output_path, "wb") as fout: @@ -125,13 +126,13 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty suffix = k[-len(lora_suffixes[0]):] k = k[: -len(lora_suffixes[0])] else: - print(f"Error: unrecognized tensor name {orig_k}") + logger.error(f"Error: unrecognized tensor name {orig_k}") sys.exit(1) tname = name_map.get_name(k) if tname is None: - print(f"Error: could not map tensor name {orig_k}") - print(" Note: the arch parameter must be specified if the model is not llama") + logger.error(f"Error: could not map tensor name {orig_k}") + logger.error(" Note: the arch parameter must be specified if the model is not llama") sys.exit(1) if suffix == ".lora_A.weight": @@ -141,8 +142,8 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty else: assert False - print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") + logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) t.tofile(fout) - print(f"Converted {input_json} and {input_model} to {output_path}") + logger.info(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index aba575426b492..07dcade747a5a 100755 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import sys @@ -14,6 +15,8 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +logger = logging.getLogger("persimmon-to-gguf") + def _flatten_dict(dct, tensors, prefix=None): assert isinstance(dct, dict) @@ -30,9 +33,9 @@ def _flatten_dict(dct, tensors, prefix=None): def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' - print('gguf: getting sentencepiece tokenizer from', tokenizer_path) + logger.info('getting sentencepiece tokenizer from', tokenizer_path) tokenizer = SentencePieceProcessor(str(tokenizer_path)) - print('gguf: adding tokens') + logger.info('adding tokens') tokens: list[bytes] = [] scores: list[float] = [] toktypes: list[int] = [] @@ -67,8 +70,10 @@ def main(): parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file") parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release") - parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory") + parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) sys.path.append(str(args.adept_inference_dir)) persimmon_model = torch.load(args.ckpt_path) hparams = persimmon_model['args'] @@ -107,7 +112,7 @@ def main(): gguf_writer.add_eos_token_id(71013) tensor_map = gguf.get_tensor_name_map(arch, block_count) - print(tensor_map) + logger.info(tensor_map) for name in tensors.keys(): data_torch = tensors[name] if name.endswith(".self_attention.rotary_emb.inv_freq"): @@ -117,22 +122,21 @@ def main(): data = data_torch.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() + raise ValueError(f"Can not map tensor '{name}'") + n_dims = len(data.shape) - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}") gguf_writer.add_tensor(new_name, data) - print("gguf: write header") + logger.info("gguf: write header") gguf_writer.write_header_to_file() - print("gguf: write metadata") + logger.info("gguf: write metadata") gguf_writer.write_kv_data_to_file() - print("gguf: write tensors") + logger.info("gguf: write tensors") gguf_writer.write_tensors_to_file() gguf_writer.close() - print(f"gguf: model successfully exported to '{args.outfile}'") - print("") + logger.info(f"gguf: model successfully exported to '{args.outfile}'") if __name__ == '__main__': diff --git a/convert.py b/convert.py index 1c700cf6a3d65..7f0b6b7498e10 100755 --- a/convert.py +++ b/convert.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import concurrent.futures import enum @@ -35,6 +36,8 @@ if TYPE_CHECKING: from typing_extensions import Self, TypeAlias +logger = logging.getLogger("convert") + if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): faulthandler.register(signal.SIGUSR1) @@ -643,7 +646,6 @@ def __repr__(self) -> str: def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) @@ -1033,12 +1035,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) # Check for a vocab size mismatch if params.n_vocab == vocab.vocab_size: - print("Ignoring added_tokens.json since model matches vocab size without it.") + logger.warning("Ignoring added_tokens.json since model matches vocab size without it.") return if pad_vocab and params.n_vocab > vocab.vocab_size: pad_count = params.n_vocab - vocab.vocab_size - print( + logger.debug( f"Padding vocab with {pad_count} token(s) - through " ) for i in range(1, pad_count + 1): @@ -1166,7 +1168,7 @@ def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: elapsed = time.time() - start size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print( + logger.info( f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" ) self.gguf.write_tensor_data(ndarray) @@ -1281,12 +1283,12 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> # HF models permut or pack some of the tensors, so we need to undo that for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" in model: - print(f"Permuting layer {i}") + logger.debug(f"Permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: - print(f"Unpacking and permuting layer {i}") + logger.debug(f"Unpacking and permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) @@ -1299,15 +1301,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: if skip_unknown: - print(f"Unexpected tensor name: {name} - skipping") + logger.warning(f"Unexpected tensor name: {name} - skipping") continue raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") if tensor_type in should_skip: - print(f"skipping tensor {name_new}") + logger.debug(f"skipping tensor {name_new}") continue - print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") + logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") out[name_new] = lazy_tensor return out @@ -1372,7 +1374,7 @@ def load_some_model(path: Path) -> ModelPlus: paths = find_multifile_paths(path) models_plus: list[ModelPlus] = [] for path in paths: - print(f"Loading model file {path}") + logger.info(f"Loading model file {path}") models_plus.append(lazy_load_file(path)) model_plus = merge_multifile_models(models_plus) @@ -1413,7 +1415,7 @@ def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab: else: raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}") - print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}") + logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}") return vocab def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]: @@ -1438,19 +1440,19 @@ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" if ret in model_paths: - sys.stderr.write( + logger.error( f"Error: Default output path ({ret}) would overwrite the input. " - "Please explicitly specify a path using --outfile.\n") + "Please explicitly specify a path using --outfile.") sys.exit(1) return ret def do_dump_model(model_plus: ModelPlus) -> None: - print(f"model_plus.paths = {model_plus.paths!r}") - print(f"model_plus.format = {model_plus.format!r}") - print(f"model_plus.vocab = {model_plus.vocab!r}") + print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100 + print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100 + print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100 for name, lazy_tensor in model_plus.model.items(): - print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100 def main(args_in: list[str] | None = None) -> None: @@ -1473,8 +1475,18 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args(args_in) + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + elif args.dump_single or args.dump: + # Avoid printing anything besides the dump output + logging.basicConfig(level=logging.WARNING) + else: + logging.basicConfig(level=logging.INFO) + if args.no_vocab and args.vocab_only: raise ValueError("--vocab-only does not make sense with --no-vocab") @@ -1491,6 +1503,7 @@ def main(args_in: list[str] | None = None) -> None: if args.dump: do_dump_model(model_plus) return + endianess = gguf.GGUFEndian.LITTLE if args.big_endian: endianess = gguf.GGUFEndian.BIG @@ -1513,7 +1526,7 @@ def main(args_in: list[str] | None = None) -> None: "q8_0": GGMLFileType.MostlyQ8_0, }[args.outtype] - print(f"params = {params}") + logger.info(f"params = {params}") model_parent_path = model_plus.paths[0].parent vocab_path = Path(args.vocab_dir or args.model or model_parent_path) @@ -1528,15 +1541,14 @@ def main(args_in: list[str] | None = None) -> None: outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") + logger.info(f"Wrote {outfile}") return if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: vocab = model_plus.vocab - print(f"Vocab info: {vocab}") - print(f"Special vocab info: {special_vocab}") - + logger.info(f"Vocab info: {vocab}") + logger.info(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) @@ -1544,11 +1556,11 @@ def main(args_in: list[str] | None = None) -> None: outfile = args.outfile or default_outfile(model_plus.paths, ftype) params.ftype = ftype - print(f"Writing {outfile}, format {ftype}") + logger.info(f"Writing {outfile}, format {ftype}") OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") + logger.info(f"Wrote {outfile}") if __name__ == '__main__': diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py index 5dd700963176e..1d9a0cc82f704 100644 --- a/ggml_vk_generate_shaders.py +++ b/ggml_vk_generate_shaders.py @@ -1,11 +1,14 @@ #!/usr/bin/env python +import logging import argparse import asyncio import os import sys from tempfile import gettempdir, NamedTemporaryFile +logger = logging.getLogger("ggml-vk-generate-shaders") + shader_f32 = """ #define FLOAT_TYPE float """ @@ -2498,7 +2501,7 @@ async def string_to_spv(name, code, defines, fp16=True): stdout, stderr = await proc.communicate() - print(" ".join(cmd)) + logger.info(" ".join(cmd)) if proc.returncode: raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}") @@ -2507,7 +2510,7 @@ async def string_to_spv(name, code, defines, fp16=True): cmd.extend([f"-D{key}={value}" for key, value in defines.items()]) code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())]) - print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}") + logger.error(f"cannot compile {name}\n\n{code_with_lines}\n\n{error}") f.close() os.remove(f.name) sys.exit(proc.returncode) @@ -2520,7 +2523,7 @@ async def string_to_spv(name, code, defines, fp16=True): async def main(): - print("ggml_vulkan: Generating and compiling shaders to SPIR-V") + logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V") tasks = [] @@ -2768,9 +2771,12 @@ async def withSemaphore(sem, task): parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator") parser.add_argument("--glslc", help="Path to glslc") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if args.glslc: GLSLC = args.glslc diff --git a/gguf-py/examples/reader.py b/gguf-py/examples/reader.py index 62e0769dacee2..d841048c6d701 100644 --- a/gguf-py/examples/reader.py +++ b/gguf-py/examples/reader.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 +import logging import sys from pathlib import Path from gguf.gguf_reader import GGUFReader +logger = logging.getLogger("reader") sys.path.insert(0, str(Path(__file__).parent.parent)) @@ -18,28 +20,28 @@ def read_gguf_file(gguf_file_path): reader = GGUFReader(gguf_file_path) # List all key-value pairs in a columnized format - print("Key-Value Pairs:") + print("Key-Value Pairs:") # noqa: NP100 max_key_length = max(len(key) for key in reader.fields.keys()) for key, field in reader.fields.items(): value = field.parts[field.data[0]] - print(f"{key:{max_key_length}} : {value}") - print("----") + print(f"{key:{max_key_length}} : {value}") # noqa: NP100 + print("----") # noqa: NP100 # List all tensors - print("Tensors:") + print("Tensors:") # noqa: NP100 tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}" - print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) - print("-" * 80) + print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100 + print("-" * 80) # noqa: NP100 for tensor in reader.tensors: shape_str = "x".join(map(str, tensor.shape)) size_str = str(tensor.n_elements) quantization_str = tensor.tensor_type.name - print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) + print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100 if __name__ == '__main__': if len(sys.argv) < 2: - print("Usage: reader.py ") + logger.info("Usage: reader.py ") sys.exit(1) gguf_file_path = sys.argv[1] read_gguf_file(gguf_file_path) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6d597bfd9d621..4f232e18d9691 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1,6 +1,5 @@ from __future__ import annotations -import sys from enum import Enum, IntEnum, auto from typing import Any @@ -854,8 +853,7 @@ def get_type(val: Any) -> GGUFValueType: return GGUFValueType.INT32 # TODO: need help with 64-bit types in Python else: - print("Unknown type:", type(val)) - sys.exit() + raise ValueError(f"Unknown type: {type(val)}") # Note: Does not support GGML_QKK_64 diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 2bdb15525b1a1..db8525d85b450 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -4,6 +4,7 @@ # from __future__ import annotations +import logging import os from collections import OrderedDict from typing import Any, Literal, NamedTuple, TypeVar, Union @@ -27,6 +28,7 @@ GGUFValueType, ) +logger = logging.getLogger(__name__) READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION] @@ -142,7 +144,7 @@ def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: # TODO: add option to generate error on duplicate keys # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') - print(f'Warning: Duplicate key {field.name} at offset {field.offset}') + logger.warning(f'Duplicate key {field.name} at offset {field.offset}') self.fields[field.name + '_{}'.format(field.offset)] = field else: self.fields[field.name] = field diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 089aece876a93..d9cfbf71160d3 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import shutil import struct @@ -24,6 +25,8 @@ TokenType, ) +logger = logging.getLogger(__name__) + class WriterState(Enum): EMPTY = auto() @@ -67,7 +70,7 @@ def __init__( self.use_temp_file = use_temp_file self.temp_file = None self.tensors = [] - print("gguf: This GGUF file is for {0} Endian only".format( + logger.info("gguf: This GGUF file is for {0} Endian only".format( "Big" if self.endianess == GGUFEndian.BIG else "Little", )) self.state = WriterState.EMPTY diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 378eaecad05ba..c97a78f3944f4 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -1,13 +1,15 @@ from __future__ import annotations +import logging import json import os -import sys from pathlib import Path from typing import Any, Callable from .gguf_writer import GGUFWriter +logger = logging.getLogger(__name__) + class SpecialVocab: merges: list[str] @@ -40,38 +42,29 @@ def __repr__(self) -> str: def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if self.merges: if not quiet: - print(f'gguf: Adding {len(self.merges)} merge(s).') + logger.info(f'Adding {len(self.merges)} merge(s).') gw.add_token_merges(self.merges) elif self.load_merges: - print( - 'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.', - file = sys.stderr, - ) + logger.warning('Adding merges requested but no merges found, output may be non-functional.') for typ, tokid in self.special_token_ids.items(): id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) if id_handler is None: - print( - f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', - file = sys.stderr, - ) + logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping') continue if not quiet: - print(f'gguf: Setting special token type {typ} to {tokid}') + logger.info(f'Setting special token type {typ} to {tokid}') id_handler(tokid) for typ, value in self.add_special_token.items(): add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) if add_handler is None: - print( - f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping', - file = sys.stderr, - ) + logger.warning(f'No handler for add_{typ}_token with value {value} - skipping') continue if not quiet: - print(f'gguf: Setting add_{typ}_token to {value}') + logger.info(f'Setting add_{typ}_token to {value}') add_handler(value) if self.chat_template is not None: if not quiet: - print(f'gguf: Setting chat_template to {self.chat_template}') + logger.info(f'Setting chat_template to {self.chat_template}') gw.add_chat_template(self.chat_template) def _load(self, path: Path) -> None: @@ -99,10 +92,7 @@ def _try_load_merges_txt(self, path: Path) -> bool: continue parts = line.split(None, 3) if len(parts) != 2: - print( - f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring', - file = sys.stderr, - ) + logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring') continue merges.append(f'{parts[0]} {parts[1]}') self.merges = merges @@ -118,10 +108,7 @@ def _set_special_token(self, typ: str, tid: Any) -> None: return self.special_token_ids[typ] = tid return - print( - f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping', - file = sys.stderr, - ) + logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') def _try_load_from_tokenizer_json(self, path: Path) -> bool: tokenizer_file = path / 'tokenizer.json' @@ -144,10 +131,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: if chat_template is None or isinstance(chat_template, (str, list)): self.chat_template = chat_template else: - print( - f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring', - file = sys.stderr - ) + logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring') for typ in self.special_token_types: add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): diff --git a/gguf-py/scripts/gguf-convert-endian.py b/gguf-py/scripts/gguf-convert-endian.py index 10a16ad063ce6..b698af0fe7631 100755 --- a/gguf-py/scripts/gguf-convert-endian.py +++ b/gguf-py/scripts/gguf-convert-endian.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import sys +from tqdm import tqdm from pathlib import Path import numpy as np @@ -14,6 +16,8 @@ import gguf +logger = logging.getLogger("gguf-convert-endian") + def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: if np.uint32(1) == np.uint32(1).newbyteorder("<"): @@ -29,11 +33,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None else: file_endian = host_endian order = host_endian if args.order == "native" else args.order - print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") + logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") if file_endian == order: - print(f"* File is already {order.upper()} endian. Nothing to do.") + logger.info(f"* File is already {order.upper()} endian. Nothing to do.") sys.exit(0) - print("* Checking tensors for conversion compatibility") + logger.info("* Checking tensors for conversion compatibility") for tensor in reader.tensors: if tensor.tensor_type not in ( gguf.GGMLQuantizationType.F32, @@ -41,51 +45,64 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None gguf.GGMLQuantizationType.Q8_0, ): raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") - print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") + logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") if args.dry_run: return - print("\n*** Warning *** Warning *** Warning **") - print("* This conversion process may damage the file. Ensure you have a backup.") + logger.warning("*** Warning *** Warning *** Warning **") + logger.warning("* This conversion process may damage the file. Ensure you have a backup.") if order != host_endian: - print("* Requested endian differs from host, you will not be able to load the model on this machine.") - print("* The file will be modified immediately, so if conversion fails or is interrupted") - print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") + logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.") + logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted") + logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") response = input("YES, I am sure> ") if response != "YES": - print("You didn't enter YES. Okay then, see ya!") + logger.warning("You didn't enter YES. Okay then, see ya!") sys.exit(0) - print(f"\n* Converting fields ({len(reader.fields)})") + logger.info(f"* Converting fields ({len(reader.fields)})") for idx, field in enumerate(reader.fields.values()): - print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") + logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") for part in field.parts: part.byteswap(inplace=True) - print(f"\n* Converting tensors ({len(reader.tensors)})") - for idx, tensor in enumerate(reader.tensors): - print( - f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, " - f"elements={tensor.n_elements}... ", - end="", + logger.info(f"* Converting tensors ({len(reader.tensors)})") + + for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")): + log_message = ( + f"Converting tensor {repr(tensor.name)}, " + f"type={tensor.tensor_type.name}, " + f"elements={tensor.n_elements} " ) - tensor_type = tensor.tensor_type + + # Byte-swap each part of the tensor's field for part in tensor.field.parts: part.byteswap(inplace=True) - if tensor_type != gguf.GGMLQuantizationType.Q8_0: + + # Byte-swap tensor data if necessary + if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0: + # Handle Q8_0 tensor blocks (block_q8_0) + # Specific handling of block_q8_0 is required. + # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations. + + block_size = 34 # 34 bytes = + 32 * + + n_blocks = len(tensor.data) // block_size + for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): + block_offs = block_num * block_size + + # Byte-Swap f16 sized delta field + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + # Byte-Swap Q8 weights + if block_num % 100000 == 0: + inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") + + else: + # Handle other tensor types tensor.data.byteswap(inplace=True) - print() - continue - # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes - block_size = 34 - n_blocks = len(tensor.data) // block_size - for block_num in range(n_blocks): - block_offs = block_num * block_size - # I know I said f16, but it doesn't matter here - any simple 16 bit type works. - delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) - if block_num % 100000 == 0: - print(f"[{(n_blocks - block_num) // 1000}K]", end="") - sys.stdout.flush() - print() - print("* Completion") + + pbar.set_description(log_message) + + logger.info("* Completion") def main() -> None: @@ -102,8 +119,13 @@ def main() -> None: "--dry-run", action="store_true", help="Don't actually change anything", ) + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + logger.info(f'* Loading: {args.model}') reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+') convert_byteorder(reader, args) diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py index dbf8915089275..2d3c3943f64cd 100755 --- a/gguf-py/scripts/gguf-dump.py +++ b/gguf-py/scripts/gguf-dump.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import sys @@ -15,6 +16,8 @@ from gguf import GGUFReader, GGUFValueType # noqa: E402 +logger = logging.getLogger("gguf-dump") + def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG' @@ -29,8 +32,8 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: # please see the comments in the modify_gguf.py example. def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: host_endian, file_endian = get_file_host_endian(reader) - print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') - print(f'\n* Dumping {len(reader.fields)} key/value pair(s)') + print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') # noqa: NP100 + print(f'* Dumping {len(reader.fields)} key/value pair(s)') # noqa: NP100 for n, field in enumerate(reader.fields.values(), 1): if not field.types: pretty_type = 'N/A' @@ -39,20 +42,21 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count else: pretty_type = str(field.types[-1].name) - print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '') + + log_message = f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}' if len(field.types) == 1: curr_type = field.types[0] if curr_type == GGUFValueType.STRING: - print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '') + log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])) elif field.types[0] in reader.gguf_scalar_to_np: - print(' = {0}'.format(field.parts[-1][0]), end = '') - print() + log_message += ' = {0}'.format(field.parts[-1][0]) + print(log_message) # noqa: NP100 if args.no_tensors: return - print(f'\n* Dumping {len(reader.tensors)} tensor(s)') + print(f'* Dumping {len(reader.tensors)} tensor(s)') # noqa: NP100 for n, tensor in enumerate(reader.tensors, 1): prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape))) - print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') + print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') # noqa: NP100 def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: @@ -103,10 +107,17 @@ def main() -> None: parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata") parser.add_argument("--json", action="store_true", help="Produce JSON output") parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if not args.json: - print(f'* Loading: {args.model}') + logger.info(f'* Loading: {args.model}') + reader = GGUFReader(args.model, 'r') + if args.json: dump_metadata_json(reader, args) else: diff --git a/gguf-py/scripts/gguf-set-metadata.py b/gguf-py/scripts/gguf-set-metadata.py index 3ebdfa898a779..e35b651b81da8 100755 --- a/gguf-py/scripts/gguf-set-metadata.py +++ b/gguf-py/scripts/gguf-set-metadata.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import logging import argparse import os import sys @@ -10,6 +11,8 @@ from gguf import GGUFReader # noqa: E402 +logger = logging.getLogger("gguf-set-metadata") + def minimal_example(filename: str) -> None: reader = GGUFReader(filename, 'r+') @@ -41,36 +44,33 @@ def minimal_example(filename: str) -> None: def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: field = reader.get_field(args.key) if field is None: - print(f'! Field {repr(args.key)} not found', file = sys.stderr) + logger.error(f'! Field {repr(args.key)} not found') sys.exit(1) # Note that field.types is a list of types. This is because the GGUF # format supports arrays. For example, an array of UINT32 would # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32] handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None if handler is None: - print( - f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}', - file = sys.stderr, - ) + logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}') sys.exit(1) current_value = field.parts[field.data[0]][0] new_value = handler(args.value) - print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') + logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') if current_value == new_value: - print(f'- Key {repr(args.key)} already set to requested value {current_value}') + logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}') sys.exit(0) if args.dry_run: sys.exit(0) if not args.force: - print('*** Warning *** Warning *** Warning **') - print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') - print('* Enter exactly YES if you are positive you want to proceed:') + logger.warning('*** Warning *** Warning *** Warning **') + logger.warning('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') + logger.warning('* Enter exactly YES if you are positive you want to proceed:') response = input('YES, I am sure> ') if response != 'YES': - print("You didn't enter YES. Okay then, see ya!") + logger.info("You didn't enter YES. Okay then, see ya!") sys.exit(0) field.parts[field.data[0]][0] = new_value - print('* Field changed. Successful completion.') + logger.info('* Field changed. Successful completion.') def main() -> None: @@ -80,8 +80,13 @@ def main() -> None: parser.add_argument("value", type=str, help="Metadata value to set") parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything") parser.add_argument("--force", action="store_true", help="Change the field without confirmation") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + logger.info(f'* Loading: {args.model}') reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+') set_metadata(reader, args) diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index ef7f19ecb9532..3892fd25cf3bf 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import logging import argparse import heapq import sys @@ -11,9 +12,11 @@ import git from tabulate import tabulate except ImportError as e: - print("ERROR: the following Python libraries are required: GitPython, tabulate.") + print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100 raise e +logger = logging.getLogger("compare-llama-bench") + # Properties by which to differentiate results per commit: KEY_PROPERTIES = [ "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas", @@ -94,8 +97,7 @@ known_args, unknown_args = parser.parse_known_args() if unknown_args: - print(f"ERROR: Received unknown args: {unknown_args}.") - print() + logger.error(f"Received unknown args: {unknown_args}.") parser.print_help() sys.exit(1) @@ -108,8 +110,7 @@ input_file = sqlite_files[0] if input_file is None: - print("ERROR: Cannot find a suitable input file, please provide one.") - print() + logger.error("Cannot find a suitable input file, please provide one.") parser.print_help() sys.exit(1) @@ -194,23 +195,19 @@ def get_commit_hexsha8(name): hexsha8_baseline = get_commit_hexsha8(known_args.baseline) name_baseline = known_args.baseline if hexsha8_baseline is None: - print(f"ERROR: cannot find data for baseline={known_args.baseline}.") + logger.error(f"cannot find data for baseline={known_args.baseline}.") sys.exit(1) # Otherwise, search for the most recent parent of master for which there is data: elif repo is not None: hexsha8_baseline = find_parent_in_data(repo.heads.master.commit) if hexsha8_baseline is None: - print("ERROR: No baseline was provided and did not find data for any master branch commits.") - print() + logger.error("No baseline was provided and did not find data for any master branch commits.") parser.print_help() sys.exit(1) else: - print( - "ERROR: No baseline was provided and the current working directory " - "is not part of a git repository from which a baseline could be inferred." - ) - print() + logger.error("No baseline was provided and the current working directory " + "is not part of a git repository from which a baseline could be inferred.") parser.print_help() sys.exit(1) @@ -227,7 +224,7 @@ def get_commit_hexsha8(name): hexsha8_compare = get_commit_hexsha8(known_args.compare) name_compare = known_args.compare if hexsha8_compare is None: - print(f"ERROR: cannot find data for compare={known_args.compare}.") + logger.error(f"cannot find data for compare={known_args.compare}.") sys.exit(1) # Otherwise, search for the commit for llama-bench was most recently run # and that is not a parent of master: @@ -241,16 +238,12 @@ def get_commit_hexsha8(name): break if hexsha8_compare is None: - print("ERROR: No compare target was provided and did not find data for any non-master commits.") - print() + logger.error("No compare target was provided and did not find data for any non-master commits.") parser.print_help() sys.exit(1) else: - print( - "ERROR: No compare target was provided and the current working directory " - "is not part of a git repository from which a compare target could be inferred." - ) - print() + logger.error("No compare target was provided and the current working directory " + "is not part of a git repository from which a compare target could be inferred.\n") parser.print_help() sys.exit(1) @@ -284,8 +277,7 @@ def get_rows(properties): if prop not in KEY_PROPERTIES[:-2]: # Last two values are n_prompt, n_gen. unknown_cols.append(prop) if unknown_cols: - print(f"ERROR: Unknown values for --show: {', '.join(unknown_cols)}") - print() + logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}") parser.print_usage() sys.exit(1) rows_show = get_rows(show) @@ -369,7 +361,7 @@ def get_rows(properties): headers = [PRETTY_NAMES[p] for p in show] headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"] -print(tabulate( +logger.info(tabulate( table, headers=headers, floatfmt=".2f", diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py index a182527307537..e986a36045549 100755 --- a/scripts/run-with-preset.py +++ b/scripts/run-with-preset.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import logging import argparse import os import subprocess @@ -7,6 +8,8 @@ import yaml +logger = logging.getLogger("run-with-preset") + CLI_ARGS_MAIN_PERPLEXITY = [ "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", @@ -56,6 +59,7 @@ parser.add_argument("yaml_files", nargs="*", help="Arbitrary number of YAML files from which to read preset values. " "If two files specify the same values the later one will be used.") +parser.add_argument("--verbose", action="store_true", help="increase output verbosity") known_args, unknown_args = parser.parse_known_args() @@ -63,6 +67,8 @@ parser.print_help() sys.exit(0) +logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO) + props = dict() for yaml_file in known_args.yaml_files: @@ -85,7 +91,7 @@ elif binary.lower().endswith("server"): cli_args = CLI_ARGS_SERVER else: - print(f"Unknown binary: {binary}") + logger.error(f"Unknown binary: {binary}") sys.exit(1) command_list = [binary] @@ -121,11 +127,11 @@ num_unused = len(props) if num_unused > 10: - print(f"The preset file contained a total of {num_unused} unused properties.") + logger.info(f"The preset file contained a total of {num_unused} unused properties.") elif num_unused > 0: - print("The preset file contained the following unused properties:") + logger.info("The preset file contained the following unused properties:") for prop, value in props.items(): - print(f" {prop}: {value}") + logger.info(f" {prop}: {value}") command_list += unknown_args diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py index dff4b47340133..0b5b9aafaade3 100755 --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -1,8 +1,11 @@ #!/usr/bin/env python3 +import logging import os import hashlib +logger = logging.getLogger("verify-checksum-models") + def sha256sum(file): block_size = 16 * 1024 * 1024 # 16 MB block size @@ -27,7 +30,7 @@ def sha256sum(file): # Check if the hash list file exists if not os.path.exists(hash_list_file): - print(f"Hash list file not found: {hash_list_file}") + logger.error(f"Hash list file not found: {hash_list_file}") exit(1) # Read the hash file content and split it into an array of lines @@ -46,7 +49,7 @@ def sha256sum(file): file_path = os.path.join(llama_path, filename) # Informing user of the progress of the integrity check - print(f"Verifying the checksum of {file_path}") + logger.info(f"Verifying the checksum of {file_path}") # Check if the file exists if os.path.exists(file_path): @@ -73,9 +76,9 @@ def sha256sum(file): # Print column headers for results table -print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) -print("-" * 80) +print("filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) # noqa: NP100 +print("-" * 80) # noqa: NP100 # Output the results as a table for r in results: - print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") + print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") # noqa: NP100 diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py index 33a2724416130..6b70ad0317af3 100644 --- a/tests/test-tokenizer-0-bpe.py +++ b/tests/test-tokenizer-0-bpe.py @@ -7,15 +7,20 @@ # python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/ # +import logging import argparse from transformers import AutoTokenizer +logger = logging.getLogger("test-tokenizer-0-bpe") + parser = argparse.ArgumentParser() parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") parser.add_argument("--fname-tok", help="path to a text file to tokenize") -args = parser.parse_args() +parser.add_argument("--verbose", action="store_true", help="increase output verbosity") +args = parser.parse_args() +logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) dir_tokenizer = args.dir_tokenizer tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) @@ -64,30 +69,34 @@ ] for text in tests: - print('text: ', text) - print(tokenizer.encode(text)) - print(tokenizer.decode(tokenizer.encode(text))) + logger.info(f"text: {text}") + logger.info(tokenizer.encode(text)) + logger.info(tokenizer.decode(tokenizer.encode(text))) -print("\n\ntests for C++:\n") +logger.info("tests for C++:") for text in tests: res = tokenizer.encode(text) + # Modify text representation for logging k = text.replace('\n', '\\n') k = k.replace('\t', '\\t') k = '"' + k + '"' - print("{ %-24s, { " % k, end='') + + # Log the modified text and its encoding + log_message = "{ %-24s, { " % k for x in res: - print("%7d," % x, end='') - print(" }, },") + log_message += "%7d," % x + log_message += " }, }," + logger.info(log_message) -print(tokenizer.encode('hello')) -print(tokenizer.encode('world')) -print(tokenizer.encode(' world')) -print(tokenizer.encode('hello world')) +logger.info(tokenizer.encode('hello')) +logger.info(tokenizer.encode('world')) +logger.info(tokenizer.encode(' world')) +logger.info(tokenizer.encode('hello world')) fname_tok = args.fname_tok if fname_tok: - print('tokenizing file: ', fname_tok) + logger.info(f"tokenizing file: {fname_tok}") fname_out = fname_tok + '.tok' with open(fname_tok, 'r', encoding='utf-8') as f: lines = f.readlines() @@ -112,6 +121,6 @@ # else: # f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n') - print('len(res): ', len(res)) - print('len(lines): ', len(lines)) - print('results written to: ', fname_out) + logger.info(f"len(res): {len(res)}") + logger.info(f"len(lines): {len(lines)}") + logger.info(f"results written to: {fname_out}") diff --git a/tests/test-tokenizer-0-spm.py b/tests/test-tokenizer-0-spm.py index be12a6b93d6f3..4b80a4380cfec 100644 --- a/tests/test-tokenizer-0-spm.py +++ b/tests/test-tokenizer-0-spm.py @@ -7,15 +7,22 @@ # +import logging import argparse from sentencepiece import SentencePieceProcessor +logger = logging.getLogger("test-tokenizer-0-spm") + parser = argparse.ArgumentParser() parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") parser.add_argument("--fname-tok", help="path to a text file to tokenize") +parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args() +logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + dir_tokenizer = args.dir_tokenizer tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') @@ -65,41 +72,46 @@ for text in tests: - print('text: ', text) - print('\nwith bos:') - print(tokenizer.encode(text, add_bos=True)) - print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) - print('\nwithout bos:') - print(tokenizer.encode(text, add_bos=False)) - print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) - -print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello' -print("'" + tokenizer.id_to_piece(29871) + "'") # '_' -print("'" + tokenizer.decode([15043]) + "'") # 'Hello' -print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' -print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' -print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' - -print("\n\ntests for C++:\n") + message_log = (f"text: {text}\n" + "with bos:\n" + f"{tokenizer.encode(text, add_bos=True)}\n" + f"{tokenizer.decode(tokenizer.encode(text, add_bos=True))}\n" + "without bos:\n" + f"{tokenizer.encode(text, add_bos=False)}\n" + f"{tokenizer.decode(tokenizer.encode(text, add_bos=False))}\n") + logger.info(message_log) + +logger.info(f"'{tokenizer.id_to_piece(15043)}'") # '_Hello' +logger.info(f"'{tokenizer.id_to_piece(29871)}'") # '_' +logger.info(f"'{tokenizer.decode([15043])}'") # 'Hello' +logger.info(f"'{tokenizer.decode([15043, 15043])}'") # 'Hello Hello' +logger.info(f"'{tokenizer.decode([29871, 15043])}'") # ' Hello' +logger.info(f"'{tokenizer.decode([29871, 15043, 29871, 15043])}'") # ' Hello Hello' + +logger.info("\n\ntests for C++:\n") for text in tests: res = tokenizer.encode(text, add_bos=False) + # Modify text representation for logging k = text.replace('\n', '\\n') k = k.replace('\t', '\\t') k = '"' + k + '"' - print("{ %-24s, { " % k, end='') + + # Log the modified text and its encoding + log_message = "{ %-24s, { " % k for x in res: - print("%7d," % x, end='') - print(" }, },") + log_message += "%7d," % x + log_message += " }, }," + logger.info(log_message) -print(tokenizer.encode('hello')) -print(tokenizer.encode('world')) -print(tokenizer.encode(' world')) -print(tokenizer.encode('hello world')) +logger.info(tokenizer.encode('hello')) +logger.info(tokenizer.encode('world')) +logger.info(tokenizer.encode(' world')) +logger.info(tokenizer.encode('hello world')) fname_tok = args.fname_tok if fname_tok: - print('tokenizing file: ', fname_tok) + logger.info(f"tokenizing file: {fname_tok}") fname_out = fname_tok + '.tok' with open(fname_tok, 'r', encoding='utf-8') as f: lines = f.readlines() @@ -109,6 +121,6 @@ with open(fname_out, 'w', encoding='utf-8') as f: for x in res: f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') - print('len(res): ', len(res)) - print('len(lines): ', len(lines)) - print('results written to: ', fname_out) + logger.info(f"len(res): {len(res)}") + logger.info(f"len(lines): {len(lines)}") + logger.info(f"results written to: {fname_out}") From 92139b90af4841d7fd060b526bdd443b621770ff Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 4 May 2024 08:32:32 +0300 Subject: [PATCH 08/10] tests : add test-tokenizer-0.sh + fix some tokenizers (#7036) * tests : add test-tokenizer-0.sh * unicode : add all unicode number ranges * starcoder : fix pre-tokenizer * tests : add test that fails with DeepSeek tokenizers * falcon : fix regex * unicode : regenerate unicode tables * refact : add tokenizer model * lint : fix * tests : disable failing tests ggml-ci * refact : add tests files ggml-ci * convert : print -> logging ggml-ci * lint : fix * unicode : digit -> number * phi-3 : update --- .flake8 | 2 +- Makefile | 3 +- convert-hf-to-gguf-update.py | 38 +- convert-hf-to-gguf.py | 5 +- llama.cpp | 21 +- llama.h | 1 + models/ggml-vocab-bert-bge.gguf.inp | 4 + models/ggml-vocab-bert-bge.gguf.out | 2 + models/ggml-vocab-deepseek-coder.gguf.inp | 4 + models/ggml-vocab-deepseek-coder.gguf.out | 2 + models/ggml-vocab-deepseek-llm.gguf.inp | 4 + models/ggml-vocab-deepseek-llm.gguf.out | 2 + models/ggml-vocab-falcon.gguf.inp | 4 + models/ggml-vocab-falcon.gguf.out | 2 + models/ggml-vocab-gpt-2.gguf.inp | 4 + models/ggml-vocab-gpt-2.gguf.out | 2 + models/ggml-vocab-llama-bpe.gguf.inp | 4 + models/ggml-vocab-llama-bpe.gguf.out | 2 + models/ggml-vocab-llama-spm.gguf.inp | 4 + models/ggml-vocab-llama-spm.gguf.out | 2 + models/ggml-vocab-mpt.gguf.inp | 4 + models/ggml-vocab-mpt.gguf.out | 2 + models/ggml-vocab-phi-3.gguf | Bin 725945 -> 725846 bytes models/ggml-vocab-phi-3.gguf.inp | 4 + models/ggml-vocab-phi-3.gguf.out | 2 + models/ggml-vocab-refact.gguf | Bin 1720666 -> 1720710 bytes models/ggml-vocab-refact.gguf.inp | 106 +++ models/ggml-vocab-refact.gguf.out | 43 ++ models/ggml-vocab-starcoder.gguf.inp | 4 + models/ggml-vocab-starcoder.gguf.out | 2 + scripts/gen-unicode-data.py | 66 ++ tests/CMakeLists.txt | 6 +- tests/test-tokenizer-0-bpe.py | 126 ---- tests/test-tokenizer-0-spm.py | 126 ---- tests/test-tokenizer-0.cpp | 39 +- tests/test-tokenizer-0.py | 46 ++ tests/test-tokenizer-0.sh | 34 + unicode-data.cpp | 874 ++++++++++++---------- unicode-data.h | 2 +- unicode.cpp | 22 +- unicode.h | 2 +- 41 files changed, 903 insertions(+), 719 deletions(-) create mode 100644 models/ggml-vocab-refact.gguf.inp create mode 100644 models/ggml-vocab-refact.gguf.out create mode 100644 scripts/gen-unicode-data.py delete mode 100644 tests/test-tokenizer-0-bpe.py delete mode 100644 tests/test-tokenizer-0-spm.py create mode 100644 tests/test-tokenizer-0.py create mode 100755 tests/test-tokenizer-0.sh diff --git a/.flake8 b/.flake8 index 608eb8eed374c..bc41c22900c3d 100644 --- a/.flake8 +++ b/.flake8 @@ -1,4 +1,4 @@ [flake8] max-line-length = 125 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503 -exclude = examples/*,examples/*/**,*/**/__init__.py +exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py diff --git a/Makefile b/Makefile index 0a73f2a582a20..c568dd008f350 100644 --- a/Makefile +++ b/Makefile @@ -77,11 +77,10 @@ test: $(TEST_TARGETS) ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \ - ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \ elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \ continue; \ elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \ diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 09772f668e2b9..917a4469d5693 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -31,6 +31,7 @@ from enum import IntEnum, auto from transformers import AutoTokenizer +logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("convert-hf-to-gguf-update") @@ -62,6 +63,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, + {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, ] # make directory "models/tokenizers" if it doesn't exist @@ -158,8 +160,8 @@ def get_vocab_base_pre(self, tokenizer) -> str: chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() - print(f"chktok: {{chktok}}") - print(f"chkhsh: {{chkhsh}}") + logger.debug(f"chktok: {{chktok}}") + logger.debug(f"chkhsh: {{chkhsh}}") res = None @@ -168,22 +170,22 @@ def get_vocab_base_pre(self, tokenizer) -> str: # don't edit the hashes manually! {src_ifs} if res is None: - print("\\n") - print("**************************************************************************************") - print("** WARNING: The BPE pre-tokenizer was not recognized!") - print("** There are 2 possible reasons for this:") - print("** - the model has not been added to convert-hf-to-gguf-update.py yet") - print("** - the pre-tokenization config has changed upstream") - print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") - print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") - print("**") - print(f"** chkhsh: {{chkhsh}}") - print("**************************************************************************************") - print("\\n") + logger.warning("\\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {{chkhsh}}") + logger.warning("**************************************************************************************") + logger.warning("\\n") raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - print(f"tokenizer.ggml.pre: {{repr(res)}}") - print(f"chkhsh: {{chkhsh}}") + logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}") + logger.debug(f"chkhsh: {{chkhsh}}") return res """ @@ -197,6 +199,8 @@ def get_vocab_base_pre(self, tokenizer) -> str: # generate tests for each tokenizer model tests = [ + "ied 4 ½ months", + "Führer", "", " ", " ", @@ -281,6 +285,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: for model in models: name = model["name"] - logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") + print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 logger.info("\n") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5293262885158..88c16676b5892 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -308,6 +308,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": # ref: https://huggingface.co/openai-community/gpt2 res = "gpt-2" + if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base + res = "refact" if res is None: logger.warning("\n") @@ -324,7 +327,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("\n") raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") - logger.debug(f"tokenizer.ggml.pre: {res}") + logger.debug(f"tokenizer.ggml.pre: {repr(res)}") logger.debug(f"chkhsh: {chkhsh}") return res diff --git a/llama.cpp b/llama.cpp index 18b49ec20909e..7c7a06df3effa 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4383,6 +4383,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "gpt-2") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; + } else if ( + tokenizer_pre == "refact") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -11952,7 +11955,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(llama_is_byte_token(vocab, id)); - const auto& token_data = vocab.id_to_token.at(id); + const auto & token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { auto buf = token_data.text.substr(3, 2); @@ -12212,14 +12215,13 @@ struct llm_tokenizer_bpe { "\\s?\\p{L}+", "\\s?\\p{P}+", "[一-龥ࠀ-一가-퟿]+", - "\\p{N}+", + "\\p{N}", }); break; case LLAMA_VOCAB_PRE_TYPE_FALCON: word_collection = unicode_regex_split(text, { "[\\p{P}\\$\\+<=>\\^~\\|]+", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - "\\p{N}+", "[0-9][0-9][0-9]", }); break; @@ -12235,6 +12237,12 @@ struct llm_tokenizer_bpe { }); break; case LLAMA_VOCAB_PRE_TYPE_STARCODER: + case LLAMA_VOCAB_PRE_TYPE_REFACT: + word_collection = unicode_regex_split(text, { + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; case LLAMA_VOCAB_PRE_TYPE_GPT2: word_collection = unicode_regex_split(text, { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", @@ -17466,9 +17474,10 @@ int32_t llama_tokenize( static std::string llama_decode_text(const std::string & text) { std::string decoded_text; - auto unicode_sequences = unicode_cpts_from_utf8(text); - for (auto & unicode_sequence : unicode_sequences) { - decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence)); + + const auto cpts = unicode_cpts_from_utf8(text); + for (const auto cpt : cpts) { + decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt)); } return decoded_text; diff --git a/llama.h b/llama.h index 790339af93083..5ff04c1d414d2 100644 --- a/llama.h +++ b/llama.h @@ -79,6 +79,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_MPT = 5, LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, + LLAMA_VOCAB_PRE_TYPE_REFACT = 8, }; // note: these values should be synchronized with ggml_rope diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-bert-bge.gguf.inp +++ b/models/ggml-vocab-bert-bge.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out index 969552e1c182e..e4a76cdb07d3f 100644 --- a/models/ggml-vocab-bert-bge.gguf.out +++ b/models/ggml-vocab-bert-bge.gguf.out @@ -1,3 +1,5 @@ + 29464 2094 1018 1092 2706 + 11865 17875 diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.inp +++ b/models/ggml-vocab-deepseek-coder.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out index 8ef585c787682..9ccc560d694ca 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.out +++ b/models/ggml-vocab-deepseek-coder.gguf.out @@ -1,3 +1,5 @@ + 1050 207 19 207 19192 4217 + 37 32009 71 6247 207 243 diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.inp +++ b/models/ggml-vocab-deepseek-llm.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out index 0ea9d66e343df..fd94b896d24e7 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.out +++ b/models/ggml-vocab-deepseek-llm.gguf.out @@ -1,3 +1,5 @@ + 1052 207 19 207 19109 4223 + 37 100014 71 6245 207 243 diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-falcon.gguf.inp +++ b/models/ggml-vocab-falcon.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out index cb8da7b1a33cc..209b04cdaf330 100644 --- a/models/ggml-vocab-falcon.gguf.out +++ b/models/ggml-vocab-falcon.gguf.out @@ -1,3 +1,5 @@ + 878 204 31 3068 133 2137 + 28611 132 30042 204 258 diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-gpt-2.gguf.inp +++ b/models/ggml-vocab-gpt-2.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out index 9986f38e407f2..78430f0d31fdc 100644 --- a/models/ggml-vocab-gpt-2.gguf.out +++ b/models/ggml-vocab-gpt-2.gguf.out @@ -1,3 +1,5 @@ + 798 604 25208 1933 + 37 9116 71 11751 220 220 220 diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-llama-bpe.gguf.inp +++ b/models/ggml-vocab-llama-bpe.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out index 4d903e1cdf37b..1f00e3812e227 100644 --- a/models/ggml-vocab-llama-bpe.gguf.out +++ b/models/ggml-vocab-llama-bpe.gguf.out @@ -1,3 +1,5 @@ + 1142 220 19 220 27154 4038 + 37 51853 261 220 256 diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-llama-spm.gguf.inp +++ b/models/ggml-vocab-llama-spm.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out index 15d00b106ddc2..9c3327cb54380 100644 --- a/models/ggml-vocab-llama-spm.gguf.out +++ b/models/ggml-vocab-llama-spm.gguf.out @@ -1,3 +1,5 @@ + 474 287 29871 29946 29871 30226 7378 + 383 4000 261 259 1678 diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-mpt.gguf.inp +++ b/models/ggml-vocab-mpt.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out index 1f4b0eb3a8947..d8d0fe90900bb 100644 --- a/models/ggml-vocab-mpt.gguf.out +++ b/models/ggml-vocab-mpt.gguf.out @@ -1,3 +1,5 @@ + 728 577 24142 2607 + 39 26288 6554 209 50276 diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf index 72fdb409c1b3b62b5aa4bc1628d4ca45080c4ebd..f8022a385e4aa48ca10d40d0f079a25365a7be78 100644 GIT binary patch delta 60 zcmdnFT<6*{orV_17N!>F7M2#)7Pc1l7LFFq7OpMajzWyQ)18F4{ic5v;!c_VK!{s( Qdaf`x7o+C%a$#;K09`i`@&Et; delta 122 zcmcb%OlRkEorV_17N!>F7M2#)7Pc1l7LFFq7OpMajzWwD)18F4{ltnZi%U{-)io7V zYpbi(ZE7I=>5GK84W_>m;+CI2L5Mp=IXAVqI59mnTD>SgCsjRG!PZtmJ+Zhrv$!NN TuS6ZJcltUZZk6e_!rYPo#9Sx% diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-phi-3.gguf.inp +++ b/models/ggml-vocab-phi-3.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out index 15d00b106ddc2..9c3327cb54380 100644 --- a/models/ggml-vocab-phi-3.gguf.out +++ b/models/ggml-vocab-phi-3.gguf.out @@ -1,3 +1,5 @@ + 474 287 29871 29946 29871 30226 7378 + 383 4000 261 259 1678 diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf index 8f26cfb76c9f287f5ff06bf0445ba9ac363f11e8..52afcf01aeb7323e0153b2817a1cbfc5a07d787b 100644 GIT binary patch delta 166 zcmYMpu?>Pi7(iivQ9x1Q4nbrYVv5b0)A-Y;zQv~G mp`^ff6BcYZaN)s6fDjQBVkAhRks(Kc5*2DRXfGwZrGEj<_!uq# diff --git a/models/ggml-vocab-refact.gguf.inp b/models/ggml-vocab-refact.gguf.inp new file mode 100644 index 0000000000000..0a89107c60d7f --- /dev/null +++ b/models/ggml-vocab-refact.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-refact.gguf.out b/models/ggml-vocab-refact.gguf.out new file mode 100644 index 0000000000000..06b15c090c0f8 --- /dev/null +++ b/models/ggml-vocab-refact.gguf.out @@ -0,0 +1,43 @@ + 4833 225 38 225 143 140 17723 + 56 2006 3935 265 + + 225 + 261 + 264 + 202 + 203 + 478 + 2831 + 15773 + 8279 5788 + 12000 5788 + 8279 10896 + 12000 10896 + 12000 10896 19 + 8279 30 5788 19 + 12000 30 5788 19 + 458 438 5945 118 252 32 3766 + 105 34 38 42 225 41 102 1707 12530 10180 1479 8278 + 39862 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 + 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 14574 228 14574 254 14574 232 30457 228 14574 236 + 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 308 2585 22680 688 1401 2819 4369 2404 27 + 8279 + 12000 + 225 12000 + 261 12000 + 264 12000 + 264 12000 284 12000 + 308 + 203 280 + 25 34666 + 8279 30 533 25 464 19 4971 884 844 18458 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 + 37 + 37 37 + 37 37 37 + 37 37 37 37 + 37 37 37 37 37 + 37 37 37 37 37 37 + 37 37 37 37 37 37 37 + 37 37 37 37 37 37 37 37 + 37 37 37 37 37 37 37 37 37 + 334 719 8878 202 10885 4222 16104 28570 203 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 5945 118 252 3807 118 252 225 37 225 37 37 225 37 37 37 225 37 37 37 37 225 37 37 37 37 37 225 37 37 37 37 37 37 225 37 37 37 37 37 37 37 225 37 37 37 37 37 37 37 37 225 37 32 37 225 37 497 37 225 37 1179 37 225 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 36628 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 20921 16623 13028 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 5881 9592 13299 914 31753 31359 9163 3202 35472 10397 439 4763 2583 330 102 1455 938 1182 2017 30 330 613 844 3654 49 330 63 646 3654 439 4621 1930 561 30 330 54 844 2124 1629 35993 49 2688 25 7709 312 25 94 62 diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp index 0389f00c71379..0a89107c60d7f 100644 --- a/models/ggml-vocab-starcoder.gguf.inp +++ b/models/ggml-vocab-starcoder.gguf.inp @@ -1,3 +1,7 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out index cd04254af7fdd..ccb55c7feeef8 100644 --- a/models/ggml-vocab-starcoder.gguf.out +++ b/models/ggml-vocab-starcoder.gguf.out @@ -1,3 +1,5 @@ + 4850 244 57 244 162 159 17722 + 75 2022 3943 284 244 280 diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py new file mode 100644 index 0000000000000..d49cbf2a0eeca --- /dev/null +++ b/scripts/gen-unicode-data.py @@ -0,0 +1,66 @@ +import regex + + +def cpt_to_utf8_str(cpt): + if cpt <= 0xFF: + return bytes([cpt, 0, 0, 0]) + elif cpt <= 0xFFFF: + return bytes([cpt & 0xFF, cpt >> 8, 0, 0]) + elif cpt <= 0xFFFFFF: + return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0]) + else: + return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24]) + + +def is_match(codepoint, regex_expr): + try: + res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32')) + return res is not None + except Exception: + return False + + +def get_matches(regex_expr): + unicode_ranges = [] + current_range = None + + for codepoint in range(0x110000): + if is_match(codepoint, regex_expr): + if current_range is None: + current_range = [codepoint, codepoint] + else: + current_range[1] = codepoint + elif current_range is not None: + unicode_ranges.append(tuple(current_range)) + current_range = None + + if current_range is not None: + unicode_ranges.append(tuple(current_range)) + + return unicode_ranges + + +def print_cat(cat, ranges): + print("const std::vector> unicode_ranges_{} = {{".format(cat)) + cnt = 0 + for start, end in ranges: + if cnt % 4 != 0: + print(" ", end="") + print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") + if cnt % 4 == 3: + print("") + cnt += 1 + + if cnt % 4 != 0: + print("") + print("};") + print("") + + +print_cat("number", get_matches(r'\p{N}')) +print_cat("letter", get_matches(r'\p{L}')) +print_cat("whitespace", get_matches(r'\p{Z}')) +print_cat("accent_mark", get_matches(r'\p{M}')) +print_cat("punctuation", get_matches(r'\p{P}')) +print_cat("symbol", get_matches(r'\p{S}')) +print_cat("control", get_matches(r'\p{C}')) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d23e7f771d054..cad703fce9057 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -74,13 +74,15 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) -llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf) -llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf) # TODO: enable when fixed +# https://github.com/ggerganov/llama.cpp/pull/7036 #llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) +#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf) +#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) # build test-tokenizer-1-bpe target once and add many tests add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp) diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py deleted file mode 100644 index 6b70ad0317af3..0000000000000 --- a/tests/test-tokenizer-0-bpe.py +++ /dev/null @@ -1,126 +0,0 @@ -# tests with BPE tokenizer -# -# sample usage: -# -# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/ -# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/ -# python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/ -# - -import logging -import argparse - -from transformers import AutoTokenizer - -logger = logging.getLogger("test-tokenizer-0-bpe") - -parser = argparse.ArgumentParser() -parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") -parser.add_argument("--fname-tok", help="path to a text file to tokenize") -parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - -args = parser.parse_args() -logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) -dir_tokenizer = args.dir_tokenizer - -tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) - -tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\n\n", - "\n\n\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is 🦙.cpp", - "w048 7tuijk dsdfhu", - "нещо на Български", - "កាន់តែពិសេសអាចខលចេញ", - "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - " (", - "\n =", - "' era", - "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", - "3", - "33", - "333", - "3333", - "33333", - "333333", - "3333333", - "33333333", - "333333333", -] - -for text in tests: - logger.info(f"text: {text}") - logger.info(tokenizer.encode(text)) - logger.info(tokenizer.decode(tokenizer.encode(text))) - -logger.info("tests for C++:") -for text in tests: - res = tokenizer.encode(text) - - # Modify text representation for logging - k = text.replace('\n', '\\n') - k = k.replace('\t', '\\t') - k = '"' + k + '"' - - # Log the modified text and its encoding - log_message = "{ %-24s, { " % k - for x in res: - log_message += "%7d," % x - log_message += " }, }," - logger.info(log_message) - -logger.info(tokenizer.encode('hello')) -logger.info(tokenizer.encode('world')) -logger.info(tokenizer.encode(' world')) -logger.info(tokenizer.encode('hello world')) - -fname_tok = args.fname_tok -if fname_tok: - logger.info(f"tokenizing file: {fname_tok}") - fname_out = fname_tok + '.tok' - with open(fname_tok, 'r', encoding='utf-8') as f: - lines = f.readlines() - s = ''.join(lines) - res = tokenizer.encode(s) - # write to file - with open(fname_out, 'w', encoding='utf-8') as f: - for x in res: - # LLaMA v3 for some reason strips the space for these tokens (and others) - # if x == 662: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 1174: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 2564: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 758: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 949: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 5354: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # else: - # f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') - f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n') - logger.info(f"len(res): {len(res)}") - logger.info(f"len(lines): {len(lines)}") - logger.info(f"results written to: {fname_out}") diff --git a/tests/test-tokenizer-0-spm.py b/tests/test-tokenizer-0-spm.py deleted file mode 100644 index 4b80a4380cfec..0000000000000 --- a/tests/test-tokenizer-0-spm.py +++ /dev/null @@ -1,126 +0,0 @@ -# tests with SPM tokenizer -# -# sample usage: -# -# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/ -# python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/ -# - - -import logging -import argparse - -from sentencepiece import SentencePieceProcessor - -logger = logging.getLogger("test-tokenizer-0-spm") - -parser = argparse.ArgumentParser() -parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") -parser.add_argument("--fname-tok", help="path to a text file to tokenize") -parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - -args = parser.parse_args() - -logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) - -dir_tokenizer = args.dir_tokenizer - -tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') - -tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\n\n", - "\n\n\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is 🦙.cpp", - "w048 7tuijk dsdfhu", - "нещо на Български", - "កាន់តែពិសេសអាចខលចេញ", - "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - " (", - "\n =", - "' era", - "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", - "3", - "33", - "333", - "3333", - "33333", - "333333", - "3333333", - "33333333", - "333333333", -] - - -for text in tests: - message_log = (f"text: {text}\n" - "with bos:\n" - f"{tokenizer.encode(text, add_bos=True)}\n" - f"{tokenizer.decode(tokenizer.encode(text, add_bos=True))}\n" - "without bos:\n" - f"{tokenizer.encode(text, add_bos=False)}\n" - f"{tokenizer.decode(tokenizer.encode(text, add_bos=False))}\n") - logger.info(message_log) - -logger.info(f"'{tokenizer.id_to_piece(15043)}'") # '_Hello' -logger.info(f"'{tokenizer.id_to_piece(29871)}'") # '_' -logger.info(f"'{tokenizer.decode([15043])}'") # 'Hello' -logger.info(f"'{tokenizer.decode([15043, 15043])}'") # 'Hello Hello' -logger.info(f"'{tokenizer.decode([29871, 15043])}'") # ' Hello' -logger.info(f"'{tokenizer.decode([29871, 15043, 29871, 15043])}'") # ' Hello Hello' - -logger.info("\n\ntests for C++:\n") -for text in tests: - res = tokenizer.encode(text, add_bos=False) - - # Modify text representation for logging - k = text.replace('\n', '\\n') - k = k.replace('\t', '\\t') - k = '"' + k + '"' - - # Log the modified text and its encoding - log_message = "{ %-24s, { " % k - for x in res: - log_message += "%7d," % x - log_message += " }, }," - logger.info(log_message) - -logger.info(tokenizer.encode('hello')) -logger.info(tokenizer.encode('world')) -logger.info(tokenizer.encode(' world')) -logger.info(tokenizer.encode('hello world')) - -fname_tok = args.fname_tok -if fname_tok: - logger.info(f"tokenizing file: {fname_tok}") - fname_out = fname_tok + '.tok' - with open(fname_tok, 'r', encoding='utf-8') as f: - lines = f.readlines() - s = ''.join(lines) - res = tokenizer.encode(s, add_bos=True) - # write to file - with open(fname_out, 'w', encoding='utf-8') as f: - for x in res: - f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') - logger.info(f"len(res): {len(res)}") - logger.info(f"len(lines): {len(lines)}") - logger.info(f"results written to: {fname_out}") diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 5122757cf47f5..d478f104148a6 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -55,8 +55,10 @@ // return _k_tests; //} -static std::map> read_tests(const std::string & fname_inp, const std::string & fname_out) { - std::map> tests; +using llama_tests = std::map>; + +static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) { + llama_tests tests; std::ifstream ifs_inp(fname_inp); if (!ifs_inp) { @@ -175,12 +177,20 @@ int main(int argc, char **argv) { bool success = true; - const auto k_tests = read_tests(fname_inp, fname_out); + const auto k_tests = [&]() -> llama_tests { + if (!fname_text.empty()) { + return {}; + } - if (k_tests.empty()) { - fprintf(stderr, "%s : error: no tests found\n", __func__); - return 1; - } + const auto res = read_tests(fname_inp, fname_out); + + if (res.empty()) { + fprintf(stderr, "%s : error: no tests found\n", __func__); + exit(1); + } + + return res; + }(); const bool add_special = false; @@ -238,7 +248,17 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); - const std::vector res = llama_tokenize(ctx, text, add_special); + std::vector res; + + { + const auto t_start = ggml_time_us(); + + res = llama_tokenize(ctx, text, add_special); + + const auto t_end = ggml_time_us(); + + fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0); + } fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); @@ -252,7 +272,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector{tok})) << "'" << std::endl; + //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector{tok})) << "'" << std::endl; + ofs << tok << "\n"; } } diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py new file mode 100644 index 0000000000000..8e7638e428669 --- /dev/null +++ b/tests/test-tokenizer-0.py @@ -0,0 +1,46 @@ +import time +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True) +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer +fname_tok = args.fname_tok + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +print('tokenizing file: ', fname_tok) +fname_out = fname_tok + '.tok' +with open(fname_tok, 'r', encoding='utf-8') as f: + lines = f.readlines() + s = ''.join(lines) + t_start = time.time() + res = tokenizer.encode(s, add_special_tokens=False) + t_end = time.time() + print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') + with open(fname_out, 'w', encoding='utf-8') as f: + for x in res: + # LLaMA v3 for some reason strips the space for these tokens (and others) + # if x == 662: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 1174: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 2564: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 758: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 949: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 5354: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # else: + # f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') + # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n') + f.write(str(x) + '\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) +print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh new file mode 100755 index 0000000000000..2fb8632d810c4 --- /dev/null +++ b/tests/test-tokenizer-0.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# +# Usage: +# +# test-tokenizer-0.sh +# + +if [ $# -ne 2 ]; then + printf "Usage: $0 \n" + exit 1 +fi + +name=$1 +input=$2 + +make -j tests/test-tokenizer-0 + +printf "Testing %s on %s ...\n" $name $input + +python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 +cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" + +./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 +cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" + +diff $input.tok $input.tokcpp > /dev/null 2>&1 + +if [ $? -eq 0 ]; then + printf "Tokenization is correct!\n" +else + diff $input.tok $input.tokcpp | head -n 32 + + printf "Tokenization differs!\n" +fi diff --git a/unicode-data.cpp b/unicode-data.cpp index e6bafb3a9add7..07bf02c4509fb 100644 --- a/unicode-data.cpp +++ b/unicode-data.cpp @@ -5,27 +5,47 @@ #include #include -const std::vector> unicode_ranges_digit = { -{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x00000660, 0x00000669}, -{0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F}, {0x000009E6, 0x000009EF}, -{0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF}, {0x00000B66, 0x00000B6F}, {0x00000BE6, 0x00000BEF}, -{0x00000C66, 0x00000C6F}, {0x00000CE6, 0x00000CEF}, {0x00000D66, 0x00000D6F}, {0x00000DE6, 0x00000DEF}, -{0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F29}, {0x00001040, 0x00001049}, -{0x00001090, 0x00001099}, {0x00001369, 0x00001371}, {0x000017E0, 0x000017E9}, {0x00001810, 0x00001819}, -{0x00001946, 0x0000194F}, {0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99}, -{0x00001B50, 0x00001B59}, {0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59}, -{0x00002070, 0x00002070}, {0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002460, 0x00002468}, -{0x00002474, 0x0000247C}, {0x00002488, 0x00002490}, {0x000024EA, 0x000024EA}, {0x000024F5, 0x000024FD}, -{0x000024FF, 0x000024FF}, {0x00002776, 0x0000277E}, {0x00002780, 0x00002788}, {0x0000278A, 0x00002792}, -{0x0000A620, 0x0000A629}, {0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9}, -{0x0000A9F0, 0x0000A9F9}, {0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19}, -{0x000104A0, 0x000104A9}, {0x00010A40, 0x00010A43}, {0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E68}, -{0x00011052, 0x0001105A}, {0x00011066, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F}, -{0x000111D0, 0x000111D9}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459}, {0x000114D0, 0x000114D9}, -{0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x00011739}, {0x000118E0, 0x000118E9}, -{0x00011950, 0x00011959}, {0x00011C50, 0x00011C59}, {0x00011D50, 0x00011D59}, {0x00011DA0, 0x00011DA9}, -{0x00016A60, 0x00016A69}, {0x00016B50, 0x00016B59}, {0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149}, -{0x0001E2F0, 0x0001E2F9}, {0x0001E950, 0x0001E959}, {0x0001F100, 0x0001F10A}, {0x0001FBF0, 0x0001FBF9}, +// generated with scripts/gen-unicode-data.py +// +// TODO: generate unicode_map_lowercase +// TODO: generate unicode_map_nfd + +const std::vector> unicode_ranges_number = { +{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x000000BC, 0x000000BE}, +{0x00000660, 0x00000669}, {0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F}, +{0x000009E6, 0x000009EF}, {0x000009F4, 0x000009F9}, {0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF}, +{0x00000B66, 0x00000B6F}, {0x00000B72, 0x00000B77}, {0x00000BE6, 0x00000BF2}, {0x00000C66, 0x00000C6F}, +{0x00000C78, 0x00000C7E}, {0x00000CE6, 0x00000CEF}, {0x00000D58, 0x00000D5E}, {0x00000D66, 0x00000D78}, +{0x00000DE6, 0x00000DEF}, {0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F33}, +{0x00001040, 0x00001049}, {0x00001090, 0x00001099}, {0x00001369, 0x0000137C}, {0x000016EE, 0x000016F0}, +{0x000017E0, 0x000017E9}, {0x000017F0, 0x000017F9}, {0x00001810, 0x00001819}, {0x00001946, 0x0000194F}, +{0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99}, {0x00001B50, 0x00001B59}, +{0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59}, {0x00002070, 0x00002070}, +{0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002150, 0x00002182}, {0x00002185, 0x00002189}, +{0x00002460, 0x0000249B}, {0x000024EA, 0x000024FF}, {0x00002776, 0x00002793}, {0x00002CFD, 0x00002CFD}, +{0x00003007, 0x00003007}, {0x00003021, 0x00003029}, {0x00003038, 0x0000303A}, {0x00003192, 0x00003195}, +{0x00003220, 0x00003229}, {0x00003248, 0x0000324F}, {0x00003251, 0x0000325F}, {0x00003280, 0x00003289}, +{0x000032B1, 0x000032BF}, {0x0000A620, 0x0000A629}, {0x0000A6E6, 0x0000A6EF}, {0x0000A830, 0x0000A835}, +{0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9}, {0x0000A9F0, 0x0000A9F9}, +{0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19}, {0x00010107, 0x00010133}, +{0x00010140, 0x00010178}, {0x0001018A, 0x0001018B}, {0x000102E1, 0x000102FB}, {0x00010320, 0x00010323}, +{0x00010341, 0x00010341}, {0x0001034A, 0x0001034A}, {0x000103D1, 0x000103D5}, {0x000104A0, 0x000104A9}, +{0x00010858, 0x0001085F}, {0x00010879, 0x0001087F}, {0x000108A7, 0x000108AF}, {0x000108FB, 0x000108FF}, +{0x00010916, 0x0001091B}, {0x000109BC, 0x000109BD}, {0x000109C0, 0x000109CF}, {0x000109D2, 0x000109FF}, +{0x00010A40, 0x00010A48}, {0x00010A7D, 0x00010A7E}, {0x00010A9D, 0x00010A9F}, {0x00010AEB, 0x00010AEF}, +{0x00010B58, 0x00010B5F}, {0x00010B78, 0x00010B7F}, {0x00010BA9, 0x00010BAF}, {0x00010CFA, 0x00010CFF}, +{0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E7E}, {0x00010F1D, 0x00010F26}, {0x00010F51, 0x00010F54}, +{0x00010FC5, 0x00010FCB}, {0x00011052, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F}, +{0x000111D0, 0x000111D9}, {0x000111E1, 0x000111F4}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459}, +{0x000114D0, 0x000114D9}, {0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x0001173B}, +{0x000118E0, 0x000118F2}, {0x00011950, 0x00011959}, {0x00011C50, 0x00011C6C}, {0x00011D50, 0x00011D59}, +{0x00011DA0, 0x00011DA9}, {0x00011F50, 0x00011F59}, {0x00011FC0, 0x00011FD4}, {0x00012400, 0x0001246E}, +{0x00016A60, 0x00016A69}, {0x00016AC0, 0x00016AC9}, {0x00016B50, 0x00016B59}, {0x00016B5B, 0x00016B61}, +{0x00016E80, 0x00016E96}, {0x0001D2C0, 0x0001D2D3}, {0x0001D2E0, 0x0001D2F3}, {0x0001D360, 0x0001D378}, +{0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149}, {0x0001E2F0, 0x0001E2F9}, {0x0001E4F0, 0x0001E4F9}, +{0x0001E8C7, 0x0001E8CF}, {0x0001E950, 0x0001E959}, {0x0001EC71, 0x0001ECAB}, {0x0001ECAD, 0x0001ECAF}, +{0x0001ECB1, 0x0001ECB4}, {0x0001ED01, 0x0001ED2D}, {0x0001ED2F, 0x0001ED3D}, {0x0001F100, 0x0001F10C}, +{0x0001FBF0, 0x0001FBF9}, }; const std::vector> unicode_ranges_letter = { @@ -41,73 +61,73 @@ const std::vector> unicode_ranges_letter = { {0x00000710, 0x00000710}, {0x00000712, 0x0000072F}, {0x0000074D, 0x000007A5}, {0x000007B1, 0x000007B1}, {0x000007CA, 0x000007EA}, {0x000007F4, 0x000007F5}, {0x000007FA, 0x000007FA}, {0x00000800, 0x00000815}, {0x0000081A, 0x0000081A}, {0x00000824, 0x00000824}, {0x00000828, 0x00000828}, {0x00000840, 0x00000858}, -{0x00000860, 0x0000086A}, {0x000008A0, 0x000008B4}, {0x000008B6, 0x000008C7}, {0x00000904, 0x00000939}, -{0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961}, {0x00000971, 0x00000980}, -{0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, {0x00000993, 0x000009A8}, {0x000009AA, 0x000009B0}, -{0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD}, {0x000009CE, 0x000009CE}, -{0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1}, {0x000009FC, 0x000009FC}, -{0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28}, {0x00000A2A, 0x00000A30}, -{0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39}, {0x00000A59, 0x00000A5C}, -{0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D}, {0x00000A8F, 0x00000A91}, -{0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3}, {0x00000AB5, 0x00000AB9}, -{0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1}, {0x00000AF9, 0x00000AF9}, -{0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28}, {0x00000B2A, 0x00000B30}, -{0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D}, {0x00000B5C, 0x00000B5D}, -{0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83}, {0x00000B85, 0x00000B8A}, -{0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A}, {0x00000B9C, 0x00000B9C}, -{0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA}, {0x00000BAE, 0x00000BB9}, -{0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10}, {0x00000C12, 0x00000C28}, -{0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A}, {0x00000C60, 0x00000C61}, -{0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C}, {0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8}, -{0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9}, {0x00000CBD, 0x00000CBD}, {0x00000CDE, 0x00000CDE}, -{0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2}, {0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10}, -{0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D}, {0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56}, -{0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F}, {0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1}, -{0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD}, {0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30}, -{0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46}, {0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84}, -{0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3}, {0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0}, -{0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD}, {0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6}, -{0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00}, {0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C}, -{0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A}, {0x0000103F, 0x0000103F}, {0x00001050, 0x00001055}, -{0x0000105A, 0x0000105D}, {0x00001061, 0x00001061}, {0x00001065, 0x00001066}, {0x0000106E, 0x00001070}, -{0x00001075, 0x00001081}, {0x0000108E, 0x0000108E}, {0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7}, -{0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA}, {0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D}, -{0x00001250, 0x00001256}, {0x00001258, 0x00001258}, {0x0000125A, 0x0000125D}, {0x00001260, 0x00001288}, -{0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0}, {0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE}, -{0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5}, {0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310}, -{0x00001312, 0x00001315}, {0x00001318, 0x0000135A}, {0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5}, -{0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C}, {0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A}, -{0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8}, {0x00001700, 0x0000170C}, {0x0000170E, 0x00001711}, -{0x00001720, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C}, {0x0000176E, 0x00001770}, -{0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC}, {0x00001820, 0x00001878}, -{0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA}, {0x000018B0, 0x000018F5}, -{0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, {0x00001970, 0x00001974}, {0x00001980, 0x000019AB}, -{0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54}, {0x00001AA7, 0x00001AA7}, -{0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4B}, {0x00001B83, 0x00001BA0}, {0x00001BAE, 0x00001BAF}, -{0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F}, {0x00001C5A, 0x00001C7D}, -{0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF}, {0x00001CE9, 0x00001CEC}, -{0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA}, {0x00001D00, 0x00001DBF}, -{0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45}, {0x00001F48, 0x00001F4D}, -{0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B}, {0x00001F5D, 0x00001F5D}, -{0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC}, {0x00001FBE, 0x00001FBE}, -{0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3}, {0x00001FD6, 0x00001FDB}, -{0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC}, {0x00002071, 0x00002071}, -{0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102}, {0x00002107, 0x00002107}, -{0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D}, {0x00002124, 0x00002124}, -{0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D}, {0x0000212F, 0x00002139}, -{0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E}, {0x00002183, 0x00002184}, -{0x00002C00, 0x00002C2E}, {0x00002C30, 0x00002C5E}, {0x00002C60, 0x00002CE4}, {0x00002CEB, 0x00002CEE}, -{0x00002CF2, 0x00002CF3}, {0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D}, -{0x00002D30, 0x00002D67}, {0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6}, -{0x00002DA8, 0x00002DAE}, {0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6}, -{0x00002DC8, 0x00002DCE}, {0x00002DD0, 0x00002DD6}, {0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F}, -{0x00003005, 0x00003006}, {0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096}, -{0x0000309D, 0x0000309F}, {0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F}, -{0x00003131, 0x0000318E}, {0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF}, -{0x00004E00, 0x00009FFC}, {0x0000A000, 0x0000A48C}, {0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C}, -{0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B}, {0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D}, -{0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F}, {0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7BF}, -{0x0000A7C2, 0x0000A7CA}, {0x0000A7F5, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A}, +{0x00000860, 0x0000086A}, {0x00000870, 0x00000887}, {0x00000889, 0x0000088E}, {0x000008A0, 0x000008C9}, +{0x00000904, 0x00000939}, {0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961}, +{0x00000971, 0x00000980}, {0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, {0x00000993, 0x000009A8}, +{0x000009AA, 0x000009B0}, {0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD}, +{0x000009CE, 0x000009CE}, {0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1}, +{0x000009FC, 0x000009FC}, {0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28}, +{0x00000A2A, 0x00000A30}, {0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39}, +{0x00000A59, 0x00000A5C}, {0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D}, +{0x00000A8F, 0x00000A91}, {0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3}, +{0x00000AB5, 0x00000AB9}, {0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1}, +{0x00000AF9, 0x00000AF9}, {0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28}, +{0x00000B2A, 0x00000B30}, {0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D}, +{0x00000B5C, 0x00000B5D}, {0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83}, +{0x00000B85, 0x00000B8A}, {0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A}, +{0x00000B9C, 0x00000B9C}, {0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA}, +{0x00000BAE, 0x00000BB9}, {0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10}, +{0x00000C12, 0x00000C28}, {0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A}, +{0x00000C5D, 0x00000C5D}, {0x00000C60, 0x00000C61}, {0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C}, +{0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8}, {0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9}, +{0x00000CBD, 0x00000CBD}, {0x00000CDD, 0x00000CDE}, {0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2}, +{0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10}, {0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D}, +{0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56}, {0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F}, +{0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1}, {0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD}, +{0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30}, {0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46}, +{0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84}, {0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3}, +{0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0}, {0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD}, +{0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6}, {0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00}, +{0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C}, {0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A}, +{0x0000103F, 0x0000103F}, {0x00001050, 0x00001055}, {0x0000105A, 0x0000105D}, {0x00001061, 0x00001061}, +{0x00001065, 0x00001066}, {0x0000106E, 0x00001070}, {0x00001075, 0x00001081}, {0x0000108E, 0x0000108E}, +{0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7}, {0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA}, +{0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D}, {0x00001250, 0x00001256}, {0x00001258, 0x00001258}, +{0x0000125A, 0x0000125D}, {0x00001260, 0x00001288}, {0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0}, +{0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE}, {0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5}, +{0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310}, {0x00001312, 0x00001315}, {0x00001318, 0x0000135A}, +{0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5}, {0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C}, +{0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A}, {0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8}, +{0x00001700, 0x00001711}, {0x0000171F, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C}, +{0x0000176E, 0x00001770}, {0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC}, +{0x00001820, 0x00001878}, {0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA}, +{0x000018B0, 0x000018F5}, {0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, {0x00001970, 0x00001974}, +{0x00001980, 0x000019AB}, {0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54}, +{0x00001AA7, 0x00001AA7}, {0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4C}, {0x00001B83, 0x00001BA0}, +{0x00001BAE, 0x00001BAF}, {0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F}, +{0x00001C5A, 0x00001C7D}, {0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF}, +{0x00001CE9, 0x00001CEC}, {0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA}, +{0x00001D00, 0x00001DBF}, {0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45}, +{0x00001F48, 0x00001F4D}, {0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B}, +{0x00001F5D, 0x00001F5D}, {0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC}, +{0x00001FBE, 0x00001FBE}, {0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3}, +{0x00001FD6, 0x00001FDB}, {0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC}, +{0x00002071, 0x00002071}, {0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102}, +{0x00002107, 0x00002107}, {0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D}, +{0x00002124, 0x00002124}, {0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D}, +{0x0000212F, 0x00002139}, {0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E}, +{0x00002183, 0x00002184}, {0x00002C00, 0x00002CE4}, {0x00002CEB, 0x00002CEE}, {0x00002CF2, 0x00002CF3}, +{0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D}, {0x00002D30, 0x00002D67}, +{0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6}, {0x00002DA8, 0x00002DAE}, +{0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6}, {0x00002DC8, 0x00002DCE}, +{0x00002DD0, 0x00002DD6}, {0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F}, {0x00003005, 0x00003006}, +{0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096}, {0x0000309D, 0x0000309F}, +{0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F}, {0x00003131, 0x0000318E}, +{0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF}, {0x00004E00, 0x0000A48C}, +{0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C}, {0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B}, +{0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D}, {0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F}, +{0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7CA}, {0x0000A7D0, 0x0000A7D1}, {0x0000A7D3, 0x0000A7D3}, +{0x0000A7D5, 0x0000A7D9}, {0x0000A7F2, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A}, {0x0000A80C, 0x0000A822}, {0x0000A840, 0x0000A873}, {0x0000A882, 0x0000A8B3}, {0x0000A8F2, 0x0000A8F7}, {0x0000A8FB, 0x0000A8FB}, {0x0000A8FD, 0x0000A8FE}, {0x0000A90A, 0x0000A925}, {0x0000A930, 0x0000A946}, {0x0000A960, 0x0000A97C}, {0x0000A984, 0x0000A9B2}, {0x0000A9CF, 0x0000A9CF}, {0x0000A9E0, 0x0000A9E4}, @@ -129,51 +149,60 @@ const std::vector> unicode_ranges_letter = { {0x000102A0, 0x000102D0}, {0x00010300, 0x0001031F}, {0x0001032D, 0x00010340}, {0x00010342, 0x00010349}, {0x00010350, 0x00010375}, {0x00010380, 0x0001039D}, {0x000103A0, 0x000103C3}, {0x000103C8, 0x000103CF}, {0x00010400, 0x0001049D}, {0x000104B0, 0x000104D3}, {0x000104D8, 0x000104FB}, {0x00010500, 0x00010527}, -{0x00010530, 0x00010563}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767}, -{0x00010800, 0x00010805}, {0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838}, -{0x0001083C, 0x0001083C}, {0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E}, -{0x000108E0, 0x000108F2}, {0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939}, -{0x00010980, 0x000109B7}, {0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13}, -{0x00010A15, 0x00010A17}, {0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C}, -{0x00010AC0, 0x00010AC7}, {0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55}, -{0x00010B60, 0x00010B72}, {0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2}, -{0x00010CC0, 0x00010CF2}, {0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1}, -{0x00010F00, 0x00010F1C}, {0x00010F27, 0x00010F27}, {0x00010F30, 0x00010F45}, {0x00010FB0, 0x00010FC4}, -{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8}, -{0x00011103, 0x00011126}, {0x00011144, 0x00011144}, {0x00011147, 0x00011147}, {0x00011150, 0x00011172}, -{0x00011176, 0x00011176}, {0x00011183, 0x000111B2}, {0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA}, -{0x000111DC, 0x000111DC}, {0x00011200, 0x00011211}, {0x00011213, 0x0001122B}, {0x00011280, 0x00011286}, -{0x00011288, 0x00011288}, {0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8}, -{0x000112B0, 0x000112DE}, {0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328}, -{0x0001132A, 0x00011330}, {0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D}, -{0x00011350, 0x00011350}, {0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A}, -{0x0001145F, 0x00011461}, {0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7}, -{0x00011580, 0x000115AE}, {0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644}, -{0x00011680, 0x000116AA}, {0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011800, 0x0001182B}, +{0x00010530, 0x00010563}, {0x00010570, 0x0001057A}, {0x0001057C, 0x0001058A}, {0x0001058C, 0x00010592}, +{0x00010594, 0x00010595}, {0x00010597, 0x000105A1}, {0x000105A3, 0x000105B1}, {0x000105B3, 0x000105B9}, +{0x000105BB, 0x000105BC}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767}, +{0x00010780, 0x00010785}, {0x00010787, 0x000107B0}, {0x000107B2, 0x000107BA}, {0x00010800, 0x00010805}, +{0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838}, {0x0001083C, 0x0001083C}, +{0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E}, {0x000108E0, 0x000108F2}, +{0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939}, {0x00010980, 0x000109B7}, +{0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13}, {0x00010A15, 0x00010A17}, +{0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C}, {0x00010AC0, 0x00010AC7}, +{0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55}, {0x00010B60, 0x00010B72}, +{0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2}, {0x00010CC0, 0x00010CF2}, +{0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1}, {0x00010F00, 0x00010F1C}, +{0x00010F27, 0x00010F27}, {0x00010F30, 0x00010F45}, {0x00010F70, 0x00010F81}, {0x00010FB0, 0x00010FC4}, +{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011071, 0x00011072}, {0x00011075, 0x00011075}, +{0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8}, {0x00011103, 0x00011126}, {0x00011144, 0x00011144}, +{0x00011147, 0x00011147}, {0x00011150, 0x00011172}, {0x00011176, 0x00011176}, {0x00011183, 0x000111B2}, +{0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA}, {0x000111DC, 0x000111DC}, {0x00011200, 0x00011211}, +{0x00011213, 0x0001122B}, {0x0001123F, 0x00011240}, {0x00011280, 0x00011286}, {0x00011288, 0x00011288}, +{0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8}, {0x000112B0, 0x000112DE}, +{0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328}, {0x0001132A, 0x00011330}, +{0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D}, {0x00011350, 0x00011350}, +{0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A}, {0x0001145F, 0x00011461}, +{0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7}, {0x00011580, 0x000115AE}, +{0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644}, {0x00011680, 0x000116AA}, +{0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011740, 0x00011746}, {0x00011800, 0x0001182B}, {0x000118A0, 0x000118DF}, {0x000118FF, 0x00011906}, {0x00011909, 0x00011909}, {0x0001190C, 0x00011913}, {0x00011915, 0x00011916}, {0x00011918, 0x0001192F}, {0x0001193F, 0x0001193F}, {0x00011941, 0x00011941}, {0x000119A0, 0x000119A7}, {0x000119AA, 0x000119D0}, {0x000119E1, 0x000119E1}, {0x000119E3, 0x000119E3}, {0x00011A00, 0x00011A00}, {0x00011A0B, 0x00011A32}, {0x00011A3A, 0x00011A3A}, {0x00011A50, 0x00011A50}, -{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AC0, 0x00011AF8}, {0x00011C00, 0x00011C08}, +{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AB0, 0x00011AF8}, {0x00011C00, 0x00011C08}, {0x00011C0A, 0x00011C2E}, {0x00011C40, 0x00011C40}, {0x00011C72, 0x00011C8F}, {0x00011D00, 0x00011D06}, {0x00011D08, 0x00011D09}, {0x00011D0B, 0x00011D30}, {0x00011D46, 0x00011D46}, {0x00011D60, 0x00011D65}, {0x00011D67, 0x00011D68}, {0x00011D6A, 0x00011D89}, {0x00011D98, 0x00011D98}, {0x00011EE0, 0x00011EF2}, -{0x00011FB0, 0x00011FB0}, {0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00013000, 0x0001342E}, -{0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E}, {0x00016AD0, 0x00016AED}, -{0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43}, {0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F}, -{0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A}, {0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F}, -{0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3}, {0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5}, -{0x00018D00, 0x00018D08}, {0x0001B000, 0x0001B11E}, {0x0001B150, 0x0001B152}, {0x0001B164, 0x0001B167}, -{0x0001B170, 0x0001B2FB}, {0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88}, -{0x0001BC90, 0x0001BC99}, {0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F}, -{0x0001D4A2, 0x0001D4A2}, {0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9}, -{0x0001D4BB, 0x0001D4BB}, {0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A}, -{0x0001D50D, 0x0001D514}, {0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E}, -{0x0001D540, 0x0001D544}, {0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5}, -{0x0001D6A8, 0x0001D6C0}, {0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714}, -{0x0001D716, 0x0001D734}, {0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788}, -{0x0001D78A, 0x0001D7A8}, {0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001E100, 0x0001E12C}, -{0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E}, {0x0001E2C0, 0x0001E2EB}, {0x0001E800, 0x0001E8C4}, +{0x00011F02, 0x00011F02}, {0x00011F04, 0x00011F10}, {0x00011F12, 0x00011F33}, {0x00011FB0, 0x00011FB0}, +{0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00012F90, 0x00012FF0}, {0x00013000, 0x0001342F}, +{0x00013441, 0x00013446}, {0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E}, +{0x00016A70, 0x00016ABE}, {0x00016AD0, 0x00016AED}, {0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43}, +{0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F}, {0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A}, +{0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F}, {0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3}, +{0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5}, {0x00018D00, 0x00018D08}, {0x0001AFF0, 0x0001AFF3}, +{0x0001AFF5, 0x0001AFFB}, {0x0001AFFD, 0x0001AFFE}, {0x0001B000, 0x0001B122}, {0x0001B132, 0x0001B132}, +{0x0001B150, 0x0001B152}, {0x0001B155, 0x0001B155}, {0x0001B164, 0x0001B167}, {0x0001B170, 0x0001B2FB}, +{0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88}, {0x0001BC90, 0x0001BC99}, +{0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F}, {0x0001D4A2, 0x0001D4A2}, +{0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9}, {0x0001D4BB, 0x0001D4BB}, +{0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A}, {0x0001D50D, 0x0001D514}, +{0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E}, {0x0001D540, 0x0001D544}, +{0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5}, {0x0001D6A8, 0x0001D6C0}, +{0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714}, {0x0001D716, 0x0001D734}, +{0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788}, {0x0001D78A, 0x0001D7A8}, +{0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001DF00, 0x0001DF1E}, {0x0001DF25, 0x0001DF2A}, +{0x0001E030, 0x0001E06D}, {0x0001E100, 0x0001E12C}, {0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E}, +{0x0001E290, 0x0001E2AD}, {0x0001E2C0, 0x0001E2EB}, {0x0001E4D0, 0x0001E4EB}, {0x0001E7E0, 0x0001E7E6}, +{0x0001E7E8, 0x0001E7EB}, {0x0001E7ED, 0x0001E7EE}, {0x0001E7F0, 0x0001E7FE}, {0x0001E800, 0x0001E8C4}, {0x0001E900, 0x0001E943}, {0x0001E94B, 0x0001E94B}, {0x0001EE00, 0x0001EE03}, {0x0001EE05, 0x0001EE1F}, {0x0001EE21, 0x0001EE22}, {0x0001EE24, 0x0001EE24}, {0x0001EE27, 0x0001EE27}, {0x0001EE29, 0x0001EE32}, {0x0001EE34, 0x0001EE37}, {0x0001EE39, 0x0001EE39}, {0x0001EE3B, 0x0001EE3B}, {0x0001EE42, 0x0001EE42}, @@ -182,15 +211,14 @@ const std::vector> unicode_ranges_letter = { {0x0001EE5B, 0x0001EE5B}, {0x0001EE5D, 0x0001EE5D}, {0x0001EE5F, 0x0001EE5F}, {0x0001EE61, 0x0001EE62}, {0x0001EE64, 0x0001EE64}, {0x0001EE67, 0x0001EE6A}, {0x0001EE6C, 0x0001EE72}, {0x0001EE74, 0x0001EE77}, {0x0001EE79, 0x0001EE7C}, {0x0001EE7E, 0x0001EE7E}, {0x0001EE80, 0x0001EE89}, {0x0001EE8B, 0x0001EE9B}, -{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DD}, -{0x0002A700, 0x0002B734}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0}, -{0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A}, +{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DF}, +{0x0002A700, 0x0002B739}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0}, +{0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A}, {0x00031350, 0x000323AF}, }; const std::vector> unicode_ranges_whitespace = { -{0x00000009, 0x0000000D}, {0x0000001C, 0x00000020}, {0x00000085, 0x00000085}, {0x000000A0, 0x000000A0}, -{0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, {0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, -{0x0000205F, 0x0000205F}, {0x00003000, 0x00003000}, +{0x00000020, 0x00000020}, {0x000000A0, 0x000000A0}, {0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, +{0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, {0x0000205F, 0x0000205F}, {0x00003000, 0x00003000}, }; const std::vector> unicode_ranges_accent_mark = { @@ -200,72 +228,77 @@ const std::vector> unicode_ranges_accent_mark = { {0x000006E7, 0x000006E8}, {0x000006EA, 0x000006ED}, {0x00000711, 0x00000711}, {0x00000730, 0x0000074A}, {0x000007A6, 0x000007B0}, {0x000007EB, 0x000007F3}, {0x000007FD, 0x000007FD}, {0x00000816, 0x00000819}, {0x0000081B, 0x00000823}, {0x00000825, 0x00000827}, {0x00000829, 0x0000082D}, {0x00000859, 0x0000085B}, -{0x000008D3, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C}, {0x0000093E, 0x0000094F}, -{0x00000951, 0x00000957}, {0x00000962, 0x00000963}, {0x00000981, 0x00000983}, {0x000009BC, 0x000009BC}, -{0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD}, {0x000009D7, 0x000009D7}, -{0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03}, {0x00000A3C, 0x00000A3C}, -{0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D}, {0x00000A51, 0x00000A51}, -{0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83}, {0x00000ABC, 0x00000ABC}, -{0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD}, {0x00000AE2, 0x00000AE3}, -{0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C}, {0x00000B3E, 0x00000B44}, -{0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57}, {0x00000B62, 0x00000B63}, -{0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8}, {0x00000BCA, 0x00000BCD}, -{0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48}, -{0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56}, {0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83}, -{0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4}, {0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD}, -{0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C}, -{0x00000D3E, 0x00000D44}, {0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57}, -{0x00000D62, 0x00000D63}, {0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4}, -{0x00000DD6, 0x00000DD6}, {0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31}, -{0x00000E34, 0x00000E3A}, {0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC}, -{0x00000EC8, 0x00000ECD}, {0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37}, -{0x00000F39, 0x00000F39}, {0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87}, -{0x00000F8D, 0x00000F97}, {0x00000F99, 0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E}, -{0x00001056, 0x00001059}, {0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D}, -{0x00001071, 0x00001074}, {0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D}, -{0x0000135D, 0x0000135F}, {0x00001712, 0x00001714}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753}, -{0x00001772, 0x00001773}, {0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D}, +{0x00000898, 0x0000089F}, {0x000008CA, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C}, +{0x0000093E, 0x0000094F}, {0x00000951, 0x00000957}, {0x00000962, 0x00000963}, {0x00000981, 0x00000983}, +{0x000009BC, 0x000009BC}, {0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD}, +{0x000009D7, 0x000009D7}, {0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03}, +{0x00000A3C, 0x00000A3C}, {0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D}, +{0x00000A51, 0x00000A51}, {0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83}, +{0x00000ABC, 0x00000ABC}, {0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD}, +{0x00000AE2, 0x00000AE3}, {0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C}, +{0x00000B3E, 0x00000B44}, {0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57}, +{0x00000B62, 0x00000B63}, {0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8}, +{0x00000BCA, 0x00000BCD}, {0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3C, 0x00000C3C}, +{0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48}, {0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56}, +{0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83}, {0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4}, +{0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD}, {0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3}, +{0x00000CF3, 0x00000CF3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C}, {0x00000D3E, 0x00000D44}, +{0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57}, {0x00000D62, 0x00000D63}, +{0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4}, {0x00000DD6, 0x00000DD6}, +{0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31}, {0x00000E34, 0x00000E3A}, +{0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC}, {0x00000EC8, 0x00000ECE}, +{0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37}, {0x00000F39, 0x00000F39}, +{0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87}, {0x00000F8D, 0x00000F97}, +{0x00000F99, 0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E}, {0x00001056, 0x00001059}, +{0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D}, {0x00001071, 0x00001074}, +{0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D}, {0x0000135D, 0x0000135F}, +{0x00001712, 0x00001715}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753}, {0x00001772, 0x00001773}, +{0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D}, {0x0000180F, 0x0000180F}, {0x00001885, 0x00001886}, {0x000018A9, 0x000018A9}, {0x00001920, 0x0000192B}, {0x00001930, 0x0000193B}, {0x00001A17, 0x00001A1B}, {0x00001A55, 0x00001A5E}, {0x00001A60, 0x00001A7C}, {0x00001A7F, 0x00001A7F}, -{0x00001AB0, 0x00001AC0}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73}, +{0x00001AB0, 0x00001ACE}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73}, {0x00001B80, 0x00001B82}, {0x00001BA1, 0x00001BAD}, {0x00001BE6, 0x00001BF3}, {0x00001C24, 0x00001C37}, {0x00001CD0, 0x00001CD2}, {0x00001CD4, 0x00001CE8}, {0x00001CED, 0x00001CED}, {0x00001CF4, 0x00001CF4}, -{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DF9}, {0x00001DFB, 0x00001DFF}, {0x000020D0, 0x000020F0}, -{0x00002CEF, 0x00002CF1}, {0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F}, -{0x00003099, 0x0000309A}, {0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F}, -{0x0000A6F0, 0x0000A6F1}, {0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B}, -{0x0000A823, 0x0000A827}, {0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5}, -{0x0000A8E0, 0x0000A8F1}, {0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953}, -{0x0000A980, 0x0000A983}, {0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36}, -{0x0000AA43, 0x0000AA43}, {0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0}, -{0x0000AAB2, 0x0000AAB4}, {0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1}, -{0x0000AAEB, 0x0000AAEF}, {0x0000AAF5, 0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED}, -{0x0000FB1E, 0x0000FB1E}, {0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD}, -{0x000102E0, 0x000102E0}, {0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06}, -{0x00010A0C, 0x00010A0F}, {0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6}, -{0x00010D24, 0x00010D27}, {0x00010EAB, 0x00010EAC}, {0x00010F46, 0x00010F50}, {0x00011000, 0x00011002}, -{0x00011038, 0x00011046}, {0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x00011100, 0x00011102}, +{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DFF}, {0x000020D0, 0x000020F0}, {0x00002CEF, 0x00002CF1}, +{0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F}, {0x00003099, 0x0000309A}, +{0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F}, {0x0000A6F0, 0x0000A6F1}, +{0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B}, {0x0000A823, 0x0000A827}, +{0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5}, {0x0000A8E0, 0x0000A8F1}, +{0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953}, {0x0000A980, 0x0000A983}, +{0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36}, {0x0000AA43, 0x0000AA43}, +{0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0}, {0x0000AAB2, 0x0000AAB4}, +{0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1}, {0x0000AAEB, 0x0000AAEF}, +{0x0000AAF5, 0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED}, {0x0000FB1E, 0x0000FB1E}, +{0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD}, {0x000102E0, 0x000102E0}, +{0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06}, {0x00010A0C, 0x00010A0F}, +{0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6}, {0x00010D24, 0x00010D27}, +{0x00010EAB, 0x00010EAC}, {0x00010EFD, 0x00010EFF}, {0x00010F46, 0x00010F50}, {0x00010F82, 0x00010F85}, +{0x00011000, 0x00011002}, {0x00011038, 0x00011046}, {0x00011070, 0x00011070}, {0x00011073, 0x00011074}, +{0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x000110C2, 0x000110C2}, {0x00011100, 0x00011102}, {0x00011127, 0x00011134}, {0x00011145, 0x00011146}, {0x00011173, 0x00011173}, {0x00011180, 0x00011182}, {0x000111B3, 0x000111C0}, {0x000111C9, 0x000111CC}, {0x000111CE, 0x000111CF}, {0x0001122C, 0x00011237}, -{0x0001123E, 0x0001123E}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303}, {0x0001133B, 0x0001133C}, -{0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D}, {0x00011357, 0x00011357}, -{0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374}, {0x00011435, 0x00011446}, -{0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5}, {0x000115B8, 0x000115C0}, -{0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7}, {0x0001171D, 0x0001172B}, -{0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938}, {0x0001193B, 0x0001193E}, -{0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7}, {0x000119DA, 0x000119E0}, -{0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39}, {0x00011A3B, 0x00011A3E}, -{0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99}, {0x00011C2F, 0x00011C36}, -{0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6}, {0x00011D31, 0x00011D36}, -{0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45}, {0x00011D47, 0x00011D47}, -{0x00011D8A, 0x00011D8E}, {0x00011D90, 0x00011D91}, {0x00011D93, 0x00011D97}, {0x00011EF3, 0x00011EF6}, -{0x00016AF0, 0x00016AF4}, {0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87}, -{0x00016F8F, 0x00016F92}, {0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E}, -{0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182}, {0x0001D185, 0x0001D18B}, -{0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36}, {0x0001DA3B, 0x0001DA6C}, -{0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F}, {0x0001DAA1, 0x0001DAAF}, -{0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021}, {0x0001E023, 0x0001E024}, -{0x0001E026, 0x0001E02A}, {0x0001E130, 0x0001E136}, {0x0001E2EC, 0x0001E2EF}, {0x0001E8D0, 0x0001E8D6}, +{0x0001123E, 0x0001123E}, {0x00011241, 0x00011241}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303}, +{0x0001133B, 0x0001133C}, {0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D}, +{0x00011357, 0x00011357}, {0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374}, +{0x00011435, 0x00011446}, {0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5}, +{0x000115B8, 0x000115C0}, {0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7}, +{0x0001171D, 0x0001172B}, {0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938}, +{0x0001193B, 0x0001193E}, {0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7}, +{0x000119DA, 0x000119E0}, {0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39}, +{0x00011A3B, 0x00011A3E}, {0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99}, +{0x00011C2F, 0x00011C36}, {0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6}, +{0x00011D31, 0x00011D36}, {0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45}, +{0x00011D47, 0x00011D47}, {0x00011D8A, 0x00011D8E}, {0x00011D90, 0x00011D91}, {0x00011D93, 0x00011D97}, +{0x00011EF3, 0x00011EF6}, {0x00011F00, 0x00011F01}, {0x00011F03, 0x00011F03}, {0x00011F34, 0x00011F3A}, +{0x00011F3E, 0x00011F42}, {0x00013440, 0x00013440}, {0x00013447, 0x00013455}, {0x00016AF0, 0x00016AF4}, +{0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87}, {0x00016F8F, 0x00016F92}, +{0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E}, {0x0001CF00, 0x0001CF2D}, +{0x0001CF30, 0x0001CF46}, {0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182}, +{0x0001D185, 0x0001D18B}, {0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36}, +{0x0001DA3B, 0x0001DA6C}, {0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F}, +{0x0001DAA1, 0x0001DAAF}, {0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021}, +{0x0001E023, 0x0001E024}, {0x0001E026, 0x0001E02A}, {0x0001E08F, 0x0001E08F}, {0x0001E130, 0x0001E136}, +{0x0001E2AE, 0x0001E2AE}, {0x0001E2EC, 0x0001E2EF}, {0x0001E4EC, 0x0001E4EF}, {0x0001E8D0, 0x0001E8D6}, {0x0001E944, 0x0001E94A}, {0x000E0100, 0x000E01EF}, }; @@ -276,7 +309,7 @@ const std::vector> unicode_ranges_punctuation = { {0x000000B6, 0x000000B7}, {0x000000BB, 0x000000BB}, {0x000000BF, 0x000000BF}, {0x0000037E, 0x0000037E}, {0x00000387, 0x00000387}, {0x0000055A, 0x0000055F}, {0x00000589, 0x0000058A}, {0x000005BE, 0x000005BE}, {0x000005C0, 0x000005C0}, {0x000005C3, 0x000005C3}, {0x000005C6, 0x000005C6}, {0x000005F3, 0x000005F4}, -{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061E, 0x0000061F}, +{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061D, 0x0000061F}, {0x0000066A, 0x0000066D}, {0x000006D4, 0x000006D4}, {0x00000700, 0x0000070D}, {0x000007F7, 0x000007F9}, {0x00000830, 0x0000083E}, {0x0000085E, 0x0000085E}, {0x00000964, 0x00000965}, {0x00000970, 0x00000970}, {0x000009FD, 0x000009FD}, {0x00000A76, 0x00000A76}, {0x00000AF0, 0x00000AF0}, {0x00000C77, 0x00000C77}, @@ -286,37 +319,38 @@ const std::vector> unicode_ranges_punctuation = { {0x00001360, 0x00001368}, {0x00001400, 0x00001400}, {0x0000166E, 0x0000166E}, {0x0000169B, 0x0000169C}, {0x000016EB, 0x000016ED}, {0x00001735, 0x00001736}, {0x000017D4, 0x000017D6}, {0x000017D8, 0x000017DA}, {0x00001800, 0x0000180A}, {0x00001944, 0x00001945}, {0x00001A1E, 0x00001A1F}, {0x00001AA0, 0x00001AA6}, -{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001BFC, 0x00001BFF}, {0x00001C3B, 0x00001C3F}, -{0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3}, {0x00002010, 0x00002027}, -{0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E}, {0x0000207D, 0x0000207E}, -{0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A}, {0x00002768, 0x00002775}, -{0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998}, {0x000029D8, 0x000029DB}, -{0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF}, {0x00002D70, 0x00002D70}, -{0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E52}, {0x00003001, 0x00003003}, -{0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030}, {0x0000303D, 0x0000303D}, -{0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF}, {0x0000A60D, 0x0000A60F}, -{0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7}, {0x0000A874, 0x0000A877}, -{0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC}, {0x0000A92E, 0x0000A92F}, -{0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF}, {0x0000AA5C, 0x0000AA5F}, -{0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB}, {0x0000FD3E, 0x0000FD3F}, -{0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61}, {0x0000FE63, 0x0000FE63}, -{0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03}, {0x0000FF05, 0x0000FF0A}, -{0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20}, {0x0000FF3B, 0x0000FF3D}, -{0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D}, {0x0000FF5F, 0x0000FF65}, -{0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0}, {0x0001056F, 0x0001056F}, -{0x00010857, 0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F}, {0x00010A50, 0x00010A58}, -{0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F}, {0x00010B99, 0x00010B9C}, -{0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC}, -{0x000110BE, 0x000110C1}, {0x00011140, 0x00011143}, {0x00011174, 0x00011175}, {0x000111C5, 0x000111C8}, -{0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB}, {0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D}, -{0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F}, {0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D}, -{0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7}, {0x00011641, 0x00011643}, {0x00011660, 0x0001166C}, -{0x0001173C, 0x0001173E}, {0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2}, -{0x00011A3F, 0x00011A46}, {0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011C41, 0x00011C45}, -{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011FFF, 0x00011FFF}, {0x00012470, 0x00012474}, -{0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5}, {0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44}, -{0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2}, {0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B}, -{0x0001E95E, 0x0001E95F}, +{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001B7D, 0x00001B7E}, {0x00001BFC, 0x00001BFF}, +{0x00001C3B, 0x00001C3F}, {0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3}, +{0x00002010, 0x00002027}, {0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E}, +{0x0000207D, 0x0000207E}, {0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A}, +{0x00002768, 0x00002775}, {0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998}, +{0x000029D8, 0x000029DB}, {0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF}, +{0x00002D70, 0x00002D70}, {0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E5D}, +{0x00003001, 0x00003003}, {0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030}, +{0x0000303D, 0x0000303D}, {0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF}, +{0x0000A60D, 0x0000A60F}, {0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7}, +{0x0000A874, 0x0000A877}, {0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC}, +{0x0000A92E, 0x0000A92F}, {0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF}, +{0x0000AA5C, 0x0000AA5F}, {0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB}, +{0x0000FD3E, 0x0000FD3F}, {0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61}, +{0x0000FE63, 0x0000FE63}, {0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03}, +{0x0000FF05, 0x0000FF0A}, {0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20}, +{0x0000FF3B, 0x0000FF3D}, {0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D}, +{0x0000FF5F, 0x0000FF65}, {0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0}, +{0x0001056F, 0x0001056F}, {0x00010857, 0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F}, +{0x00010A50, 0x00010A58}, {0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F}, +{0x00010B99, 0x00010B9C}, {0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00010F86, 0x00010F89}, +{0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC}, {0x000110BE, 0x000110C1}, {0x00011140, 0x00011143}, +{0x00011174, 0x00011175}, {0x000111C5, 0x000111C8}, {0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB}, +{0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D}, {0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F}, +{0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D}, {0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7}, +{0x00011641, 0x00011643}, {0x00011660, 0x0001166C}, {0x000116B9, 0x000116B9}, {0x0001173C, 0x0001173E}, +{0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2}, {0x00011A3F, 0x00011A46}, +{0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011B00, 0x00011B09}, {0x00011C41, 0x00011C45}, +{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011F43, 0x00011F4F}, {0x00011FFF, 0x00011FFF}, +{0x00012470, 0x00012474}, {0x00012FF1, 0x00012FF2}, {0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5}, +{0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44}, {0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2}, +{0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B}, {0x0001E95E, 0x0001E95F}, }; const std::vector> unicode_ranges_symbol = { @@ -328,40 +362,41 @@ const std::vector> unicode_ranges_symbol = { {0x00000375, 0x00000375}, {0x00000384, 0x00000385}, {0x000003F6, 0x000003F6}, {0x00000482, 0x00000482}, {0x0000058D, 0x0000058F}, {0x00000606, 0x00000608}, {0x0000060B, 0x0000060B}, {0x0000060E, 0x0000060F}, {0x000006DE, 0x000006DE}, {0x000006E9, 0x000006E9}, {0x000006FD, 0x000006FE}, {0x000007F6, 0x000007F6}, -{0x000007FE, 0x000007FF}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB}, {0x00000AF1, 0x00000AF1}, -{0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F}, {0x00000D4F, 0x00000D4F}, -{0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03}, {0x00000F13, 0x00000F13}, -{0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34}, {0x00000F36, 0x00000F36}, -{0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC}, {0x00000FCE, 0x00000FCF}, -{0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399}, {0x0000166D, 0x0000166D}, -{0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF}, {0x00001B61, 0x00001B6A}, -{0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1}, {0x00001FCD, 0x00001FCF}, -{0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE}, {0x00002044, 0x00002044}, -{0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C}, {0x000020A0, 0x000020BF}, -{0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109}, {0x00002114, 0x00002114}, -{0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125}, {0x00002127, 0x00002127}, -{0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B}, {0x00002140, 0x00002144}, -{0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B}, {0x00002190, 0x00002307}, -{0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, {0x00002440, 0x0000244A}, {0x0000249C, 0x000024E9}, -{0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5}, {0x000027F0, 0x00002982}, -{0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73}, {0x00002B76, 0x00002B95}, -{0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51}, {0x00002E80, 0x00002E99}, -{0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFB}, {0x00003004, 0x00003004}, -{0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037}, {0x0000303E, 0x0000303F}, -{0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F}, {0x000031C0, 0x000031E3}, -{0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, {0x00003250, 0x00003250}, {0x00003260, 0x0000327F}, -{0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, {0x00004DC0, 0x00004DFF}, {0x0000A490, 0x0000A4C6}, -{0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, {0x0000A789, 0x0000A78A}, {0x0000A828, 0x0000A82B}, -{0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, {0x0000AB5B, 0x0000AB5B}, {0x0000AB6A, 0x0000AB6B}, -{0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC1}, {0x0000FDFC, 0x0000FDFD}, {0x0000FE62, 0x0000FE62}, -{0x0000FE64, 0x0000FE66}, {0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B}, -{0x0000FF1C, 0x0000FF1E}, {0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C}, -{0x0000FF5E, 0x0000FF5E}, {0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD}, -{0x00010137, 0x0001013F}, {0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C}, -{0x000101A0, 0x000101A0}, {0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8}, -{0x0001173F, 0x0001173F}, {0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45}, -{0x0001BC9C, 0x0001BC9C}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, {0x0001D129, 0x0001D164}, -{0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, {0x0001D18C, 0x0001D1A9}, {0x0001D1AE, 0x0001D1E8}, +{0x000007FE, 0x000007FF}, {0x00000888, 0x00000888}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB}, +{0x00000AF1, 0x00000AF1}, {0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F}, +{0x00000D4F, 0x00000D4F}, {0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03}, +{0x00000F13, 0x00000F13}, {0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34}, +{0x00000F36, 0x00000F36}, {0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC}, +{0x00000FCE, 0x00000FCF}, {0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399}, +{0x0000166D, 0x0000166D}, {0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF}, +{0x00001B61, 0x00001B6A}, {0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1}, +{0x00001FCD, 0x00001FCF}, {0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE}, +{0x00002044, 0x00002044}, {0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C}, +{0x000020A0, 0x000020C0}, {0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109}, +{0x00002114, 0x00002114}, {0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125}, +{0x00002127, 0x00002127}, {0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B}, +{0x00002140, 0x00002144}, {0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B}, +{0x00002190, 0x00002307}, {0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, {0x00002440, 0x0000244A}, +{0x0000249C, 0x000024E9}, {0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5}, +{0x000027F0, 0x00002982}, {0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73}, +{0x00002B76, 0x00002B95}, {0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51}, +{0x00002E80, 0x00002E99}, {0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFB}, +{0x00003004, 0x00003004}, {0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037}, +{0x0000303E, 0x0000303F}, {0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F}, +{0x000031C0, 0x000031E3}, {0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, {0x00003250, 0x00003250}, +{0x00003260, 0x0000327F}, {0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, {0x00004DC0, 0x00004DFF}, +{0x0000A490, 0x0000A4C6}, {0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, {0x0000A789, 0x0000A78A}, +{0x0000A828, 0x0000A82B}, {0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, {0x0000AB5B, 0x0000AB5B}, +{0x0000AB6A, 0x0000AB6B}, {0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC2}, {0x0000FD40, 0x0000FD4F}, +{0x0000FDCF, 0x0000FDCF}, {0x0000FDFC, 0x0000FDFF}, {0x0000FE62, 0x0000FE62}, {0x0000FE64, 0x0000FE66}, +{0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B}, {0x0000FF1C, 0x0000FF1E}, +{0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C}, {0x0000FF5E, 0x0000FF5E}, +{0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD}, {0x00010137, 0x0001013F}, +{0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C}, {0x000101A0, 0x000101A0}, +{0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8}, {0x0001173F, 0x0001173F}, +{0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45}, {0x0001BC9C, 0x0001BC9C}, +{0x0001CF50, 0x0001CFC3}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, {0x0001D129, 0x0001D164}, +{0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, {0x0001D18C, 0x0001D1A9}, {0x0001D1AE, 0x0001D1EA}, {0x0001D200, 0x0001D241}, {0x0001D245, 0x0001D245}, {0x0001D300, 0x0001D356}, {0x0001D6C1, 0x0001D6C1}, {0x0001D6DB, 0x0001D6DB}, {0x0001D6FB, 0x0001D6FB}, {0x0001D715, 0x0001D715}, {0x0001D735, 0x0001D735}, {0x0001D74F, 0x0001D74F}, {0x0001D76F, 0x0001D76F}, {0x0001D789, 0x0001D789}, {0x0001D7A9, 0x0001D7A9}, @@ -371,102 +406,100 @@ const std::vector> unicode_ranges_symbol = { {0x0001F000, 0x0001F02B}, {0x0001F030, 0x0001F093}, {0x0001F0A0, 0x0001F0AE}, {0x0001F0B1, 0x0001F0BF}, {0x0001F0C1, 0x0001F0CF}, {0x0001F0D1, 0x0001F0F5}, {0x0001F10D, 0x0001F1AD}, {0x0001F1E6, 0x0001F202}, {0x0001F210, 0x0001F23B}, {0x0001F240, 0x0001F248}, {0x0001F250, 0x0001F251}, {0x0001F260, 0x0001F265}, -{0x0001F300, 0x0001F6D7}, {0x0001F6E0, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, {0x0001F700, 0x0001F773}, -{0x0001F780, 0x0001F7D8}, {0x0001F7E0, 0x0001F7EB}, {0x0001F800, 0x0001F80B}, {0x0001F810, 0x0001F847}, -{0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, {0x0001F890, 0x0001F8AD}, {0x0001F8B0, 0x0001F8B1}, -{0x0001F900, 0x0001F978}, {0x0001F97A, 0x0001F9CB}, {0x0001F9CD, 0x0001FA53}, {0x0001FA60, 0x0001FA6D}, -{0x0001FA70, 0x0001FA74}, {0x0001FA78, 0x0001FA7A}, {0x0001FA80, 0x0001FA86}, {0x0001FA90, 0x0001FAA8}, -{0x0001FAB0, 0x0001FAB6}, {0x0001FAC0, 0x0001FAC2}, {0x0001FAD0, 0x0001FAD6}, {0x0001FB00, 0x0001FB92}, -{0x0001FB94, 0x0001FBCA}, +{0x0001F300, 0x0001F6D7}, {0x0001F6DC, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, {0x0001F700, 0x0001F776}, +{0x0001F77B, 0x0001F7D9}, {0x0001F7E0, 0x0001F7EB}, {0x0001F7F0, 0x0001F7F0}, {0x0001F800, 0x0001F80B}, +{0x0001F810, 0x0001F847}, {0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, {0x0001F890, 0x0001F8AD}, +{0x0001F8B0, 0x0001F8B1}, {0x0001F900, 0x0001FA53}, {0x0001FA60, 0x0001FA6D}, {0x0001FA70, 0x0001FA7C}, +{0x0001FA80, 0x0001FA88}, {0x0001FA90, 0x0001FABD}, {0x0001FABF, 0x0001FAC5}, {0x0001FACE, 0x0001FADB}, +{0x0001FAE0, 0x0001FAE8}, {0x0001FAF0, 0x0001FAF8}, {0x0001FB00, 0x0001FB92}, {0x0001FB94, 0x0001FBCA}, }; const std::vector> unicode_ranges_control = { -{0x00000000, 0x00000008}, {0x0000000E, 0x0000001B}, {0x0000007F, 0x00000084}, {0x00000086, 0x0000009F}, -{0x000000AD, 0x000000AD}, {0x00000378, 0x00000379}, {0x00000380, 0x00000383}, {0x0000038B, 0x0000038B}, -{0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2}, {0x00000530, 0x00000530}, {0x00000557, 0x00000558}, -{0x0000058B, 0x0000058C}, {0x00000590, 0x00000590}, {0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE}, -{0x000005F5, 0x00000605}, {0x0000061C, 0x0000061D}, {0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F}, -{0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF}, {0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F}, -{0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D}, {0x0000085F, 0x0000085F}, {0x0000086B, 0x0000089F}, -{0x000008B5, 0x000008B5}, {0x000008C8, 0x000008D2}, {0x000008E2, 0x000008E2}, {0x00000984, 0x00000984}, -{0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9}, {0x000009B1, 0x000009B1}, -{0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6}, {0x000009C9, 0x000009CA}, -{0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE}, {0x000009E4, 0x000009E5}, -{0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E}, {0x00000A11, 0x00000A12}, -{0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34}, {0x00000A37, 0x00000A37}, -{0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46}, {0x00000A49, 0x00000A4A}, -{0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D}, {0x00000A5F, 0x00000A65}, -{0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E}, {0x00000A92, 0x00000A92}, -{0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4}, {0x00000ABA, 0x00000ABB}, -{0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF}, {0x00000AD1, 0x00000ADF}, -{0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00}, {0x00000B04, 0x00000B04}, -{0x00000B0D, 0x00000B0E}, {0x00000B11, 0x00000B12}, {0x00000B29, 0x00000B29}, {0x00000B31, 0x00000B31}, -{0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46}, {0x00000B49, 0x00000B4A}, -{0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E}, {0x00000B64, 0x00000B65}, -{0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D}, {0x00000B91, 0x00000B91}, -{0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D}, {0x00000BA0, 0x00000BA2}, -{0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD}, {0x00000BC3, 0x00000BC5}, -{0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6}, {0x00000BD8, 0x00000BE5}, -{0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11}, {0x00000C29, 0x00000C29}, -{0x00000C3A, 0x00000C3C}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49}, {0x00000C4E, 0x00000C54}, -{0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5F}, {0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76}, -{0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91}, {0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4}, -{0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5}, {0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4}, -{0x00000CD7, 0x00000CDD}, {0x00000CDF, 0x00000CDF}, {0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0}, -{0x00000CF3, 0x00000CFF}, {0x00000D0D, 0x00000D0D}, {0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45}, -{0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53}, {0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80}, -{0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99}, {0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC}, -{0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9}, {0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5}, -{0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5}, {0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00}, -{0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80}, {0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85}, -{0x00000E8B, 0x00000E8B}, {0x00000EA4, 0x00000EA4}, {0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF}, -{0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7}, {0x00000ECE, 0x00000ECF}, {0x00000EDA, 0x00000EDB}, -{0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48}, {0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98}, -{0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD}, {0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6}, -{0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF}, {0x00001249, 0x00001249}, {0x0000124E, 0x0000124F}, -{0x00001257, 0x00001257}, {0x00001259, 0x00001259}, {0x0000125E, 0x0000125F}, {0x00001289, 0x00001289}, -{0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1}, {0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF}, -{0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7}, {0x000012D7, 0x000012D7}, {0x00001311, 0x00001311}, -{0x00001316, 0x00001317}, {0x0000135B, 0x0000135C}, {0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F}, -{0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF}, {0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF}, -{0x0000170D, 0x0000170D}, {0x00001715, 0x0000171F}, {0x00001737, 0x0000173F}, {0x00001754, 0x0000175F}, -{0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F}, {0x000017DE, 0x000017DF}, -{0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180F}, {0x0000181A, 0x0000181F}, -{0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF}, {0x0000191F, 0x0000191F}, -{0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943}, {0x0000196E, 0x0000196F}, -{0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF}, {0x000019DB, 0x000019DD}, -{0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E}, {0x00001A8A, 0x00001A8F}, -{0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001AC1, 0x00001AFF}, {0x00001B4C, 0x00001B4F}, -{0x00001B7D, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A}, {0x00001C4A, 0x00001C4C}, -{0x00001C89, 0x00001C8F}, {0x00001CBB, 0x00001CBC}, {0x00001CC8, 0x00001CCF}, {0x00001CFB, 0x00001CFF}, -{0x00001DFA, 0x00001DFA}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47}, +{0x00000000, 0x0000001F}, {0x0000007F, 0x0000009F}, {0x000000AD, 0x000000AD}, {0x00000378, 0x00000379}, +{0x00000380, 0x00000383}, {0x0000038B, 0x0000038B}, {0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2}, +{0x00000530, 0x00000530}, {0x00000557, 0x00000558}, {0x0000058B, 0x0000058C}, {0x00000590, 0x00000590}, +{0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE}, {0x000005F5, 0x00000605}, {0x0000061C, 0x0000061C}, +{0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F}, {0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF}, +{0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F}, {0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D}, +{0x0000085F, 0x0000085F}, {0x0000086B, 0x0000086F}, {0x0000088F, 0x00000897}, {0x000008E2, 0x000008E2}, +{0x00000984, 0x00000984}, {0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9}, +{0x000009B1, 0x000009B1}, {0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6}, +{0x000009C9, 0x000009CA}, {0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE}, +{0x000009E4, 0x000009E5}, {0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E}, +{0x00000A11, 0x00000A12}, {0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34}, +{0x00000A37, 0x00000A37}, {0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46}, +{0x00000A49, 0x00000A4A}, {0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D}, +{0x00000A5F, 0x00000A65}, {0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E}, +{0x00000A92, 0x00000A92}, {0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4}, +{0x00000ABA, 0x00000ABB}, {0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF}, +{0x00000AD1, 0x00000ADF}, {0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00}, +{0x00000B04, 0x00000B04}, {0x00000B0D, 0x00000B0E}, {0x00000B11, 0x00000B12}, {0x00000B29, 0x00000B29}, +{0x00000B31, 0x00000B31}, {0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46}, +{0x00000B49, 0x00000B4A}, {0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E}, +{0x00000B64, 0x00000B65}, {0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D}, +{0x00000B91, 0x00000B91}, {0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D}, +{0x00000BA0, 0x00000BA2}, {0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD}, +{0x00000BC3, 0x00000BC5}, {0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6}, +{0x00000BD8, 0x00000BE5}, {0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11}, +{0x00000C29, 0x00000C29}, {0x00000C3A, 0x00000C3B}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49}, +{0x00000C4E, 0x00000C54}, {0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5C}, {0x00000C5E, 0x00000C5F}, +{0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76}, {0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91}, +{0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4}, {0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5}, +{0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4}, {0x00000CD7, 0x00000CDC}, {0x00000CDF, 0x00000CDF}, +{0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0}, {0x00000CF4, 0x00000CFF}, {0x00000D0D, 0x00000D0D}, +{0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45}, {0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53}, +{0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80}, {0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99}, +{0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC}, {0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9}, +{0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5}, {0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5}, +{0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00}, {0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80}, +{0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85}, {0x00000E8B, 0x00000E8B}, {0x00000EA4, 0x00000EA4}, +{0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF}, {0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7}, +{0x00000ECF, 0x00000ECF}, {0x00000EDA, 0x00000EDB}, {0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48}, +{0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98}, {0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD}, +{0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6}, {0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF}, +{0x00001249, 0x00001249}, {0x0000124E, 0x0000124F}, {0x00001257, 0x00001257}, {0x00001259, 0x00001259}, +{0x0000125E, 0x0000125F}, {0x00001289, 0x00001289}, {0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1}, +{0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF}, {0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7}, +{0x000012D7, 0x000012D7}, {0x00001311, 0x00001311}, {0x00001316, 0x00001317}, {0x0000135B, 0x0000135C}, +{0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F}, {0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF}, +{0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF}, {0x00001716, 0x0000171E}, {0x00001737, 0x0000173F}, +{0x00001754, 0x0000175F}, {0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F}, +{0x000017DE, 0x000017DF}, {0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180E}, +{0x0000181A, 0x0000181F}, {0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF}, +{0x0000191F, 0x0000191F}, {0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943}, +{0x0000196E, 0x0000196F}, {0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF}, +{0x000019DB, 0x000019DD}, {0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E}, +{0x00001A8A, 0x00001A8F}, {0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001ACF, 0x00001AFF}, +{0x00001B4D, 0x00001B4F}, {0x00001B7F, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A}, +{0x00001C4A, 0x00001C4C}, {0x00001C89, 0x00001C8F}, {0x00001CBB, 0x00001CBC}, {0x00001CC8, 0x00001CCF}, +{0x00001CFB, 0x00001CFF}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47}, {0x00001F4E, 0x00001F4F}, {0x00001F58, 0x00001F58}, {0x00001F5A, 0x00001F5A}, {0x00001F5C, 0x00001F5C}, {0x00001F5E, 0x00001F5E}, {0x00001F7E, 0x00001F7F}, {0x00001FB5, 0x00001FB5}, {0x00001FC5, 0x00001FC5}, {0x00001FD4, 0x00001FD5}, {0x00001FDC, 0x00001FDC}, {0x00001FF0, 0x00001FF1}, {0x00001FF5, 0x00001FF5}, {0x00001FFF, 0x00001FFF}, {0x0000200B, 0x0000200F}, {0x0000202A, 0x0000202E}, {0x00002060, 0x0000206F}, -{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C0, 0x000020CF}, +{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C1, 0x000020CF}, {0x000020F1, 0x000020FF}, {0x0000218C, 0x0000218F}, {0x00002427, 0x0000243F}, {0x0000244B, 0x0000245F}, -{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002C2F, 0x00002C2F}, {0x00002C5F, 0x00002C5F}, -{0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26}, {0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F}, -{0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E}, {0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7}, -{0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7}, {0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7}, -{0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7}, {0x00002DDF, 0x00002DDF}, {0x00002E53, 0x00002E7F}, -{0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF}, {0x00002FD6, 0x00002FEF}, {0x00002FFC, 0x00002FFF}, -{0x00003040, 0x00003040}, {0x00003097, 0x00003098}, {0x00003100, 0x00003104}, {0x00003130, 0x00003130}, -{0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EF}, {0x0000321F, 0x0000321F}, {0x00009FFD, 0x00009FFF}, -{0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F}, {0x0000A6F8, 0x0000A6FF}, -{0x0000A7C0, 0x0000A7C1}, {0x0000A7CB, 0x0000A7F4}, {0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F}, -{0x0000A878, 0x0000A87F}, {0x0000A8C6, 0x0000A8CD}, {0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E}, -{0x0000A97D, 0x0000A97F}, {0x0000A9CE, 0x0000A9CE}, {0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF}, -{0x0000AA37, 0x0000AA3F}, {0x0000AA4E, 0x0000AA4F}, {0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA}, -{0x0000AAF7, 0x0000AB00}, {0x0000AB07, 0x0000AB08}, {0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F}, -{0x0000AB27, 0x0000AB27}, {0x0000AB2F, 0x0000AB2F}, {0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF}, -{0x0000ABFA, 0x0000ABFF}, {0x0000D7A4, 0x0000D7AF}, {0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000F8FF}, +{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26}, +{0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F}, {0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E}, +{0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7}, {0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7}, +{0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7}, {0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7}, +{0x00002DDF, 0x00002DDF}, {0x00002E5E, 0x00002E7F}, {0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF}, +{0x00002FD6, 0x00002FEF}, {0x00002FFC, 0x00002FFF}, {0x00003040, 0x00003040}, {0x00003097, 0x00003098}, +{0x00003100, 0x00003104}, {0x00003130, 0x00003130}, {0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EF}, +{0x0000321F, 0x0000321F}, {0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F}, +{0x0000A6F8, 0x0000A6FF}, {0x0000A7CB, 0x0000A7CF}, {0x0000A7D2, 0x0000A7D2}, {0x0000A7D4, 0x0000A7D4}, +{0x0000A7DA, 0x0000A7F1}, {0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F}, {0x0000A878, 0x0000A87F}, +{0x0000A8C6, 0x0000A8CD}, {0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E}, {0x0000A97D, 0x0000A97F}, +{0x0000A9CE, 0x0000A9CE}, {0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF}, {0x0000AA37, 0x0000AA3F}, +{0x0000AA4E, 0x0000AA4F}, {0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA}, {0x0000AAF7, 0x0000AB00}, +{0x0000AB07, 0x0000AB08}, {0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F}, {0x0000AB27, 0x0000AB27}, +{0x0000AB2F, 0x0000AB2F}, {0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF}, {0x0000ABFA, 0x0000ABFF}, +{0x0000D7A4, 0x0000D7AF}, {0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000D7FF}, {0x0000E000, 0x0000F8FF}, {0x0000FA6E, 0x0000FA6F}, {0x0000FADA, 0x0000FAFF}, {0x0000FB07, 0x0000FB12}, {0x0000FB18, 0x0000FB1C}, {0x0000FB37, 0x0000FB37}, {0x0000FB3D, 0x0000FB3D}, {0x0000FB3F, 0x0000FB3F}, {0x0000FB42, 0x0000FB42}, -{0x0000FB45, 0x0000FB45}, {0x0000FBC2, 0x0000FBD2}, {0x0000FD40, 0x0000FD4F}, {0x0000FD90, 0x0000FD91}, -{0x0000FDC8, 0x0000FDEF}, {0x0000FDFE, 0x0000FDFF}, {0x0000FE1A, 0x0000FE1F}, {0x0000FE53, 0x0000FE53}, -{0x0000FE67, 0x0000FE67}, {0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, {0x0000FEFD, 0x0000FF00}, +{0x0000FB45, 0x0000FB45}, {0x0000FBC3, 0x0000FBD2}, {0x0000FD90, 0x0000FD91}, {0x0000FDC8, 0x0000FDCE}, +{0x0000FDD0, 0x0000FDEF}, {0x0000FE1A, 0x0000FE1F}, {0x0000FE53, 0x0000FE53}, {0x0000FE67, 0x0000FE67}, +{0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, {0x0000FEFD, 0x0000FEFE}, {0x0000FF00, 0x0000FF00}, {0x0000FFBF, 0x0000FFC1}, {0x0000FFC8, 0x0000FFC9}, {0x0000FFD0, 0x0000FFD1}, {0x0000FFD8, 0x0000FFD9}, {0x0000FFDD, 0x0000FFDF}, {0x0000FFE7, 0x0000FFE7}, {0x0000FFEF, 0x0000FFFB}, {0x0000FFFE, 0x0000FFFF}, {0x0001000C, 0x0001000C}, {0x00010027, 0x00010027}, {0x0001003B, 0x0001003B}, {0x0001003E, 0x0001003E}, @@ -476,82 +509,91 @@ const std::vector> unicode_ranges_control = { {0x00010324, 0x0001032C}, {0x0001034B, 0x0001034F}, {0x0001037B, 0x0001037F}, {0x0001039E, 0x0001039E}, {0x000103C4, 0x000103C7}, {0x000103D6, 0x000103FF}, {0x0001049E, 0x0001049F}, {0x000104AA, 0x000104AF}, {0x000104D4, 0x000104D7}, {0x000104FC, 0x000104FF}, {0x00010528, 0x0001052F}, {0x00010564, 0x0001056E}, -{0x00010570, 0x000105FF}, {0x00010737, 0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x000107FF}, -{0x00010806, 0x00010807}, {0x00010809, 0x00010809}, {0x00010836, 0x00010836}, {0x00010839, 0x0001083B}, -{0x0001083D, 0x0001083E}, {0x00010856, 0x00010856}, {0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF}, -{0x000108F3, 0x000108F3}, {0x000108F6, 0x000108FA}, {0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E}, -{0x00010940, 0x0001097F}, {0x000109B8, 0x000109BB}, {0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04}, -{0x00010A07, 0x00010A0B}, {0x00010A14, 0x00010A14}, {0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37}, -{0x00010A3B, 0x00010A3E}, {0x00010A49, 0x00010A4F}, {0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF}, -{0x00010AE7, 0x00010AEA}, {0x00010AF7, 0x00010AFF}, {0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57}, -{0x00010B73, 0x00010B77}, {0x00010B92, 0x00010B98}, {0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF}, -{0x00010C49, 0x00010C7F}, {0x00010CB3, 0x00010CBF}, {0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F}, -{0x00010D3A, 0x00010E5F}, {0x00010E7F, 0x00010E7F}, {0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF}, -{0x00010EB2, 0x00010EFF}, {0x00010F28, 0x00010F2F}, {0x00010F5A, 0x00010FAF}, {0x00010FCC, 0x00010FDF}, -{0x00010FF7, 0x00010FFF}, {0x0001104E, 0x00011051}, {0x00011070, 0x0001107E}, {0x000110BD, 0x000110BD}, -{0x000110C2, 0x000110CF}, {0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135}, -{0x00011148, 0x0001114F}, {0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF}, -{0x00011212, 0x00011212}, {0x0001123F, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289}, -{0x0001128E, 0x0001128E}, {0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF}, -{0x000112FA, 0x000112FF}, {0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312}, -{0x00011329, 0x00011329}, {0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A}, -{0x00011345, 0x00011346}, {0x00011349, 0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356}, -{0x00011358, 0x0001135C}, {0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF}, -{0x0001145C, 0x0001145C}, {0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F}, -{0x000115B6, 0x000115B7}, {0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F}, -{0x0001166D, 0x0001167F}, {0x000116B9, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C}, -{0x0001172C, 0x0001172F}, {0x00011740, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE}, -{0x00011907, 0x00011908}, {0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917}, -{0x00011936, 0x00011936}, {0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F}, -{0x000119A8, 0x000119A9}, {0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F}, -{0x00011AA3, 0x00011ABF}, {0x00011AF9, 0x00011BFF}, {0x00011C09, 0x00011C09}, {0x00011C37, 0x00011C37}, +{0x0001057B, 0x0001057B}, {0x0001058B, 0x0001058B}, {0x00010593, 0x00010593}, {0x00010596, 0x00010596}, +{0x000105A2, 0x000105A2}, {0x000105B2, 0x000105B2}, {0x000105BA, 0x000105BA}, {0x000105BD, 0x000105FF}, +{0x00010737, 0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x0001077F}, {0x00010786, 0x00010786}, +{0x000107B1, 0x000107B1}, {0x000107BB, 0x000107FF}, {0x00010806, 0x00010807}, {0x00010809, 0x00010809}, +{0x00010836, 0x00010836}, {0x00010839, 0x0001083B}, {0x0001083D, 0x0001083E}, {0x00010856, 0x00010856}, +{0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF}, {0x000108F3, 0x000108F3}, {0x000108F6, 0x000108FA}, +{0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E}, {0x00010940, 0x0001097F}, {0x000109B8, 0x000109BB}, +{0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04}, {0x00010A07, 0x00010A0B}, {0x00010A14, 0x00010A14}, +{0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37}, {0x00010A3B, 0x00010A3E}, {0x00010A49, 0x00010A4F}, +{0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF}, {0x00010AE7, 0x00010AEA}, {0x00010AF7, 0x00010AFF}, +{0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57}, {0x00010B73, 0x00010B77}, {0x00010B92, 0x00010B98}, +{0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF}, {0x00010C49, 0x00010C7F}, {0x00010CB3, 0x00010CBF}, +{0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F}, {0x00010D3A, 0x00010E5F}, {0x00010E7F, 0x00010E7F}, +{0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF}, {0x00010EB2, 0x00010EFC}, {0x00010F28, 0x00010F2F}, +{0x00010F5A, 0x00010F6F}, {0x00010F8A, 0x00010FAF}, {0x00010FCC, 0x00010FDF}, {0x00010FF7, 0x00010FFF}, +{0x0001104E, 0x00011051}, {0x00011076, 0x0001107E}, {0x000110BD, 0x000110BD}, {0x000110C3, 0x000110CF}, +{0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135}, {0x00011148, 0x0001114F}, +{0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF}, {0x00011212, 0x00011212}, +{0x00011242, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289}, {0x0001128E, 0x0001128E}, +{0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF}, {0x000112FA, 0x000112FF}, +{0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312}, {0x00011329, 0x00011329}, +{0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A}, {0x00011345, 0x00011346}, +{0x00011349, 0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356}, {0x00011358, 0x0001135C}, +{0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF}, {0x0001145C, 0x0001145C}, +{0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F}, {0x000115B6, 0x000115B7}, +{0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F}, {0x0001166D, 0x0001167F}, +{0x000116BA, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C}, {0x0001172C, 0x0001172F}, +{0x00011747, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE}, {0x00011907, 0x00011908}, +{0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917}, {0x00011936, 0x00011936}, +{0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F}, {0x000119A8, 0x000119A9}, +{0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F}, {0x00011AA3, 0x00011AAF}, +{0x00011AF9, 0x00011AFF}, {0x00011B0A, 0x00011BFF}, {0x00011C09, 0x00011C09}, {0x00011C37, 0x00011C37}, {0x00011C46, 0x00011C4F}, {0x00011C6D, 0x00011C6F}, {0x00011C90, 0x00011C91}, {0x00011CA8, 0x00011CA8}, {0x00011CB7, 0x00011CFF}, {0x00011D07, 0x00011D07}, {0x00011D0A, 0x00011D0A}, {0x00011D37, 0x00011D39}, {0x00011D3B, 0x00011D3B}, {0x00011D3E, 0x00011D3E}, {0x00011D48, 0x00011D4F}, {0x00011D5A, 0x00011D5F}, {0x00011D66, 0x00011D66}, {0x00011D69, 0x00011D69}, {0x00011D8F, 0x00011D8F}, {0x00011D92, 0x00011D92}, -{0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011FAF}, {0x00011FB1, 0x00011FBF}, -{0x00011FF2, 0x00011FFE}, {0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F}, -{0x00012544, 0x00012FFF}, {0x0001342F, 0x000143FF}, {0x00014647, 0x000167FF}, {0x00016A39, 0x00016A3F}, -{0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, {0x00016A70, 0x00016ACF}, {0x00016AEE, 0x00016AEF}, -{0x00016AF6, 0x00016AFF}, {0x00016B46, 0x00016B4F}, {0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62}, -{0x00016B78, 0x00016B7C}, {0x00016B90, 0x00016E3F}, {0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E}, -{0x00016F88, 0x00016F8E}, {0x00016FA0, 0x00016FDF}, {0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF}, -{0x000187F8, 0x000187FF}, {0x00018CD6, 0x00018CFF}, {0x00018D09, 0x0001AFFF}, {0x0001B11F, 0x0001B14F}, -{0x0001B153, 0x0001B163}, {0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F}, -{0x0001BC7D, 0x0001BC7F}, {0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CFFF}, -{0x0001D0F6, 0x0001D0FF}, {0x0001D127, 0x0001D128}, {0x0001D173, 0x0001D17A}, {0x0001D1E9, 0x0001D1FF}, -{0x0001D246, 0x0001D2DF}, {0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF}, -{0x0001D455, 0x0001D455}, {0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4}, -{0x0001D4A7, 0x0001D4A8}, {0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC}, -{0x0001D4C4, 0x0001D4C4}, {0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515}, -{0x0001D51D, 0x0001D51D}, {0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545}, -{0x0001D547, 0x0001D549}, {0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD}, -{0x0001DA8C, 0x0001DA9A}, {0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DFFF}, {0x0001E007, 0x0001E007}, -{0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, {0x0001E025, 0x0001E025}, {0x0001E02B, 0x0001E0FF}, -{0x0001E12D, 0x0001E12F}, {0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E2BF}, -{0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E7FF}, {0x0001E8C5, 0x0001E8C6}, {0x0001E8D7, 0x0001E8FF}, -{0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, {0x0001E960, 0x0001EC70}, {0x0001ECB5, 0x0001ED00}, -{0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, {0x0001EE20, 0x0001EE20}, {0x0001EE23, 0x0001EE23}, -{0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, {0x0001EE33, 0x0001EE33}, {0x0001EE38, 0x0001EE38}, -{0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 0x0001EE41}, {0x0001EE43, 0x0001EE46}, {0x0001EE48, 0x0001EE48}, -{0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, {0x0001EE50, 0x0001EE50}, {0x0001EE53, 0x0001EE53}, -{0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, {0x0001EE5A, 0x0001EE5A}, {0x0001EE5C, 0x0001EE5C}, -{0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, {0x0001EE63, 0x0001EE63}, {0x0001EE65, 0x0001EE66}, -{0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, {0x0001EE78, 0x0001EE78}, {0x0001EE7D, 0x0001EE7D}, -{0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, {0x0001EE9C, 0x0001EEA0}, {0x0001EEA4, 0x0001EEA4}, -{0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, {0x0001EEF2, 0x0001EFFF}, {0x0001F02C, 0x0001F02F}, -{0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, {0x0001F0C0, 0x0001F0C0}, {0x0001F0D0, 0x0001F0D0}, -{0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, {0x0001F203, 0x0001F20F}, {0x0001F23C, 0x0001F23F}, -{0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, {0x0001F266, 0x0001F2FF}, {0x0001F6D8, 0x0001F6DF}, -{0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, {0x0001F774, 0x0001F77F}, {0x0001F7D9, 0x0001F7DF}, -{0x0001F7EC, 0x0001F7FF}, {0x0001F80C, 0x0001F80F}, {0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F}, -{0x0001F888, 0x0001F88F}, {0x0001F8AE, 0x0001F8AF}, {0x0001F8B2, 0x0001F8FF}, {0x0001F979, 0x0001F979}, -{0x0001F9CC, 0x0001F9CC}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, {0x0001FA75, 0x0001FA77}, -{0x0001FA7B, 0x0001FA7F}, {0x0001FA87, 0x0001FA8F}, {0x0001FAA9, 0x0001FAAF}, {0x0001FAB7, 0x0001FABF}, -{0x0001FAC3, 0x0001FACF}, {0x0001FAD7, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, {0x0001FBCB, 0x0001FBEF}, -{0x0001FBFA, 0x0001FFFF}, {0x0002A6DE, 0x0002A6FF}, {0x0002B735, 0x0002B73F}, {0x0002B81E, 0x0002B81F}, -{0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002F7FF}, {0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x000E00FF}, -{0x000E01F0, 0x0010FFFF}, +{0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011EFF}, {0x00011F11, 0x00011F11}, +{0x00011F3B, 0x00011F3D}, {0x00011F5A, 0x00011FAF}, {0x00011FB1, 0x00011FBF}, {0x00011FF2, 0x00011FFE}, +{0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F}, {0x00012544, 0x00012F8F}, +{0x00012FF3, 0x00012FFF}, {0x00013430, 0x0001343F}, {0x00013456, 0x000143FF}, {0x00014647, 0x000167FF}, +{0x00016A39, 0x00016A3F}, {0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, {0x00016ABF, 0x00016ABF}, +{0x00016ACA, 0x00016ACF}, {0x00016AEE, 0x00016AEF}, {0x00016AF6, 0x00016AFF}, {0x00016B46, 0x00016B4F}, +{0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62}, {0x00016B78, 0x00016B7C}, {0x00016B90, 0x00016E3F}, +{0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E}, {0x00016F88, 0x00016F8E}, {0x00016FA0, 0x00016FDF}, +{0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF}, {0x000187F8, 0x000187FF}, {0x00018CD6, 0x00018CFF}, +{0x00018D09, 0x0001AFEF}, {0x0001AFF4, 0x0001AFF4}, {0x0001AFFC, 0x0001AFFC}, {0x0001AFFF, 0x0001AFFF}, +{0x0001B123, 0x0001B131}, {0x0001B133, 0x0001B14F}, {0x0001B153, 0x0001B154}, {0x0001B156, 0x0001B163}, +{0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F}, {0x0001BC7D, 0x0001BC7F}, +{0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CEFF}, {0x0001CF2E, 0x0001CF2F}, +{0x0001CF47, 0x0001CF4F}, {0x0001CFC4, 0x0001CFFF}, {0x0001D0F6, 0x0001D0FF}, {0x0001D127, 0x0001D128}, +{0x0001D173, 0x0001D17A}, {0x0001D1EB, 0x0001D1FF}, {0x0001D246, 0x0001D2BF}, {0x0001D2D4, 0x0001D2DF}, +{0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF}, {0x0001D455, 0x0001D455}, +{0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4}, {0x0001D4A7, 0x0001D4A8}, +{0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC}, {0x0001D4C4, 0x0001D4C4}, +{0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515}, {0x0001D51D, 0x0001D51D}, +{0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545}, {0x0001D547, 0x0001D549}, +{0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD}, {0x0001DA8C, 0x0001DA9A}, +{0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DEFF}, {0x0001DF1F, 0x0001DF24}, {0x0001DF2B, 0x0001DFFF}, +{0x0001E007, 0x0001E007}, {0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, {0x0001E025, 0x0001E025}, +{0x0001E02B, 0x0001E02F}, {0x0001E06E, 0x0001E08E}, {0x0001E090, 0x0001E0FF}, {0x0001E12D, 0x0001E12F}, +{0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E28F}, {0x0001E2AF, 0x0001E2BF}, +{0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E4CF}, {0x0001E4FA, 0x0001E7DF}, {0x0001E7E7, 0x0001E7E7}, +{0x0001E7EC, 0x0001E7EC}, {0x0001E7EF, 0x0001E7EF}, {0x0001E7FF, 0x0001E7FF}, {0x0001E8C5, 0x0001E8C6}, +{0x0001E8D7, 0x0001E8FF}, {0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, {0x0001E960, 0x0001EC70}, +{0x0001ECB5, 0x0001ED00}, {0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, {0x0001EE20, 0x0001EE20}, +{0x0001EE23, 0x0001EE23}, {0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, {0x0001EE33, 0x0001EE33}, +{0x0001EE38, 0x0001EE38}, {0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 0x0001EE41}, {0x0001EE43, 0x0001EE46}, +{0x0001EE48, 0x0001EE48}, {0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, {0x0001EE50, 0x0001EE50}, +{0x0001EE53, 0x0001EE53}, {0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, {0x0001EE5A, 0x0001EE5A}, +{0x0001EE5C, 0x0001EE5C}, {0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, {0x0001EE63, 0x0001EE63}, +{0x0001EE65, 0x0001EE66}, {0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, {0x0001EE78, 0x0001EE78}, +{0x0001EE7D, 0x0001EE7D}, {0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, {0x0001EE9C, 0x0001EEA0}, +{0x0001EEA4, 0x0001EEA4}, {0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, {0x0001EEF2, 0x0001EFFF}, +{0x0001F02C, 0x0001F02F}, {0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, {0x0001F0C0, 0x0001F0C0}, +{0x0001F0D0, 0x0001F0D0}, {0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, {0x0001F203, 0x0001F20F}, +{0x0001F23C, 0x0001F23F}, {0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, {0x0001F266, 0x0001F2FF}, +{0x0001F6D8, 0x0001F6DB}, {0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, {0x0001F777, 0x0001F77A}, +{0x0001F7DA, 0x0001F7DF}, {0x0001F7EC, 0x0001F7EF}, {0x0001F7F1, 0x0001F7FF}, {0x0001F80C, 0x0001F80F}, +{0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F}, {0x0001F888, 0x0001F88F}, {0x0001F8AE, 0x0001F8AF}, +{0x0001F8B2, 0x0001F8FF}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, {0x0001FA7D, 0x0001FA7F}, +{0x0001FA89, 0x0001FA8F}, {0x0001FABE, 0x0001FABE}, {0x0001FAC6, 0x0001FACD}, {0x0001FADC, 0x0001FADF}, +{0x0001FAE9, 0x0001FAEF}, {0x0001FAF9, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, {0x0001FBCB, 0x0001FBEF}, +{0x0001FBFA, 0x0001FFFF}, {0x0002A6E0, 0x0002A6FF}, {0x0002B73A, 0x0002B73F}, {0x0002B81E, 0x0002B81F}, +{0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002F7FF}, {0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x0003134F}, +{0x000323B0, 0x000E00FF}, {0x000E01F0, 0x0010FFFF}, }; const std::multimap unicode_map_nfd = { diff --git a/unicode-data.h b/unicode-data.h index cb9dd8aa5403c..3cf84117cf164 100644 --- a/unicode-data.h +++ b/unicode-data.h @@ -5,7 +5,7 @@ #include #include -extern const std::vector> unicode_ranges_digit; +extern const std::vector> unicode_ranges_number; extern const std::vector> unicode_ranges_letter; extern const std::vector> unicode_ranges_whitespace; extern const std::vector> unicode_ranges_accent_mark; diff --git a/unicode.cpp b/unicode.cpp index f2ccda05ff8c3..955c569650329 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -110,9 +110,9 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) static std::unordered_map unicode_cpt_type_map() { std::unordered_map cpt_types; - for (auto p : unicode_ranges_digit) { + for (auto p : unicode_ranges_number) { for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_DIGIT; + cpt_types[i] = CODEPOINT_TYPE_NUMBER; } } for (auto p : unicode_ranges_letter) { @@ -300,13 +300,13 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t collecting_letter = true; collecting = true; } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) { collecting_numeric = true; collecting = true; } else if ( - ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || - (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) + ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || + (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) ) { collecting_special = true; collecting = true; @@ -323,13 +323,13 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) { split_condition = true; } - else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) { + else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) { split_condition = true; } - else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { + else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { split_condition = true; } - else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { + else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) { split_condition = true; } } @@ -524,19 +524,19 @@ char32_t unicode_tolower(char32_t cp) { std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { // unicode categories static const std::map k_ucat_enum = { - { "\\p{N}", CODEPOINT_TYPE_DIGIT }, + { "\\p{N}", CODEPOINT_TYPE_NUMBER }, { "\\p{L}", CODEPOINT_TYPE_LETTER }, { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION }, }; static const std::map k_ucat_cpt = { - { CODEPOINT_TYPE_DIGIT, 0xD1 }, + { CODEPOINT_TYPE_NUMBER, 0xD1 }, { CODEPOINT_TYPE_LETTER, 0xD2 }, { CODEPOINT_TYPE_PUNCTUATION, 0xD3 }, }; static const std::map k_ucat_map = { - { CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9 + { CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9 { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} }; diff --git a/unicode.h b/unicode.h index ce2bcef5a24b1..e9026dc81493f 100644 --- a/unicode.h +++ b/unicode.h @@ -5,7 +5,7 @@ #include #define CODEPOINT_TYPE_UNIDENTIFIED 0 -#define CODEPOINT_TYPE_DIGIT 1 +#define CODEPOINT_TYPE_NUMBER 1 #define CODEPOINT_TYPE_LETTER 2 #define CODEPOINT_TYPE_WHITESPACE 3 #define CODEPOINT_TYPE_ACCENT_MARK 4 From 03fb8a002df2e96104f9e06de9c78d2a8ed91e92 Mon Sep 17 00:00:00 2001 From: maor-ps <154728172+maor-ps@users.noreply.github.com> Date: Sat, 4 May 2024 12:06:40 +0300 Subject: [PATCH 09/10] If first token generated from the server is the stop word the server will crash (#7038) This will reproduce the issue in llama13b { 'prompt': 'Q: hello world \nA: ', 'stop': ['\n'], 'temperature': 0.0, 'n_predict': 10, 'cache_prompt': True, 'n_probs': 10 } --- examples/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f60530cf3db56..ff0814b2f28bf 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1383,9 +1383,10 @@ struct server_context { if (!slot.params.stream && slot.stopped_word) { const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); + size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); probs = std::vector( slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - stop_word_toks.size()); + slot.generated_token_probs.end() - safe_offset); } else { probs = std::vector( slot.generated_token_probs.begin(), From fcd84a0f5a584ab5271745d7ffef21c8a6bc7b0c Mon Sep 17 00:00:00 2001 From: viric Date: Sat, 4 May 2024 15:26:53 +0200 Subject: [PATCH 10/10] Fix Linux /sys cpu path to guess number of cores (#7064) --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 243b88abf1aab..467fb014eedb0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -76,7 +76,7 @@ int32_t get_num_physical_cores() { // enumerate the set of thread siblings, num entries is num cores std::unordered_set siblings; for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { - std::ifstream thread_siblings("/sys/devices/system/cpu" + std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); if (!thread_siblings.is_open()) { break; // no more cpus