From ab0f4916f701ed63edb8247c15a99bb610f55d9a Mon Sep 17 00:00:00 2001 From: zufuliu Date: Thu, 4 Mar 2021 22:12:38 +0800 Subject: [PATCH] Speed up hash computing, issue #289. --- scintilla/scripts/LaTeXInput.py | 70 ++++++++++++++++++++------------ scintilla/win32/LaTeXInput.cxx | 6 +-- scintilla/win32/LaTeXInputData.h | 4 +- 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/scintilla/scripts/LaTeXInput.py b/scintilla/scripts/LaTeXInput.py index 4097ee603a..598c7af307 100644 --- a/scintilla/scripts/LaTeXInput.py +++ b/scintilla/scripts/LaTeXInput.py @@ -65,9 +65,9 @@ def json_load(path): def djb2_hash(buf, multiplier): value = 0 for ch in buf: - # masked to match C/C++ unsigned integer overflow wrap around - value = (value * multiplier + ch) & (2**32 - 1) - return value + value = value * multiplier + ch + # masked to match C/C++ unsigned integer overflow wrap around + return value & (2**32 - 1) def prepare_input_data(input_map, path): @@ -79,28 +79,51 @@ def prepare_input_data(input_map, path): info['magic'] = len(buf) | (buf[0] << 8) return input_map +def dump_hash_param(hash_param, multiplier_list, path): + with open(path, 'w', encoding='utf-8', newline='\n') as fd: + for multiplier, items in hash_param.items(): + if multiplier in multiplier_list: + output = [str(multiplier) + '\n'] + items.sort() + previous = items[0][0] + for item in items: + current = item[0] + if current != previous: + previous = current + output.append('\n') + output.append('\t' + str(item) + '\n') + fd.write(''.join(output)) + def find_hash_param(input_map, multiplier_list, max_hash_size): - key_list = [(info['hash_key'], info['magic']) for info in input_map.values()] + key_list = [info['hash_key'] for info in input_map.values()] + magic_list = [info['magic'] for info in input_map.values()] + hash_size = len(input_map) // 15 hash_param = {} for multiplier in multiplier_list: + raw_hash = [djb2_hash(key, multiplier) for key in key_list] for size in range(hash_size, max_hash_size + 1): + hash_list = [hash_key % size for hash_key in raw_hash] distribution = [0] * size - hash_map = {} - for key, magic in key_list: - hash_key = djb2_hash(key, multiplier) % size + for hash_key in hash_list: distribution[hash_key] += 1 - if hash_key in hash_map: - hash_map[hash_key].append(magic) - else: - hash_map[hash_key] = [magic] collision = max(distribution) if collision < 16: - comparison = 1 + max(len(items) - len(set(items)) for items in hash_map.values()) - if comparison <= 3: # maximum string comparison + hash_map = {} + for index, magic in enumerate(magic_list): + hash_key = hash_list[index] + if hash_key in hash_map: + hash_map[hash_key].append(magic) + else: + hash_map[hash_key] = [magic] + + comparison = [len(items) - len(set(items)) for items in hash_map.values()] + max_comparison = max(comparison) + if max_comparison < 3: # maximum string comparison + used = sum(item != 0 for item in distribution) var = round(variance(distribution), 2) - item = (size, collision, comparison, var) + item = (max_comparison + 1, size, used, collision, sum(comparison), var) if multiplier in hash_param: hash_param[multiplier].append(item) else: @@ -213,23 +236,18 @@ def update_all_latex_input_data(latex_map=None, emoji_map=None): multiplier_list = list(set(multiplier_list)) multiplier_list.sort() + start_time = time.perf_counter_ns() latex_hash = find_hash_param(latex_map, multiplier_list, 512) emoji_hash = find_hash_param(emoji_map, multiplier_list, 256) + end_time = time.perf_counter_ns() + duration = (end_time - start_time)/1e6 + print('hash param time:', duration) + multiplier_list = list(set(latex_hash.keys()) & set(emoji_hash.keys())) multiplier_list.sort() print('hash multiplier:', multiplier_list) - with open('latex_hash.log', 'w', encoding='utf-8', newline='\n') as fd: - for multiplier, items in latex_hash.items(): - if multiplier in multiplier_list: - fd.write(f'{multiplier}\n') - line = '\n'.join('\t' + str(item) for item in items) - fd.write(line) - with open('emoji_hash.log', 'w', encoding='utf-8', newline='\n') as fd: - for multiplier, items in emoji_hash.items(): - if multiplier in multiplier_list: - fd.write(f'{multiplier}\n') - line = '\n'.join('\t' + str(item) for item in items) - fd.write(line) + dump_hash_param(latex_hash, multiplier_list, 'latex_hash.log') + dump_hash_param(emoji_hash, multiplier_list, 'emoji_hash.log') update_latex_input_data('LaTeX', latex_map, 33, 290) update_latex_input_data('Emoji', emoji_map, 33, 164) diff --git a/scintilla/win32/LaTeXInput.cxx b/scintilla/win32/LaTeXInput.cxx index 7886015731..d997da1af0 100644 --- a/scintilla/win32/LaTeXInput.cxx +++ b/scintilla/win32/LaTeXInput.cxx @@ -29,8 +29,8 @@ constexpr uint32_t array_size([[maybe_unused]] const T (&a)[N]) noexcept { uint32_t GetLaTeXInputUnicodeCharacter(const char *sequence, size_t length) { #if EnableLaTeXLikeEmojiInput static_assert(LaTeXHashMultiplier == EmojiHashMultiplier); - const bool latex = *sequence != ':'; - if (latex) { + const char firstChar = *sequence; + if (firstChar != ':') { if (length < MinLaTeXInputSequenceLength || length > MaxLaTeXInputSequenceLength) { return 0; } @@ -65,7 +65,7 @@ uint32_t GetLaTeXInputUnicodeCharacter(const char *sequence, size_t length) { value = value*LaTeXHashMultiplier + static_cast(*ptr++); } while (ptr < end); #if EnableLaTeXLikeEmojiInput - if (latex) { + if (firstChar != ':') { value %= array_size(LaTeXHashTable); value = LaTeXHashTable[value]; sequenceString = LaTeXInputSequenceString; diff --git a/scintilla/win32/LaTeXInputData.h b/scintilla/win32/LaTeXInputData.h index e189ae4ebb..e1ed874f96 100644 --- a/scintilla/win32/LaTeXInputData.h +++ b/scintilla/win32/LaTeXInputData.h @@ -2802,7 +2802,7 @@ static const InputSequence LaTeXInputSequenceList[] = { {0x760b, 0x5130, 0x25B5}, // ▵, \vartriangle, White Up-Pointing Small Triangle / White Up Pointing Small Triangle {0x760b, 0x513c, 0x20D2}, // ⃒, \vertoverlay, Combining Long Vertical Line Overlay / Non-Spacing Long Vertical Bar Overlay {0x7610, 0x5148, 0x22B3}, // ⊳, \vartriangleright, Contains As Normal Subgroup -//LaTeX list--Autogenerated -- start of section automatically generated +//LaTeX list--Autogenerated -- end of section automatically generated }; #if EnableLaTeXLikeEmojiInput @@ -4352,7 +4352,7 @@ static const InputSequence EmojiInputSequenceList[] = { {0x730a, 0x3cf7, 0xDF67'D83C}, // U+1F367, 🍧, \:shaved_ice:, Shaved Ice {0x7409, 0x3d02, 0xDDEA'D83E}, // U+1F9EA, 🧪, \:test_tube:, Test Tube {0x7713, 0x3d0c, 0xDC6D'D83D}, // U+1F46D, 👭, \:women_holding_hands:, Two Women Holding Hands -//Emoji list--Autogenerated -- start of section automatically generated +//Emoji list--Autogenerated -- end of section automatically generated }; #endif // EnableLaTeXLikeEmojiInput