Skip to content

Commit

Permalink
Speed up hash computing, issue zufuliu#289.
Browse files Browse the repository at this point in the history
  • Loading branch information
zufuliu committed Mar 4, 2021
1 parent 9e8d68d commit ab0f491
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 31 deletions.
70 changes: 44 additions & 26 deletions scintilla/scripts/LaTeXInput.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ def json_load(path):
def djb2_hash(buf, multiplier):
    """Return the 32-bit multiplicative hash of ``buf``.

    Each element of ``buf`` is folded in as ``value * multiplier + ch``,
    starting from zero.

    Args:
        buf: Iterable of integers to hash (presumably a ``bytes`` key;
            confirm against the caller).
        multiplier: Integer multiplier applied at every step.

    Returns:
        The hash reduced to the range ``0 .. 2**32 - 1``.
    """
    value = 0
    for ch in buf:
        # Python integers never overflow, so the accumulation can run
        # unmasked; reducing once at the end yields the same result as
        # masking on every iteration, with less work per element.
        value = value * multiplier + ch
    # masked to match C/C++ unsigned integer overflow wrap around
    return value & (2**32 - 1)


def prepare_input_data(input_map, path):
Expand All @@ -79,28 +79,51 @@ def prepare_input_data(input_map, path):
info['magic'] = len(buf) | (buf[0] << 8)
return input_map

def dump_hash_param(hash_param, multiplier_list, path):
    """Write the hash parameters of the selected multipliers to a log file.

    For every multiplier of ``hash_param`` that also occurs in
    ``multiplier_list`` one section is written: the multiplier on its own
    line, followed by its items in sorted order, one tab-indented item per
    line.  A blank line separates runs of items whose first field differs.

    Args:
        hash_param: Mapping of multiplier to a list of indexable items
            (tuples in the visible caller).  Each selected list is sorted
            in place.
        multiplier_list: Multipliers whose sections are written; all other
            entries of ``hash_param`` are skipped.
        path: Destination file; it is overwritten, encoded as UTF-8 and
            written with ``\\n`` line endings.
    """
    # Build the lookup once instead of scanning the list per multiplier.
    selected = set(multiplier_list)
    with open(path, 'w', encoding='utf-8', newline='\n') as fd:
        for multiplier, items in hash_param.items():
            if multiplier not in selected:
                continue
            output = [str(multiplier) + '\n']
            # Sorting groups items that share the same first field.
            items.sort()
            previous = None
            for index, item in enumerate(items):
                current = item[0]
                # An empty list simply yields a header-only section; the
                # separator is only emitted between groups, never first.
                if index and current != previous:
                    output.append('\n')
                previous = current
                output.append('\t' + str(item) + '\n')
            fd.write(''.join(output))

def find_hash_param(input_map, multiplier_list, max_hash_size):
key_list = [(info['hash_key'], info['magic']) for info in input_map.values()]
key_list = [info['hash_key'] for info in input_map.values()]
magic_list = [info['magic'] for info in input_map.values()]

hash_size = len(input_map) // 15
hash_param = {}
for multiplier in multiplier_list:
raw_hash = [djb2_hash(key, multiplier) for key in key_list]
for size in range(hash_size, max_hash_size + 1):
hash_list = [hash_key % size for hash_key in raw_hash]
distribution = [0] * size
hash_map = {}
for key, magic in key_list:
hash_key = djb2_hash(key, multiplier) % size
for hash_key in hash_list:
distribution[hash_key] += 1
if hash_key in hash_map:
hash_map[hash_key].append(magic)
else:
hash_map[hash_key] = [magic]

collision = max(distribution)
if collision < 16:
comparison = 1 + max(len(items) - len(set(items)) for items in hash_map.values())
if comparison <= 3: # maximum string comparison
hash_map = {}
for index, magic in enumerate(magic_list):
hash_key = hash_list[index]
if hash_key in hash_map:
hash_map[hash_key].append(magic)
else:
hash_map[hash_key] = [magic]

comparison = [len(items) - len(set(items)) for items in hash_map.values()]
max_comparison = max(comparison)
if max_comparison < 3: # maximum string comparison
used = sum(item != 0 for item in distribution)
var = round(variance(distribution), 2)
item = (size, collision, comparison, var)
item = (max_comparison + 1, size, used, collision, sum(comparison), var)
if multiplier in hash_param:
hash_param[multiplier].append(item)
else:
Expand Down Expand Up @@ -213,23 +236,18 @@ def update_all_latex_input_data(latex_map=None, emoji_map=None):
multiplier_list = list(set(multiplier_list))
multiplier_list.sort()

start_time = time.perf_counter_ns()
latex_hash = find_hash_param(latex_map, multiplier_list, 512)
emoji_hash = find_hash_param(emoji_map, multiplier_list, 256)
end_time = time.perf_counter_ns()
duration = (end_time - start_time)/1e6
print('hash param time:', duration)

multiplier_list = list(set(latex_hash.keys()) & set(emoji_hash.keys()))
multiplier_list.sort()
print('hash multiplier:', multiplier_list)
with open('latex_hash.log', 'w', encoding='utf-8', newline='\n') as fd:
for multiplier, items in latex_hash.items():
if multiplier in multiplier_list:
fd.write(f'{multiplier}\n')
line = '\n'.join('\t' + str(item) for item in items)
fd.write(line)
with open('emoji_hash.log', 'w', encoding='utf-8', newline='\n') as fd:
for multiplier, items in emoji_hash.items():
if multiplier in multiplier_list:
fd.write(f'{multiplier}\n')
line = '\n'.join('\t' + str(item) for item in items)
fd.write(line)
dump_hash_param(latex_hash, multiplier_list, 'latex_hash.log')
dump_hash_param(emoji_hash, multiplier_list, 'emoji_hash.log')

update_latex_input_data('LaTeX', latex_map, 33, 290)
update_latex_input_data('Emoji', emoji_map, 33, 164)
Expand Down
6 changes: 3 additions & 3 deletions scintilla/win32/LaTeXInput.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ constexpr uint32_t array_size([[maybe_unused]] const T (&a)[N]) noexcept {
uint32_t GetLaTeXInputUnicodeCharacter(const char *sequence, size_t length) {
#if EnableLaTeXLikeEmojiInput
static_assert(LaTeXHashMultiplier == EmojiHashMultiplier);
const bool latex = *sequence != ':';
if (latex) {
const char firstChar = *sequence;
if (firstChar != ':') {
if (length < MinLaTeXInputSequenceLength || length > MaxLaTeXInputSequenceLength) {
return 0;
}
Expand Down Expand Up @@ -65,7 +65,7 @@ uint32_t GetLaTeXInputUnicodeCharacter(const char *sequence, size_t length) {
value = value*LaTeXHashMultiplier + static_cast<uint8_t>(*ptr++);
} while (ptr < end);
#if EnableLaTeXLikeEmojiInput
if (latex) {
if (firstChar != ':') {
value %= array_size(LaTeXHashTable);
value = LaTeXHashTable[value];
sequenceString = LaTeXInputSequenceString;
Expand Down
4 changes: 2 additions & 2 deletions scintilla/win32/LaTeXInputData.h
Original file line number Diff line number Diff line change
Expand Up @@ -2802,7 +2802,7 @@ static const InputSequence LaTeXInputSequenceList[] = {
{0x760b, 0x5130, 0x25B5}, // ▵, \vartriangle, White Up-Pointing Small Triangle / White Up Pointing Small Triangle
{0x760b, 0x513c, 0x20D2}, // ⃒, \vertoverlay, Combining Long Vertical Line Overlay / Non-Spacing Long Vertical Bar Overlay
{0x7610, 0x5148, 0x22B3}, // ⊳, \vartriangleright, Contains As Normal Subgroup
//LaTeX list--Autogenerated -- start of section automatically generated
//LaTeX list--Autogenerated -- end of section automatically generated
};

#if EnableLaTeXLikeEmojiInput
Expand Down Expand Up @@ -4352,7 +4352,7 @@ static const InputSequence EmojiInputSequenceList[] = {
{0x730a, 0x3cf7, 0xDF67'D83C}, // U+1F367, 🍧, \:shaved_ice:, Shaved Ice
{0x7409, 0x3d02, 0xDDEA'D83E}, // U+1F9EA, 🧪, \:test_tube:, Test Tube
{0x7713, 0x3d0c, 0xDC6D'D83D}, // U+1F46D, 👭, \:women_holding_hands:, Two Women Holding Hands
//Emoji list--Autogenerated -- start of section automatically generated
//Emoji list--Autogenerated -- end of section automatically generated
};
#endif // EnableLaTeXLikeEmojiInput

Expand Down

0 comments on commit ab0f491

Please sign in to comment.