perf: compact lexicon entries to take less RAM
Store the lexicon in joined groups of 16 entries to reduce the str object
memory overhead.

Experimented with various block sizes to see the memory impact, measured
by running `import g2p; g2p.get_arpabet_lang()`:
 - original: 71MB
 - blocks of 4: 59MB
 - blocks of 16: 56MB
 - blocks of 256: 55MB
I decided the 15MB RAM saving was worth it for blocks of 16; the gain
beyond that is trivial and not worth pursuing.
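To see where the savings come from, here is a rough stand-alone sketch, not from the commit itself; the entries are made up and only the block-joining idea is the real technique. On CPython every str object carries tens of bytes of header beyond its characters, so sixteen short entries stored as one joined string cost much less than sixteen separate objects:

```python
import sys

# Illustrative entries, not the real lexicon data: each CPython str
# carries roughly 50-80 bytes of object header beyond its characters.
entries = [f"word{i}\0w o r d {i}" for i in range(16)]
separate = sum(sys.getsizeof(e) for e in entries)   # 16 headers
joined = sys.getsizeof("\1".join(entries))          # 1 header
print(separate, joined)  # the joined block is much smaller in total
```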

In terms of speed, the original code and blocks of 16 are the same, at
least within measurement error. Speed was measured by running
`g2p convert --file en.txt eng eng-ipa`, where en.txt is a file
containing all the words in the cmudict lexicon: the original and blocks
of 16 both took 20-21 seconds depending on the run.
At blocks of 256, I was getting 23 seconds: not a big difference, but
measurable, for no significant memory gain.
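
For context, a minimal self-contained sketch of the lookup scheme the diff below implements. `compact` and `lookup` are illustrative stand-ins for the real `compact_alignments` and `find_alignment`, and the toy entries are made up:

```python
from bisect import bisect_left

_JOINER = "\0"        # separates word from alignment; sorts before everything
_BLOCK_JOINER = "\1"  # separates entries within a block; just absent from data

def compact(entries, block_size=16):
    """Join sorted entries into blocks of block_size strings."""
    entries = sorted(entries)
    return [
        _BLOCK_JOINER.join(entries[i : i + block_size])
        for i in range(0, len(entries), block_size)
    ]

def lookup(blocks, word):
    """Two-level bisect: find the block, then the entry within it."""
    i = bisect_left(blocks, word)
    entry = _JOINER
    if i != len(blocks) and blocks[i].startswith(word + _JOINER):
        # word is the first entry of block i
        entry, _, _ = blocks[i].partition(_BLOCK_JOINER)
    elif i > 0:
        # otherwise word can only be inside the previous block
        block = blocks[i - 1].split(_BLOCK_JOINER)
        j = bisect_left(block, word)
        if j != len(block):
            entry = block[j]
    k, _, v = entry.partition(_JOINER)
    return v if k == word else None

blocks = compact(["cat\0k a t", "dog\0d o g", "fish\0f i sh"], block_size=2)
assert lookup(blocks, "dog") == "d o g"
assert lookup(blocks, "bird") is None
```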
joanise committed Sep 13, 2024
1 parent b315a6c commit e605ae5
Showing 1 changed file with 35 additions and 6 deletions: g2p/mappings/utils.py
@@ -13,7 +13,18 @@
 from copy import deepcopy
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Pattern, Tuple, TypeVar, Union, cast
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+    Pattern,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+)
 
 import regex as re
 import yaml
@@ -495,18 +506,36 @@ def get_alignment_sequence(alignment: str, delimiter="") -> List[Tuple[int, str]]:
 # The joiner between key and value must be 0 so that it sorts before all
 # characters and thus won't break bisect_left()
 _JOINER = "\0"
+# For compacting a group of lexicon entries into one string.
+# This just has to be something that does not occur in the lexicon data
+_BLOCK_JOINER = "\1"
 
 
 def find_alignment(alignments: List[str], word: str) -> List[Tuple[int, str]]:
     """Given a sorted list of (word, alignment), find word and return its parsed alignment."""
     i = bisect_left(alignments, word)
-    if i != len(alignments):
-        k, v = alignments[i].split(_JOINER, maxsplit=1)
-        if k == word:
-            return get_alignment_sequence(v)
+    alignment_entry = _JOINER
+    if i != len(alignments) and alignments[i].startswith(word + _JOINER):
+        alignment_entry, _, _ = alignments[i].partition(_BLOCK_JOINER)
+    elif i > 0:
+        alignment_block = alignments[i - 1].split(_BLOCK_JOINER)
+        j = bisect_left(alignment_block, word)
+        if j != len(alignment_block):
+            alignment_entry = alignment_block[j]
+    k, _, v = alignment_entry.partition(_JOINER)
+    if k == word:
+        return get_alignment_sequence(v)
     return []
 
 
+def compact_alignments(alignments: Sequence[str]) -> List[str]:
+    _BLOCK_SIZE = 16
+    return [
+        _BLOCK_JOINER.join(alignments[i : i + _BLOCK_SIZE])
+        for i in range(0, len(alignments), _BLOCK_SIZE)
+    ]
+
+
 def load_alignments_from_file(path, delimiter="") -> List[str]:
     """Load alignments in Phonetisaurus default format.
@@ -526,7 +555,7 @@ def load_alignments_from_file(path, delimiter="") -> List[str]:
             continue
         word = get_alignment_input_string(spam)
         alignments.append(word + _JOINER + spam)
-    return sorted(alignments)
+    return compact_alignments(sorted(alignments))
 
 
 def is_ipa(lang: str) -> bool:
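
A side note on the two sentinels, my reading of the code rather than something stated in the commit: `_JOINER` must be `"\0"` so that a bare word bisects to the entry that starts with it, while `_BLOCK_JOINER` merely has to be absent from the data:

```python
# "\0" sorts before every printable character, so "cat\0..." still sorts
# before "cats\0..." and bisect_left lands on the right entry for "cat".
assert "cat" < "cat\0k a t" < "cats\0k a t s"
```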
