Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tweak to Save Memory Consumption When Compiling Dictionaries #661

Merged
merged 5 commits into from
Jun 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ env.bat
node_modules/
*~
.*.swp
.cache/
14 changes: 8 additions & 6 deletions src/rime/dict/dict_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -236,22 +236,24 @@ bool DictCompiler::BuildTable(int table_index,
for (const auto& s : collector.syllabary) {
syllable_to_id[s] = syllable_id++;
}
for (RawDictEntry& r : collector.entries) {
for (const auto& r : collector.entries) {
Code code;
for (const auto& s : r.raw_code) {
for (const auto& s : r->raw_code) {
code.push_back(syllable_to_id[s]);
}
DictEntryList* ls = vocabulary.LocateEntries(code);
auto ls = vocabulary.LocateEntries(code);
if (!ls) {
LOG(ERROR) << "Error locating entries in vocabulary.";
continue;
}
auto e = New<DictEntry>();
auto e = New<ShortDictEntry>();
e->code.swap(code);
e->text.swap(r.text);
e->weight = log(r.weight > 0 ? r.weight : DBL_EPSILON);
e->text.swap(r->text);
e->weight = log(r->weight > 0 ? r->weight : DBL_EPSILON);
ls->push_back(e);
}
// release memory in time to reduce peak memory usage
vector<of<RawDictEntry>>().swap(collector.entries);
lotem marked this conversation as resolved.
Show resolved Hide resolved
if (settings->sort_order() != "original") {
vocabulary.SortHomophones();
}
Expand Down
10 changes: 5 additions & 5 deletions src/rime/dict/entry_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ void EntryCollector::CreateEntry(const string &word,
words[e.text][code_str] += e.weight;
total_weight[e.text] += e.weight;
}
entries.push_back(e);
entries.emplace_back(New<RawDictEntry>(e));
++num_entries;
}

Expand Down Expand Up @@ -240,10 +240,10 @@ void EntryCollector::Dump(const string& file_name) const {
out << "# - " << syllable << std::endl;
}
out << std::endl;
for (const RawDictEntry& e : entries) {
out << e.text << '\t'
<< e.raw_code.ToString() << '\t'
<< e.weight << std::endl;
for (const auto &e : entries) {
out << e->text << '\t'
<< e->raw_code.ToString() << '\t'
<< e->weight << std::endl;
}
out.close();
}
Expand Down
2 changes: 1 addition & 1 deletion src/rime/dict/entry_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class EntryCollector : public PhraseCollector {
public:
Syllabary syllabary;
bool build_syllabary = true;
vector<RawDictEntry> entries;
vector<of<RawDictEntry>> entries;
size_t num_entries = 0;
ReverseLookupTable stems;

Expand Down
6 changes: 3 additions & 3 deletions src/rime/dict/table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ table::TailIndex* Table::BuildTailIndex(const Code& prefix,
return index;
}

Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) {
Array<table::Entry>* Table::BuildEntryArray(const ShortDictEntryList& entries) {
auto array = CreateArray<table::Entry>(entries.size());
if (!array) {
return NULL;
Expand All @@ -531,7 +531,7 @@ Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) {
return array;
}

bool Table::BuildEntryList(const DictEntryList& src,
bool Table::BuildEntryList(const ShortDictEntryList& src,
List<table::Entry>* dest) {
if (!dest)
return false;
Expand All @@ -549,7 +549,7 @@ bool Table::BuildEntryList(const DictEntryList& src,
return true;
}

bool Table::BuildEntry(const DictEntry& dict_entry, table::Entry* entry) {
bool Table::BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry) {
if (!entry)
return false;
if (!AddString(dict_entry.text, &entry->text, dict_entry.weight)) {
Expand Down
6 changes: 3 additions & 3 deletions src/rime/dict/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,9 @@ class Table : public MappedFile {
const Vocabulary& vocabulary);
bool BuildPhraseIndex(Code code, const Vocabulary& vocabulary,
map<string, int>* index_data);
Array<table::Entry>* BuildEntryArray(const DictEntryList& entries);
bool BuildEntryList(const DictEntryList& src, List<table::Entry>* dest);
bool BuildEntry(const DictEntry& dict_entry, table::Entry* entry);
Array<table::Entry>* BuildEntryArray(const ShortDictEntryList& entries);
bool BuildEntryList(const ShortDictEntryList& src, List<table::Entry>* dest);
bool BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry);

string GetString(const table::StringType& x);
bool AddString(const string& src, table::StringType* dest,
Expand Down
45 changes: 38 additions & 7 deletions src/rime/dict/vocabulary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// 2011-07-24 GONG Chen <chen.sst@gmail.com>
//
#include <algorithm>
#include <iterator>
#include <sstream>
#include <utility>
#include <rime/dict/vocabulary.h>
Expand Down Expand Up @@ -59,6 +60,18 @@ string Code::ToString() const {
return stream.str();
}

// Builds the compact ShortDictEntry counterpart of this entry by copying its
// text, code and weight; the remaining DictEntry fields are not carried over.
//
// Defined without `inline`: the member is declared in vocabulary.h without
// that specifier, and an inline definition that exists only in this .cc file
// would not be available to callers in other translation units.
//
// The fields are assigned one by one instead of brace-initialised because
// ShortDictEntry declares a (defaulted) constructor, which stops it from
// being an aggregate from C++20 onwards.
ShortDictEntry DictEntry::ToShort() const {
  ShortDictEntry result;
  result.text = text;
  result.code = code;
  result.weight = weight;
  return result;
}

// Ordering used when sorting different entries that share the same code:
// the entry with the greater weight sorts first (descending weight).
// Entries of equal weight are treated as equivalent. The tie-break on text
// (text < other.text) is intentionally left out -- per the original note,
// to "reduce carbon emission", i.e. presumably to save comparisons.
bool ShortDictEntry::operator< (const ShortDictEntry& other) const {
  // For differing weights this is the descending-weight comparison; for
  // equal weights it evaluates to false, so neither entry precedes the other.
  return weight > other.weight;
}

bool DictEntry::operator< (const DictEntry& other) const {
// Sort different entries sharing the same code by weight desc.
if (weight != other.weight)
Expand All @@ -72,16 +85,34 @@ inline bool dereference_less(const T& a, const T& b) {
return *a < *b;
}

template <typename C>
inline void sort(C &container) {
std::sort(std::begin(container), std::end(container), dereference_less<typename C::value_type>);
}

// Sorts the sub-range [start, start + count) of `container` in place, using
// dereference_less (`*a < *b`) as the comparison.
//   start - index of the first element to sort; if it is at or past the end
//           of the container, nothing is done.
//   count - number of elements to sort; clamped so the range never extends
//           beyond the end of the container.
template <typename C>
inline void sort_range(C &container, size_t start, size_t count) {
  const size_t size = container.size();
  if (start >= size)
    return;
  const auto first = std::begin(container) + start;
  // `start < size` holds here, so `size - start` cannot underflow. Comparing
  // against it avoids the unsigned wrap-around that `start + count` suffers
  // for very large counts, which would yield an out-of-range end iterator.
  const auto last = count >= size - start ? std::end(container) : first + count;
  std::sort(first, last, dereference_less<typename C::value_type>);
}

// Sorts the whole list in place by delegating to the sort() helper template
// defined in this file, which compares the entries the elements point to.
void ShortDictEntryList::Sort() {
sort(*this);
}

// Sorts `count` elements starting at index `start`, in place, by delegating
// to the sort_range() helper template defined in this file. That helper does
// nothing when `start` is at or past the end of the list and stops at the end
// of the list when the requested range would run past it.
void ShortDictEntryList::SortRange(size_t start, size_t count) {
sort_range(*this, start, count);
}

void DictEntryList::Sort() {
std::sort(begin(), end(), dereference_less<DictEntryList::value_type>);
sort(*this);
}

void DictEntryList::SortRange(size_t start, size_t count) {
if (start >= size())
return;
iterator i(begin() + start);
iterator j(start + count >= size() ? end() : i + count);
std::sort(i, j, dereference_less<DictEntryList::value_type>);
sort_range(*this, start, count);
}

void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) {
Expand All @@ -96,7 +127,7 @@ void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) {
}
}

DictEntryList* Vocabulary::LocateEntries(const Code& code) {
ShortDictEntryList* Vocabulary::LocateEntries(const Code& code) {
Vocabulary* v = this;
size_t n = code.size();
for (size_t i = 0; i < n; ++i) {
Expand Down
24 changes: 20 additions & 4 deletions src/rime/dict/vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,36 @@ class Code : public vector<SyllableId> {
string ToString() const;
};

struct ShortDictEntry {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How much more memory efficient is this?

If the difference made by this single change is insignificant, I'd prefer not to duplicate the code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The screenshots I pasted under Additional Info show the information you are interested in. In my test case, it saves about 15%–18% of the memory consumption.

string text;
Code code; // multi-syllable code from prism
double weight = 0.0;

ShortDictEntry() = default;
bool operator< (const ShortDictEntry& other) const;
};

struct DictEntry {
string text;
string comment;
string preedit;
double weight = 0.0;
int commit_count = 0;
Code code; // multi-syllable code from prism
string custom_code; // user defined code
double weight = 0.0;
int commit_count = 0;
int remaining_code_length = 0;

DictEntry() = default;
ShortDictEntry ToShort() const;
bool operator< (const DictEntry& other) const;
};

// A vector of handles to ShortDictEntry objects with in-place sorting helpers.
// NOTE(review): of<T> is presumably the project's shared-pointer alias --
// confirm against its definition.
class ShortDictEntryList : public vector<of<ShortDictEntry>> {
public:
// Sorts all entries in place.
void Sort();
// Sorts, in place, only the `count` entries starting at index `start`.
void SortRange(size_t start, size_t count);
};

class DictEntryList : public vector<of<DictEntry>> {
public:
void Sort();
Expand All @@ -64,13 +80,13 @@ class DictEntryFilterBinder {
class Vocabulary;

struct VocabularyPage {
DictEntryList entries;
ShortDictEntryList entries;
an<Vocabulary> next_level;
};

class Vocabulary : public map<int, VocabularyPage> {
public:
DictEntryList* LocateEntries(const Code& code);
ShortDictEntryList* LocateEntries(const Code& code);
void SortHomophones();
};

Expand Down
18 changes: 9 additions & 9 deletions test/table_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,34 +44,34 @@ rime::the<rime::Table> RimeTableTest::table_;

void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll,
rime::Vocabulary& voc) {
auto d = rime::New<rime::DictEntry>();
auto d = rime::New<rime::ShortDictEntry>();
syll.insert("0");
// no entries for '0', however
syll.insert("1");
d->code.push_back(1);
d->text = "yi";
d->weight = 1.0;
voc[1].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("2");
d->code.back() = 2;
d->text = "er";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "liang";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "lia";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("3");
d->code.back() = 3;
d->text = "san";
voc[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "sa";
voc[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("4");
auto lv2 = rime::New<rime::Vocabulary>();
voc[1].next_level = lv2;
Expand All @@ -84,11 +84,11 @@ void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll,
d->code.push_back(3);
d->text = "yi-er-san";
(*lv3)[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->code.push_back(4);
d->text = "yi-er-san-si";
(*lv4)[-1].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->code.resize(3);
d->code.push_back(2);
d->code.push_back(1);
Expand Down