Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: spelling correction #228

Merged
merged 25 commits into from
Dec 14, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ else()
endif()

if(BUILD_SHARED_LIBS)
add_library(rime ${rime_src})
add_library(rime ${rime_src} rime/algo/corrector.h rime/algo/corrector.cc)
lotem marked this conversation as resolved.
Show resolved Hide resolved
target_link_libraries(rime ${rime_deps})
set_target_properties(rime PROPERTIES DEFINE_SYMBOL "RIME_EXPORTS")
set_target_properties(rime PROPERTIES VERSION ${rime_version} SOVERSION ${rime_soversion})
Expand Down
35 changes: 35 additions & 0 deletions src/rime/algo/corrector.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//
// Copyright RIME Developers
// Distributed under the BSD License
//
// Created by nameoverflow on 2018/11/14.
//

#include "corrector.h"

using namespace rime;

void DFSCollect(const string &origin, const string &deleted, size_t ed, Script &result);

// Builds the correction script for the whole syllabary.
//
// Every entry of `syllabary_` is handed to DFSCollect as both the origin
// spelling and the starting string, with `edit_distance` as the deletion
// budget. All generated entries accumulate in one Script, which is returned
// by value.
Script CorrectionCollector::Collect(size_t edit_distance) {
  // TODO: specifically for 1 length str
  Script collected;

  for (auto it = syllabary_.begin(); it != syllabary_.end(); ++it) {
    auto &syllable = *it;
    DFSCollect(syllable, syllable, edit_distance, collected);
  }

  return collected;
}

// Recursively registers every non-empty string obtainable from `deleted` by
// removing up to `ed` characters, mapping each of them in `result` to a
// kCorrection spelling of `origin`.
//
//   origin  - the spelling every generated key corrects to.
//   deleted - the current string: `origin` with zero or more characters
//             already removed.
//   ed      - remaining number of deletions allowed; 0 ends the recursion.
//   result  - output script; spellings are only ever appended.
//
// A string of length <= 1 is not reduced any further: removing its only
// character would register the empty string as a correction key.
// NOTE(review): an empty key is presumably also rejected when the script is
// turned into a trie - confirm against the trie builder.
//
// The same key can be reached through more than one deletion (e.g. either
// 'a' of "aa"), so `result[key]` may receive repeated spellings of `origin`.
void DFSCollect(const string &origin, const string &deleted, size_t ed, Script &result) {
  // `ed` is unsigned, so an exhausted budget is exactly zero.
  if (ed == 0 || deleted.size() <= 1) return;
  for (size_t i = 0; i < deleted.size(); ++i) {
    string temp = deleted;
    temp.erase(i, 1);
    Spelling spelling(origin);
    spelling.properties.type = kCorrection;
    result[temp].push_back(spelling);
    // Spend one unit of the budget and keep deleting from the shorter string.
    DFSCollect(origin, temp, ed - 1, result);
  }
}
37 changes: 37 additions & 0 deletions src/rime/algo/corrector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//
// Copyright RIME Developers
// Distributed under the BSD License
//
// Created by nameoverflow on 2018/11/14.
//

#ifndef RIME_CORRECTOR_H
#define RIME_CORRECTOR_H

#include <rime/common.h>
#include <rime/dict/vocabulary.h>
#include <rime/dict/prism.h>
#include "spelling.h"
#include "algebra.h"

namespace rime {

// Produces a Script of correction entries from a syllabary (see Collect).
class CorrectionCollector {
  // Held by reference, not copied: the syllabary passed to the constructor
  // must stay alive for as long as this collector is used.
  const Syllabary& syllabary_;

 public:
  // Binds the collector to `syllabary`; no work is done until Collect().
  explicit CorrectionCollector(const Syllabary& syllabary)
      : syllabary_(syllabary) {}

  // Returns the correction script generated with the given deletion budget.
  Script Collect(size_t edit_distance);
};

// Placeholder subclass of Prism; it currently declares no members of its own.
class Corrector : public Prism {};


} // namespace rime

#endif //RIME_CORRECTOR_H
2 changes: 1 addition & 1 deletion src/rime/algo/spelling.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ namespace rime {

enum SpellingType { kNormalSpelling, kFuzzySpelling,
kAbbreviation, kCompletion, kAmbiguousSpelling,
kInvalidSpelling };
kCorrection, kInvalidSpelling };

struct SpellingProperties {
SpellingType type = kNormalSpelling;
Expand Down
3 changes: 2 additions & 1 deletion src/rime/algo/syllabifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ using VertexQueue = std::priority_queue<Vertex,

int Syllabifier::BuildSyllableGraph(const string &input,
Prism &prism,
SyllableGraph *graph) {
SyllableGraph *graph,
optional<Prism&> corretion) {
if (input.empty())
return 0;

Expand Down
3 changes: 2 additions & 1 deletion src/rime/algo/syllabifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ class Syllabifier {

RIME_API int BuildSyllableGraph(const string &input,
Prism &prism,
SyllableGraph *graph);
SyllableGraph *graph,
optional<Prism&> corretion);

protected:
void CheckOverlappedSpellings(SyllableGraph *graph,
Expand Down
2 changes: 2 additions & 0 deletions src/rime/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include <boost/optional>
#define BOOST_BIND_NO_PLACEHOLDERS
#ifdef BOOST_SIGNALS2
#include <boost/signals2/connection.hpp>
Expand Down Expand Up @@ -47,6 +48,7 @@ using std::pair;
using std::set;
using std::string;
using std::vector;
using boost::optional;

template <class Key, class T>
using hash_map = std::unordered_map<Key, T>;
Expand Down
30 changes: 28 additions & 2 deletions src/rime/dict/dict_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <rime/resource.h>
#include <rime/service.h>
#include <rime/algo/algebra.h>
#include <rime/algo/corrector.h>
#include <rime/algo/utilities.h>
#include <rime/dict/dictionary.h>
#include <rime/dict/dict_compiler.h>
Expand Down Expand Up @@ -212,7 +213,7 @@ bool DictCompiler::BuildPrism(const string &schema_file,
Syllabary syllabary;
if (!table_->Load() || !table_->GetSyllabary(&syllabary) || syllabary.empty())
return false;
// apply spelling algebra
// apply spelling algebra and prepare corrections (if enabled)
Script script;
if (!schema_file.empty()) {
Config config;
Expand All @@ -230,6 +231,30 @@ bool DictCompiler::BuildPrism(const string &schema_file,
script.clear();
}
}

// build corrector
int correction_level = 0;
if (config.GetInt("speller/correction_level", &correction_level) &&
correction_level > 0) {

Syllabary correct_syllabary;
if (!script.empty()) {
for (auto &v : script) {
correct_syllabary.insert(v.first);
}
} else {
correct_syllabary = syllabary;
}

CorrectionCollector collector(correct_syllabary);
auto correction_script = collector.Collect((size_t)correction_level);
correction_->Remove();
if (!correction_->Build(syllabary, &correction_script,
dict_file_checksum, schema_file_checksum) ||
!correction_->Save()) {
return false;
}
}
}
if ((options_ & kDump) && !script.empty()) {
boost::filesystem::path path(prism_->file_name());
Expand All @@ -239,12 +264,13 @@ bool DictCompiler::BuildPrism(const string &schema_file,
// build .prism.bin
{
prism_->Remove();
if (!prism_->Build(syllabary, script.empty() ? NULL : &script,
if (!prism_->Build(syllabary, script.empty() ? nullptr : &script,
dict_file_checksum, schema_file_checksum) ||
!prism_->Save()) {
return false;
}
}

return true;
}

Expand Down
1 change: 1 addition & 0 deletions src/rime/dict/dict_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class DictCompiler {

string dict_name_;
an<Prism> prism_;
an<Prism> correction_;
an<Table> table_;
int options_ = 0;
string prefix_;
Expand Down
35 changes: 35 additions & 0 deletions src/rime/dict/prism.cc
Original file line number Diff line number Diff line change
Expand Up @@ -324,4 +324,39 @@ uint32_t Prism::schema_file_checksum() const {
return metadata_ ? metadata_->schema_file_checksum : 0;
}

// Prefix-searches the trie for `key`, additionally trying every variant of
// `key` that has exactly one character skipped.
//
// Returns boost::none when `key` is empty. Otherwise returns a (possibly
// empty) list of hits:
//   - { deleted = (char)-1, id } for each value found while walking `key`
//     unchanged;
//   - { deleted = key[p], id } for each value found while walking `key` with
//     the character at position p skipped.
//
// The integer returned by trie_->traverse() is interpreted as in the
// comparisons below: -2 stops the walk, a value >= 0 is a stored id, and
// anything else (presumably -1, "prefix only") just continues the walk.
optional<CorrectionPrism::Corrections> CorrectionPrism::SymDeletePrefixSearch(const string& key) {
  if (key.empty())
    return boost::none;
  const size_t key_len = key.length();

  Corrections result;
  result.reserve(key_len * (key_len - 1));
  // jump_pos[p] is the trie node reached after consuming key[0, p).
  vector<size_t> jump_pos(key_len);

  // Pass 1: walk the unmodified key one character at a time, caching the node
  // in front of every position. traverse() advances max_match on success, so
  // afterwards max_match is the number of leading characters that matched.
  size_t max_match = 0;
  for (size_t traverse_node = 0; max_match < key_len;) {
    jump_pos[max_match] = traverse_node;
    const auto res_val = trie_->traverse(key.c_str(), traverse_node, max_match, max_match + 1);
    if (res_val == -2) break;
    if (res_val >= 0) {
      // Explicit casts: -1 does not fit an unsigned `char`, and the trie
      // value is a plain integer rather than an object with a `.value` field.
      result.push_back({ static_cast<char>(-1), static_cast<SyllableId>(res_val) });
    }
  }

  // Pass 2: skip one character and resume from the cached node in front of
  // it. The first mismatching position (index max_match) is included: its
  // node was cached just before the failed step, and skipping exactly that
  // character is what lets the rest of the key match.
  const size_t del_end = max_match < key_len ? max_match + 1 : key_len;
  for (size_t del_pos = 0; del_pos < del_end; ++del_pos) {
    size_t traverse_node = jump_pos[del_pos];
    for (size_t key_point = del_pos + 1; key_point < key_len;) {
      const auto res_val = trie_->traverse(key.c_str(), traverse_node, key_point, key_point + 1);
      if (res_val == -2) break;
      if (res_val >= 0) {
        result.push_back({ key[del_pos], static_cast<SyllableId>(res_val) });
      }
    }
  }

  return result;
}
} // namespace rime
13 changes: 12 additions & 1 deletion src/rime/dict/prism.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,24 @@ class Prism : public MappedFile {
uint32_t dict_file_checksum() const;
uint32_t schema_file_checksum() const;

private:
protected:
the<Darts::DoubleArray> trie_;
private:
prism::Metadata* metadata_ = nullptr;
prism::SpellingMap* spelling_map_ = nullptr;
double format_ = 0.0;
};

// Prism subclass that adds a prefix search tolerating one skipped character.
class CorrectionPrism : public Prism {
 public:
  // A single hit reported by SymDeletePrefixSearch.
  struct Correction {
    // Character of the key that was skipped to obtain this hit;
    // SymDeletePrefixSearch stores -1 here for hits on the unmodified key.
    char deleted;
    // Value stored in the trie for the matched entry.
    SyllableId syllableId;
  };
  using Corrections = vector<Correction>;

  // Returns boost::none for an empty `key`, otherwise the list of hits.
  optional<Corrections> SymDeletePrefixSearch(const string& key);
};

} // namespace rime

#endif // RIME_PRISM_H_