Skip to content

Commit

Permalink
Merge pull request #52 from mrchypark/add-rule
Browse files Browse the repository at this point in the history
add minimal replacer
  • Loading branch information
mrchypark authored Apr 11, 2022
2 parents 4e3db97 + 2b4d058 commit 784c044
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 13 deletions.
4 changes: 4 additions & 0 deletions R/cpp11.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ kiwi_builder_add_pre_analyzed_word_ <- function(handle_ex, form, analyzed_r, sco
.Call(`_elbird_kiwi_builder_add_pre_analyzed_word_`, handle_ex, form, analyzed_r, score)
}

# Thin R-side binding: forwards every argument, unchanged and in order, to the
# registered native routine `_elbird_kiwi_builder_add_rule_` and returns
# whatever that routine returns.
kiwi_builder_add_rule_ <- function(handle_ex, pos, pattern, replacement, score) {
  .Call(
    `_elbird_kiwi_builder_add_rule_`,
    handle_ex,
    pos,
    pattern,
    replacement,
    score
  )
}

# Thin R-side binding: forwards the builder handle and the dictionary path to
# the registered native routine `_elbird_kiwi_builder_load_dict_` and returns
# its result.
kiwi_builder_load_dict_ <- function(handle_ex, dict_path) {
  .Call(
    `_elbird_kiwi_builder_load_dict_`,
    handle_ex,
    dict_path
  )
}
Expand Down
8 changes: 8 additions & 0 deletions src/cpp11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@ extern "C" SEXP _elbird_kiwi_builder_add_pre_analyzed_word_(SEXP handle_ex, SEXP
END_CPP11
}
// kiwi_bind.cpp
int kiwi_builder_add_rule_(SEXP handle_ex, const char* pos, std::string pattern, std::string replacement, float score);
// C-linkage entry point registered for .Call(): converts each SEXP argument to
// the C++ type expected by kiwi_builder_add_rule_, forwards the call, and wraps
// the int result back into a SEXP.
extern "C" SEXP _elbird_kiwi_builder_add_rule_(SEXP handle_ex, SEXP pos, SEXP pattern, SEXP replacement, SEXP score) {
  BEGIN_CPP11
    using cpp11::as_cpp;
    using cpp11::decay_t;
    return cpp11::as_sexp(kiwi_builder_add_rule_(
        as_cpp<decay_t<SEXP>>(handle_ex),
        as_cpp<decay_t<const char*>>(pos),
        as_cpp<decay_t<std::string>>(pattern),
        as_cpp<decay_t<std::string>>(replacement),
        as_cpp<decay_t<float>>(score)));
  END_CPP11
}
// kiwi_bind.cpp
int kiwi_builder_load_dict_(SEXP handle_ex, const char* dict_path);
extern "C" SEXP _elbird_kiwi_builder_load_dict_(SEXP handle_ex, SEXP dict_path) {
BEGIN_CPP11
Expand Down Expand Up @@ -139,6 +146,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_elbird_kiwi_analyze_", (DL_FUNC) &_elbird_kiwi_analyze_, 5},
{"_elbird_kiwi_builder_add_alias_word_", (DL_FUNC) &_elbird_kiwi_builder_add_alias_word_, 5},
{"_elbird_kiwi_builder_add_pre_analyzed_word_", (DL_FUNC) &_elbird_kiwi_builder_add_pre_analyzed_word_, 4},
{"_elbird_kiwi_builder_add_rule_", (DL_FUNC) &_elbird_kiwi_builder_add_rule_, 5},
{"_elbird_kiwi_builder_add_word_", (DL_FUNC) &_elbird_kiwi_builder_add_word_, 4},
{"_elbird_kiwi_builder_build_", (DL_FUNC) &_elbird_kiwi_builder_build_, 1},
{"_elbird_kiwi_builder_close_", (DL_FUNC) &_elbird_kiwi_builder_close_, 1},
Expand Down
68 changes: 58 additions & 10 deletions src/kiwi_bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
#include <cstring>
#include <vector>
#include <map>
#include <regex>

#include <cpp11.hpp>
using namespace cpp11;
#include <kiwi/capi.h>
#include <kiwi/Kiwi.h>

typedef int(*kiwi_receiver_t)(int, kiwi_res_h, void*);
typedef int(*kiwi_builder_replacer_t)(const char*, int, char*, void*);

static std::map<std::string, int> m = {
{ "URL", KIWI_MATCH_URL },
{ "EMAIL", KIWI_MATCH_EMAIL },
Expand All @@ -29,7 +27,11 @@ static std::map<std::string, int> m = {
};

/**
 * Translates a match-option name (a key of the file-level table `m`, such as
 * "URL" or "EMAIL") into the integer flag stored for it.
 *
 * @param match_string  Name of the option to look up.
 * @return The value mapped to `match_string` in `m`.
 * @throws std::invalid_argument if `match_string` is not a key of `m`.
 */
int match_options_(const std::string match_string) {
  // A single find() replaces the former count()+find() pair and the duplicated
  // existence check.
  const auto it = m.find(match_string);
  if (it == m.end()) {
    throw std::invalid_argument{
      std::string{"Unknown Build Options : "} + match_string
    };
  }
  return it->second;
}

Expand Down Expand Up @@ -65,11 +67,11 @@ private :
std::ifstream strm;
};

int readLines(int line, char* buffer, void* input) {
Scanner* scanner = (Scanner*)input;
int readLines(int line_num, char* buffer, void* user) {
Scanner* scanner = (Scanner*)user;

if (buffer == nullptr) {
if (line == 0) {
if (line_num == 0) {
scanner->rewind();
}

Expand All @@ -83,6 +85,40 @@ int readLines(int line, char* buffer, void* input) {
return 0;
}

/**
 * Regex-based text rewriter used as the state object behind `ruleprovider`.
 *
 * Protocol: call init() once, then for each input call size() (which performs
 * the substitution and caches the result) and read the cached result with
 * text().
 */
class Replacer {
public:
  /**
   * Compiles `pattern` and remembers the replacement format string.
   *
   * @param pattern       Regular expression (default std::regex grammar).
   * @param replacement_  Replacement format passed to std::regex_replace.
   * @throws std::regex_error if `pattern` is not a valid expression.
   */
  void init(const std::string& pattern, const std::string& replacement_) {
    rep = std::regex(pattern);
    replacement = replacement_;
  }

  /**
   * Applies the rule to `input`, caches the rewritten text, and returns the
   * number of bytes needed to hold it including the terminating NUL.
   *
   * @param input  NUL-terminated text; a null pointer is treated as "".
   */
  int size(const char* input) {
    // Constructing std::string from a null pointer is undefined behaviour.
    const std::string source(input != nullptr ? input : "");
    res = std::regex_replace(source, rep, replacement);
    return static_cast<int>(res.size()) + 1;
  }

  /**
   * Returns the text produced by the most recent size() call. The pointer is
   * owned by this object and is invalidated by the next size() call.
   */
  const char* text() const {
    return res.c_str();
  }

private:
  std::regex rep;           // compiled pattern set by init()
  std::string replacement;  // replacement format set by init()
  std::string res;          // result cached by the last size() call
};

int ruleprovider(const char* input, int size, char* buffer, void* user) {
Replacer* rpcr = (Replacer*)user;
if (buffer == nullptr) {
return rpcr->size(input);
}
strcpy(buffer, rpcr->text());
return 0;
}


[[cpp11::register]]
std::string kiwi_version_() {
return kiwi_version();
Expand Down Expand Up @@ -174,9 +210,17 @@ int kiwi_builder_add_pre_analyzed_word_(
);
}

int kiwi_builder_add_rule_(SEXP handle_ex, const char* pos, kiwi_builder_replacer_t replacer, void* user_data, float score) {
[[cpp11::register]]
int kiwi_builder_add_rule_(
SEXP handle_ex,
const char* pos,
std::string pattern,
std::string replacement,
float score) {
cpp11::external_pointer<kiwi_builder> handle(handle_ex);
return kiwi_builder_add_rule(handle.get(), pos, replacer, user_data, score);
Replacer rpcr;
rpcr.init(pattern, replacement);
return kiwi_builder_add_rule(handle.get(), pos, ruleprovider, &rpcr, score);
}

[[cpp11::register]]
Expand Down Expand Up @@ -325,7 +369,11 @@ SEXP kiwi_analyze_(
}

[[cpp11::register]]
SEXP kiwi_split_into_sents_(SEXP handle_ex, const char* text, int match_options, bool return_tokens) {
SEXP kiwi_split_into_sents_(
SEXP handle_ex,
const char* text,
int match_options,
bool return_tokens) {
cpp11::external_pointer<kiwi_s> handle(handle_ex);
kiwi_res_h tokenized_res;
kiwi_res_h *tknptr = &tokenized_res;
Expand Down
17 changes: 17 additions & 0 deletions tests/testthat/test-addrule.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
test_that("add rule works", {
  skip_if_offline()
  # Fetch the small model only when it is not already usable.
  if (!model_works("small")) {
    get_model("small")
  }

  # Baseline: analyzer built without any extra rule.
  plain_kw <- kiwi_init_(kiwi_model_path_full("small"), 0, BuildOpt$DEFAULT)
  standard_res <- kiwi_analyze_wrap(plain_kw, "했어요! 하잖아요! 할까요?", top_n = 1)
  variant_res <- kiwi_analyze_wrap(plain_kw, "했어용! 하잖아용! 할까용?", top_n = 1)

  expect_false(identical(standard_res[[1]]$Score, variant_res[[1]]$Score))

  # Analyzer built after registering the rule on the builder.
  builder <- kiwi_builder_init_(kiwi_model_path_full("small"), 0, BuildOpt$DEFAULT)
  kiwi_builder_add_rule_(builder, "ef", "요$", "", -2)
  ruled_kw <- kiwi_builder_build_(builder)
  ruled_res <- kiwi_analyze_wrap(ruled_kw, "했어용! 하잖아용! 할까용?")
  expect_false(identical(standard_res[[1]]$Score, ruled_res[[1]]$Score))
})
5 changes: 2 additions & 3 deletions tests/testthat/test-preanalyze.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
test_that("pre analyze words", {
skip_if_offline()
get_model("small")
if (!model_works("small"))
get_model("small")

anl <- data.frame(
morphs = c("팅기", "", ""),
Expand All @@ -22,6 +23,4 @@ test_that("pre analyze words", {
res <- kiwi_analyze_wrap(kw, text = "팅겼어...", 1, Match$ALL_WITH_NORMALIZING)
expect_equal(res[[1]]$Token[[1]]$form, "팅기")
expect_equal(res[[1]]$Token[[1]]$tag, "VV")
rm(kw)
rm(kb)
})

0 comments on commit 784c044

Please sign in to comment.