Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add minimal replacer #52

Merged
merged 6 commits into from
Apr 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions R/cpp11.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ kiwi_builder_add_pre_analyzed_word_ <- function(handle_ex, form, analyzed_r, sco
.Call(`_elbird_kiwi_builder_add_pre_analyzed_word_`, handle_ex, form, analyzed_r, score)
}

# R-side shim for the native routine `_elbird_kiwi_builder_add_rule_`.
# Forwards all five arguments unchanged, in the same order, through .Call():
# the builder handle, a POS tag string, a regex pattern, its replacement text,
# and a score.
kiwi_builder_add_rule_ <- function(handle_ex, pos, pattern, replacement, score) {
.Call(`_elbird_kiwi_builder_add_rule_`, handle_ex, pos, pattern, replacement, score)
}

kiwi_builder_load_dict_ <- function(handle_ex, dict_path) {
.Call(`_elbird_kiwi_builder_load_dict_`, handle_ex, dict_path)
}
Expand Down
8 changes: 8 additions & 0 deletions src/cpp11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@ extern "C" SEXP _elbird_kiwi_builder_add_pre_analyzed_word_(SEXP handle_ex, SEXP
END_CPP11
}
// kiwi_bind.cpp
// Forward declaration of the C++ implementation that this wrapper dispatches to.
int kiwi_builder_add_rule_(SEXP handle_ex, const char* pos, std::string pattern, std::string replacement, float score);
// C-linkage entry point invoked from R via .Call(). Each SEXP argument is
// converted to the C++ parameter type with cpp11::as_cpp, the implementation is
// called, and its int result is converted back to a SEXP with cpp11::as_sexp.
extern "C" SEXP _elbird_kiwi_builder_add_rule_(SEXP handle_ex, SEXP pos, SEXP pattern, SEXP replacement, SEXP score) {
// NOTE(review): BEGIN_CPP11 / END_CPP11 presumably turn C++ exceptions into R
// errors — confirm against the cpp11 documentation.
BEGIN_CPP11
return cpp11::as_sexp(kiwi_builder_add_rule_(cpp11::as_cpp<cpp11::decay_t<SEXP>>(handle_ex), cpp11::as_cpp<cpp11::decay_t<const char*>>(pos), cpp11::as_cpp<cpp11::decay_t<std::string>>(pattern), cpp11::as_cpp<cpp11::decay_t<std::string>>(replacement), cpp11::as_cpp<cpp11::decay_t<float>>(score)));
END_CPP11
}
// kiwi_bind.cpp
int kiwi_builder_load_dict_(SEXP handle_ex, const char* dict_path);
extern "C" SEXP _elbird_kiwi_builder_load_dict_(SEXP handle_ex, SEXP dict_path) {
BEGIN_CPP11
Expand Down Expand Up @@ -139,6 +146,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_elbird_kiwi_analyze_", (DL_FUNC) &_elbird_kiwi_analyze_, 5},
{"_elbird_kiwi_builder_add_alias_word_", (DL_FUNC) &_elbird_kiwi_builder_add_alias_word_, 5},
{"_elbird_kiwi_builder_add_pre_analyzed_word_", (DL_FUNC) &_elbird_kiwi_builder_add_pre_analyzed_word_, 4},
{"_elbird_kiwi_builder_add_rule_", (DL_FUNC) &_elbird_kiwi_builder_add_rule_, 5},
{"_elbird_kiwi_builder_add_word_", (DL_FUNC) &_elbird_kiwi_builder_add_word_, 4},
{"_elbird_kiwi_builder_build_", (DL_FUNC) &_elbird_kiwi_builder_build_, 1},
{"_elbird_kiwi_builder_close_", (DL_FUNC) &_elbird_kiwi_builder_close_, 1},
Expand Down
68 changes: 58 additions & 10 deletions src/kiwi_bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
#include <cstring>
#include <vector>
#include <map>
#include <regex>

#include <cpp11.hpp>
using namespace cpp11;
#include <kiwi/capi.h>
#include <kiwi/Kiwi.h>

typedef int(*kiwi_receiver_t)(int, kiwi_res_h, void*);
typedef int(*kiwi_builder_replacer_t)(const char*, int, char*, void*);

static std::map<std::string, int> m = {
{ "URL", KIWI_MATCH_URL },
{ "EMAIL", KIWI_MATCH_EMAIL },
Expand All @@ -29,7 +27,11 @@ static std::map<std::string, int> m = {
};

// Translates an option name into its integer flag using the file-level
// lookup table `m`.
//
// @param match_string  Option name to look up (e.g. "URL", "EMAIL").
// @return              The integer value stored in `m` for that name.
// @throws std::invalid_argument  If the name is not a key of `m`; the message
//                                is "Unknown Build Options : " followed by the
//                                offending name.
int match_options_(const std::string match_string) {
    // A single find() both tests for presence and yields the value, instead of
    // checking with count() (twice) and then searching again.
    const auto it = m.find(match_string);
    if (it == m.end()) {
        throw std::invalid_argument{
            std::string{"Unknown Build Options : "} + match_string
        };
    }
    return it->second;
}

Expand Down Expand Up @@ -65,11 +67,11 @@ private :
std::ifstream strm;
};

int readLines(int line, char* buffer, void* input) {
Scanner* scanner = (Scanner*)input;
int readLines(int line_num, char* buffer, void* user) {
Scanner* scanner = (Scanner*)user;

if (buffer == nullptr) {
if (line == 0) {
if (line_num == 0) {
scanner->rewind();
}

Expand All @@ -83,6 +85,40 @@ int readLines(int line, char* buffer, void* input) {
return 0;
}

// Regex-based string rewriter used as the user-data object behind the
// `ruleprovider` callback.
//
// Usage is two-phase: size(input) performs the replacement, caches the result
// and reports how many bytes it needs; text() then exposes the cached result.
class Replacer {
public:
    // Compiles `pattern` and stores the replacement text.
    //
    // @param pattern       Regular expression (std::regex default grammar).
    // @param replacement_  Replacement format string for std::regex_replace.
    // @throws std::regex_error  If `pattern` is not a valid expression.
    void init(const std::string& pattern, const std::string& replacement_) {
        rep.assign(pattern);
        replacement = replacement_;
    }

    // Applies the rule to `input`, caches the rewritten string, and returns the
    // number of bytes required to hold it as a C string (length + terminator).
    //
    // A null `input` is treated as the empty string rather than being handed to
    // the std::string constructor, which would be undefined behaviour.
    int size(const char* input) {
        const std::string subject = (input != nullptr) ? input : "";
        res = std::regex_replace(subject, rep, replacement);
        // Measured with strlen on purpose: the result is handed out as a C
        // string, so only the bytes up to the first NUL are ever copied.
        return static_cast<int>(std::strlen(res.c_str())) + 1;
    }

    // Returns the result cached by the most recent size() call. The pointer
    // stays valid until the next size() call or the object's destruction.
    const char* text() const {
        return res.c_str();
    }

private:
    std::regex rep;           // compiled pattern
    std::string replacement;  // replacement format string
    std::string res;          // result of the last size() call
};

int ruleprovider(const char* input, int size, char* buffer, void* user) {
Replacer* rpcr = (Replacer*)user;
if (buffer == nullptr) {
return rpcr->size(input);
}
strcpy(buffer, rpcr->text());
return 0;
}


[[cpp11::register]]
std::string kiwi_version_() {
return kiwi_version();
Expand Down Expand Up @@ -174,9 +210,17 @@ int kiwi_builder_add_pre_analyzed_word_(
);
}

int kiwi_builder_add_rule_(SEXP handle_ex, const char* pos, kiwi_builder_replacer_t replacer, void* user_data, float score) {
[[cpp11::register]]
int kiwi_builder_add_rule_(
SEXP handle_ex,
const char* pos,
std::string pattern,
std::string replacement,
float score) {
cpp11::external_pointer<kiwi_builder> handle(handle_ex);
return kiwi_builder_add_rule(handle.get(), pos, replacer, user_data, score);
Replacer rpcr;
rpcr.init(pattern, replacement);
return kiwi_builder_add_rule(handle.get(), pos, ruleprovider, &rpcr, score);
}

[[cpp11::register]]
Expand Down Expand Up @@ -325,7 +369,11 @@ SEXP kiwi_analyze_(
}

[[cpp11::register]]
SEXP kiwi_split_into_sents_(SEXP handle_ex, const char* text, int match_options, bool return_tokens) {
SEXP kiwi_split_into_sents_(
SEXP handle_ex,
const char* text,
int match_options,
bool return_tokens) {
cpp11::external_pointer<kiwi_s> handle(handle_ex);
kiwi_res_h tokenized_res;
kiwi_res_h *tknptr = &tokenized_res;
Expand Down
17 changes: 17 additions & 0 deletions tests/testthat/test-addrule.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
test_that("add rule works", {
  skip_if_offline()
  # Download the small model only when a working copy is not already present.
  if (!model_works("small")) {
    get_model("small")
  }

  # Baseline: with the stock model the standard endings and the "용" variants
  # are expected to score differently.
  analyzer <- kiwi_init_(kiwi_model_path_full("small"), 0, BuildOpt$DEFAULT)
  plain <- kiwi_analyze_wrap(analyzer, "했어요! 하잖아요! 할까요?", top_n = 1)
  variant <- kiwi_analyze_wrap(analyzer, "했어용! 하잖아용! 할까용?", top_n = 1)

  expect_false(identical(plain[[1]]$Score, variant[[1]]$Score))

  # Rebuild the analyzer with a rule that rewrites a trailing "요" to "용".
  builder <- kiwi_builder_init_(kiwi_model_path_full("small"), 0, BuildOpt$DEFAULT)
  kiwi_builder_add_rule_(builder, "ef", "요$", "용", -2)
  analyzer <- kiwi_builder_build_(builder)
  with_rule <- kiwi_analyze_wrap(analyzer, "했어용! 하잖아용! 할까용?")
  expect_false(identical(plain[[1]]$Score, with_rule[[1]]$Score))
})
5 changes: 2 additions & 3 deletions tests/testthat/test-preanalyze.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
test_that("pre analyze words", {
skip_if_offline()
get_model("small")
if (!model_works("small"))
get_model("small")

anl <- data.frame(
morphs = c("팅기", "었", "어"),
Expand All @@ -22,6 +23,4 @@ test_that("pre analyze words", {
res <- kiwi_analyze_wrap(kw, text = "팅겼어...", 1, Match$ALL_WITH_NORMALIZING)
expect_equal(res[[1]]$Token[[1]]$form, "팅기")
expect_equal(res[[1]]$Token[[1]]$tag, "VV")
rm(kw)
rm(kb)
})