Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduced CutGrain for Chinese analyzer #1309

Merged
merged 1 commit into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/hello_infinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ def test_chinese():
"2018年世界羽毛球锦标赛在哪个城市举办?",
"hi\-tech",
'"hi-tech"',
"graphics card",
'"graphics card"',
'"DS-K3AJ303/Dm140"',
"Bloom filter",
Expand Down
6 changes: 6 additions & 0 deletions src/common/analyzer/analyzer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ import term;
import tokenizer;

namespace infinity {

// Segmentation granularity that can be requested from an analyzer.
export enum class CutGrain {
    // Coarse-grained segmentation.
    kCoarse,
    // Fine-grained segmentation.
    kFine,
};

export class Analyzer {
public:
Analyzer() = default;
Expand Down
19 changes: 16 additions & 3 deletions src/common/analyzer/analyzer_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@

module;

#include <cstring>

module analyzer_pool;

import stl;
import third_party;
import config;
Expand All @@ -24,8 +28,6 @@ import japanese_analyzer;
import standard_analyzer;
import ngram_analyzer;

module analyzer_pool;

namespace infinity {


Expand All @@ -39,6 +41,7 @@ constexpr u64 Str2Int(const char *str, u64 last_value = basis) {
Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_view &name) {
switch (Str2Int(name.data())) {
case Str2Int(CHINESE.data()): {
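// Name format: chinese-{coarse|fine}. Only the suffix "-fine" selects CutGrain::kFine;
// a missing suffix or any other suffix keeps the default CutGrain::kCoarse.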
Analyzer *prototype = cache_[CHINESE].get();
if (prototype == nullptr) {
String path;
Expand All @@ -57,7 +60,17 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
prototype = analyzer.get();
cache_[CHINESE] = std::move(analyzer);
}
return {MakeUnique<ChineseAnalyzer>(*reinterpret_cast<ChineseAnalyzer *>(prototype)), Status::OK()};
CutGrain cut_grain = CutGrain::kCoarse;
const char *str = name.data();
while (*str != '\0' && *str != '-') {
str++;
}
if (strcmp(str, "-fine") == 0) {
cut_grain = CutGrain::kFine;
}
UniquePtr<ChineseAnalyzer> analyzer = MakeUnique<ChineseAnalyzer>(*reinterpret_cast<ChineseAnalyzer *>(prototype));
analyzer->SetCutGrain(cut_grain);
return {std::move(analyzer), Status::OK()};
}
case Str2Int(JAPANESE.data()): {
Analyzer *prototype = cache_[JAPANESE].get();
Expand Down
10 changes: 9 additions & 1 deletion src/common/analyzer/chinese_analyzer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,15 @@ public:

Status Load();

// Stores the segmentation granularity that later Parse() calls will use.
void SetCutGrain(CutGrain cut_grain) {
    cut_grain_ = cut_grain;
}

protected:
inline void Parse(const String &input) { jieba_->CutForSearch(input, cut_words_, true); }
// Segments `input` with jieba and stores the resulting words in cut_words_.
// The jieba routine is chosen by the currently configured cut grain.
void Parse(const String &input) {
    switch (cut_grain_) {
        case CutGrain::kCoarse: {
            // Third argument is cppjieba's `hmm` flag for Jieba::Cut.
            jieba_->Cut(input, cut_words_, true);
            break;
        }
        default: {
            // Every grain other than kCoarse goes through the HMM-only cut.
            jieba_->CutHMM(input, cut_words_);
            break;
        }
    }
}
// Always reports true for this analyzer.
// NOTE(review): presumably this makes the base Analyzer dispatch to the Jieba-specific
// AnalyzeImpl overload declared below — confirm against Analyzer.
bool IsJiebaSpecialize() override { return true; }
// Jieba-specific analysis entry point overriding the base class; defined outside this view.
// `func` is the Jieba hook type and `data` an opaque pointer — presumably forwarded to the
// hook for each produced term; verify against the definition.
int AnalyzeImpl(const Term &input, void *data, HookTypeForJieba func) override;

Expand All @@ -50,5 +57,6 @@ private:
bool own_jieba_{};
Vector<cppjieba::Word> cut_words_;
SharedPtr<FlatHashSet<String>> stopwords_{};
// Segmentation granularity; defaults to coarse when no grain has been set.
CutGrain cut_grain_{CutGrain::kCoarse};
};
} // namespace infinity
82 changes: 82 additions & 0 deletions src/unit_test/common/analyzer/chinese_analyzer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "unit_test/base_test.h"

import stl;
import term;
import analyzer;
import chinese_analyzer;
using namespace infinity;

namespace fs = std::filesystem;

// Test fixture for the ChineseAnalyzer unit tests; adds nothing on top of BaseTest.
class ChineseAnalyzerTest : public BaseTest {};

// Smoke test for ChineseAnalyzer: loads the dictionaries from the "resource" directory located
// four levels above the test executable, then segments a set of English and Chinese queries with
// both the default (coarse) analyzer and a fine-grained copy, printing the resulting terms.
//
// The test is reported as skipped (rather than silently passing) when the executable path cannot
// be resolved or the resource directory is missing, and fails if the dictionaries cannot be loaded.
TEST_F(ChineseAnalyzerTest, test1) {
    // Get the path to the executable using the /proc/self/exe symlink
    const fs::path exe_symlink = "/proc/self/exe";
    std::error_code ec;
    // Resolve the symlink to get the actual path
    const fs::path executablePath = fs::canonical(exe_symlink, ec);
    if (ec) {
        // fs::canonical returns an empty path on failure, so report the symlink itself.
        GTEST_SKIP() << "Error resolving the path: " << exe_symlink << " " << ec.message();
    }
    std::cerr << "/proc/self/exe: " << executablePath << std::endl;

    const fs::path ROOT_PATH = executablePath.parent_path().parent_path().parent_path().parent_path() / "resource";
    std::cerr << "ROOT_PATH: " << ROOT_PATH << std::endl;

    if (!fs::exists(ROOT_PATH)) {
        GTEST_SKIP() << "Resource directory doesn't exist: " << ROOT_PATH;
    }

    ChineseAnalyzer analyzer(ROOT_PATH.string());
    // Do not continue with an analyzer whose dictionaries failed to load.
    auto load_status = analyzer.Load();
    ASSERT_TRUE(load_status.ok());

    // Note: the adjacent string literals after the fourth entry are concatenated by the compiler,
    // so the last element is one long multi-paragraph query (five queries in total).
    Vector<String> queries = {
        "graphic card",
        "graphics card",
        "南京市长江大桥",
        "小明硕士毕业于中国科学院计算所,后在日本京都大学深造",
        "会徽整体形似运动中的羽毛球,球头绑带部分演化为“城墙”的图形元素,极具南京的地域特征,凸显出举办地的历史底蕴和人文气息。尾羽部分图形则巧妙融入"
        "了举办年份“2018”和南京的首字母“NJ”,结合中国传统书法笔触的表现形式,传递出羽毛球运动的速度感。会徽红黑配色鲜艳明快,契合了体育运动的活力与朝"
        "气[3]"
        "2018年世界羽毛球锦标赛吉祥物南京羽毛球世锦赛吉祥物2018年道达尔羽毛球世锦赛吉祥物在南京发布。造型简洁、形态生动、富有亲和力的“羽宝”拔得头筹,"
        "成为2018年世界羽毛球锦标赛吉祥物。比赛将于7月30日在宁举行,赛程7天,预计近340名顶尖运动员参赛。吉祥物“羽宝”头部由羽毛球外形变化而来,手持球"
        "拍,拟人化的设计再现了羽毛球运动员比赛时的接击球动作,胸前佩戴的梅花造型的金牌,代表着在南京举办的世锦赛将向世界献上精彩的羽毛球盛宴。同时黄"
        "蓝两色为主色调,在视觉冲击中体现了羽毛球运动动静转换的速度感和竞技魅力[6]"
        "2018年世界羽毛球锦标赛抽签结果7月17日,2018年南京羽毛球世锦赛抽签出炉。男单中国获得满额席位,石宇奇、谌龙、林丹和黄宇翔全部被分到了上半区"
        "。"};

    // Copy of the loaded analyzer switched to fine-grained segmentation.
    ChineseAnalyzer analyzer2(analyzer);
    analyzer2.SetCutGrain(CutGrain::kFine);

    // Prints every term as "<index>#<text>@<word offset>#" on a single tab-separated line.
    const auto print_terms = [](const TermList &terms) {
        for (unsigned i = 0; i < terms.size(); ++i) {
            std::cout << "\t" << i << "#" << terms[i].text_ << "@" << terms[i].word_offset_ << "#";
        }
        std::cout << std::endl;
    };

    for (auto &query : queries) {
        // Default (coarse) segmentation first ...
        TermList term_list;
        analyzer.Analyze(query, term_list);
        std::cout << "Text #" << query << "# parsed as:" << std::endl;
        print_terms(term_list);

        // ... then the fine-grained segmentation of the same query.
        TermList term_list2;
        analyzer2.Analyze(query, term_list2);
        print_terms(term_list2);
    }
}
1 change: 1 addition & 0 deletions src/unit_test/parser/search_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ _exists_:"author" AND page_count:zzz^1.3 AND (name:星空^0.1 OR name:邓肯^1.2
吉祥物“羽宝”头部
nanjing吉祥物"羽宝"头部head "DS-K3AJ303/Dm140"
吉祥物nanjing"DS-K3AJ303/Dm140"头部
graphic cards
)##";

Map<String, String> column2analyzer;
Expand Down
Loading