From ac86fb1b64759f90968cc1262d5daa01b158b04c Mon Sep 17 00:00:00 2001 From: Mikhail Iudin Date: Mon, 26 Aug 2024 19:12:34 +0200 Subject: [PATCH] Add min version for ios simulator Add ascii folding --- rust/Makefile | 12 ++--- rust/src/tantivy_util/highlights.rs | 68 +++++++++++++++++++++++++++++ rust/src/tantivy_util/tokenizer.rs | 5 ++- tantivy_test.go | 47 +++++++++++++++++++- 4 files changed, 124 insertions(+), 8 deletions(-) diff --git a/rust/Makefile b/rust/Makefile index 2dbeae6..801be38 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -85,42 +85,42 @@ install-debug-android-amd64: build-debug-android-amd64 @cp target/x86_64-linux-android/debug/libtantivy_go.a ../libs/android-amd64 build-ios-arm64: - cargo build --release --target aarch64-apple-ios + env IPHONEOS_DEPLOYMENT_TARGET=15.0 cargo build --release --target aarch64-apple-ios install-ios-arm64: build-ios-arm64 @mkdir -p ../libs/ios-arm64 @cp target/aarch64-apple-ios/release/libtantivy_go.a ../libs/ios-arm64 build-debug-ios-arm64: - cargo build --target aarch64-apple-ios + env IPHONEOS_DEPLOYMENT_TARGET=15.0 cargo build --target aarch64-apple-ios install-debug-ios-arm64: build-debug-ios-arm64 @mkdir -p ../libs/ios-arm64 @cp target/aarch64-apple-ios/debug/libtantivy_go.a ../libs/ios-arm64 build-ios-arm64-sim: - cargo build --release --target aarch64-apple-ios-sim + env IPHONEOS_DEPLOYMENT_TARGET=15.0 cargo build --release --target aarch64-apple-ios-sim install-ios-arm64-sim: build-ios-arm64-sim @mkdir -p ../libs/ios-arm64-sim @cp target/aarch64-apple-ios-sim/release/libtantivy_go.a ../libs/ios-arm64-sim build-debug-ios-arm64-sim: - cargo build --target aarch64-apple-ios-sim + env IPHONEOS_DEPLOYMENT_TARGET=15.0 cargo build --target aarch64-apple-ios-sim install-debug-ios-arm64-sim: build-debug-ios-arm64-sim @mkdir -p ../libs/ios-arm64-sim @cp target/aarch64-apple-ios-sim/debug/libtantivy_go.a ../libs/ios-arm64-sim build-ios-amd64: - cargo build --release --target x86_64-apple-ios + env IPHONEOS_DEPLOYMENT_TARGET=15.0 cargo build --release --target x86_64-apple-ios install-ios-amd64: build-ios-amd64 @mkdir -p ../libs/ios-amd64 @cp target/x86_64-apple-ios/release/libtantivy_go.a ../libs/ios-amd64 build-debug-ios-amd64: - cargo build --target x86_64-apple-ios + env IPHONEOS_DEPLOYMENT_TARGET=15.0 cargo build --target x86_64-apple-ios install-debug-ios-amd64: build-debug-ios-amd64 @mkdir -p ../libs/ios-amd64 diff --git a/rust/src/tantivy_util/highlights.rs b/rust/src/tantivy_util/highlights.rs index 50244c9..da03166 100644 --- a/rust/src/tantivy_util/highlights.rs +++ b/rust/src/tantivy_util/highlights.rs @@ -38,4 +38,72 @@ pub fn find_highlights( } } Ok(highlights) +} + +mod tests { + use tantivy::tokenizer::*; + use tantivy::schema::*; + use tantivy::{Index, DocAddress, doc, DocId}; + use tantivy::collector::TopDocs; + use tantivy::query::QueryParser; + use tantivy::schema::document::DocumentDeserialize; + + #[test] + fn test_ascii_folding_filter() { + // Определяем схему + let mut schema_builder = Schema::builder(); + let mut text_options = TEXT; + text_options = text_options | STORED; + text_options = text_options.set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("custom") + .set_index_option(IndexRecordOption::WithFreqsAndPositions) + ); + let text = schema_builder.add_text_field("text", text_options); + let schema = schema_builder.build(); + + // Создаем индекс + let index = Index::create_in_ram(schema.clone()); + + // Создаем кастомный токенайзер с AsciiFoldingFilter + let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) + .filter(LowerCaser) + .filter(AsciiFoldingFilter) + .build(); + + index.tokenizers().register("custom", tokenizer); + + // Добавляем документы в индекс + let mut index_writer = index.writer(50_000_000).unwrap(); + index_writer.add_document(doc!(text => "strasse")); + index_writer.add_document(doc!(text => "straße")); + index_writer.commit().unwrap(); + + // Создаем QueryParser с кастомным токенайзером + let query_parser = QueryParser::for_index(&index, vec![text]); + + // Выполняем поиск по "strasse" + let searcher = index.reader().unwrap().searcher(); + let query = query_parser.parse_query("straße").unwrap(); + let top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + + assert_eq!(top_docs.len(), 2); + + // Проверяем совпадение документов + if let Some((_, doc_address)) = top_docs.get(0) { + let first_doc : TantivyDocument = searcher.doc(*doc_address).unwrap(); + let first_text = first_doc.get_first(text).unwrap().as_str().unwrap(); + assert!(first_text == "strasse" || first_text == "straße"); + } else { + panic!("First document not found"); + } + + if let Some((_, doc_address)) = top_docs.get(1) { + let second_doc : TantivyDocument= searcher.doc(*doc_address).unwrap(); + let second_text = second_doc.get_first(text).unwrap().as_str().unwrap(); + assert!(second_text == "strasse" || second_text == "straße"); + } else { + panic!("Second document not found"); + } + } } \ No newline at end of file diff --git a/rust/src/tantivy_util/tokenizer.rs b/rust/src/tantivy_util/tokenizer.rs index f699e82..7dceb19 100644 --- a/rust/src/tantivy_util/tokenizer.rs +++ b/rust/src/tantivy_util/tokenizer.rs @@ -1,5 +1,5 @@ use tantivy::{Index, TantivyError}; -use tantivy::tokenizer::{LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, SimpleTokenizer, TextAnalyzer}; +use tantivy::tokenizer::{AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, SimpleTokenizer, TextAnalyzer}; use crate::tantivy_util::{EdgeNgramTokenizer}; use crate::tantivy_util::stemmer::create_stemmer; @@ -21,6 +21,7 @@ pub fn register_edge_ngram_tokenizer( limit )) .filter(LowerCaser) + .filter(AsciiFoldingFilter) .build(); register_tokenizer(index, tokenizer_name, text_analyzer); @@ -35,6 +36,7 @@ pub fn register_simple_tokenizer( let text_analyzer = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(text_limit)) .filter(LowerCaser) + .filter(AsciiFoldingFilter) .filter(create_stemmer(lang)) .build(); @@ -62,6 +64,7 @@ pub fn register_ngram_tokenizer( let text_analyzer = TextAnalyzer::builder(tokenizer) .filter(LowerCaser) + .filter(AsciiFoldingFilter) .build(); register_tokenizer(index, tokenizer_name, text_analyzer); diff --git a/tantivy_test.go b/tantivy_test.go index d8651ce..b24e8b9 100644 --- a/tantivy_test.go +++ b/tantivy_test.go @@ -6,9 +6,10 @@ import ( "os" "testing" - "github.com/anyproto/tantivy-go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/anyproto/tantivy-go" ) const NameBody = "body" @@ -199,6 +200,50 @@ func Test(t *testing.T) { require.Equal(t, uint64(0), docs) }) + t.Run("docs search - when ascii folding", func(t *testing.T) { + _, index := fx(t, limit, 1, false) + + defer index.Free() + + doc, err := addDoc(t, "Idées fête", "mères straße", "1", index) + require.NoError(t, err) + + err = index.AddAndConsumeDocuments(doc) + require.NoError(t, err) + + docs, err := index.NumDocs() + require.NoError(t, err) + require.Equal(t, uint64(1), docs) + + result, err := index.Search("Idées fête", 100, true, NameTitle) + require.NoError(t, err) + + size, err := result.GetSize() + defer result.Free() + require.Equal(t, 1, int(size)) + + result2, err := index.Search("idees fete", 100, true, NameTitle) + require.NoError(t, err) + + size2, err := result2.GetSize() + defer result2.Free() + require.Equal(t, 1, int(size2)) + + result3, err := index.Search("straße", 100, true, NameBody) + require.NoError(t, err) + + size3, err := result3.GetSize() + defer result3.Free() + require.Equal(t, 1, int(size3)) + + result4, err := index.Search("strasse", 100, true, NameBody) + require.NoError(t, err) + + size4, err := result4.GetSize() + defer result4.Free() + require.Equal(t, 1, int(size4)) + }) + t.Run("docs search and remove - when fast", func(t *testing.T) { _, index := fx(t, limit, minGram, false)