From 0b767ad632612b5c3c9e1ee353c7b5aa91e3c64c Mon Sep 17 00:00:00 2001
From: mpostma
Date: Thu, 9 Sep 2021 10:10:12 +0200
Subject: [PATCH] Revert "fix clippy warnings"

This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620.
---
 cli/src/main.rs | 6 +--
 http-ui/src/main.rs | 6 +--
 milli/src/documents/builder.rs | 7 +--
 milli/src/documents/mod.rs | 14 +++---
 milli/src/documents/reader.rs | 11 ++---
 milli/src/documents/serde.rs | 11 ++---
 milli/src/fields_ids_map.rs | 2 +-
 .../facet/facet_level_value_f64_codec.rs | 12 +++--
 .../facet_string_level_zero_value_codec.rs | 2 +-
 .../facet_string_zero_bounds_value_codec.rs | 4 +-
 milli/src/index.rs | 6 +--
 milli/src/search/criteria/asc_desc.rs | 2 +-
 milli/src/search/criteria/attribute.rs | 6 +--
 milli/src/search/criteria/mod.rs | 45 ++++++++++---------
 milli/src/search/criteria/proximity.rs | 6 +--
 milli/src/search/criteria/typo.rs | 6 +--
 milli/src/search/criteria/words.rs | 5 ++-
 milli/src/search/distinct/mod.rs | 3 +-
 milli/src/search/facet/filter_condition.rs | 2 +-
 milli/src/search/mod.rs | 2 +-
 milli/src/update/available_documents_ids.rs | 4 +-
 milli/src/update/delete_documents.rs | 13 +++---
 milli/src/update/facets.rs | 10 ++---
 milli/src/update/index_documents/mod.rs | 17 +++----
 milli/src/update/index_documents/transform.rs | 14 +++---
 milli/src/update/settings.rs | 24 +++++-----
 milli/src/update/words_level_positions.rs | 4 +-
 milli/src/update/words_prefixes_fst.rs | 6 ++-
 milli/tests/search/mod.rs | 5 ++-
 29 files changed, 135 insertions(+), 120 deletions(-)

diff --git a/cli/src/main.rs b/cli/src/main.rs
index 08f86c8709..9d766c2f94 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -200,7 +200,7 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa
 fn documents_from_jsonl(reader: impl Read) -> Result> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents =
-        milli::documents::DocumentsBuilder::new(&mut writer)?;
+        milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?;
 
     let values = serde_json::Deserializer::from_reader(reader)
         .into_iter::>();
@@ -216,7 +216,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> {
 fn documents_from_json(reader: impl Read) -> Result> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents =
-        milli::documents::DocumentsBuilder::new(&mut writer)?;
+        milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?;
 
     let json: serde_json::Value = serde_json::from_reader(reader)?;
     documents.add_documents(json)?;
@@ -228,7 +228,7 @@ fn documents_from_json(reader: impl Read) -> Result> {
 fn documents_from_csv(reader: impl Read) -> Result> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents =
-        milli::documents::DocumentsBuilder::new(&mut writer)?;
+        milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?;
 
     let mut records = csv::Reader::from_reader(reader);
     let iter = records.deserialize::>();
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index b3c533b698..4ad9595ff1 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -1020,7 +1020,7 @@ async fn main() -> anyhow::Result<()> {
 fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents =
-        milli::documents::DocumentsBuilder::new(&mut writer)?;
+        milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?;
 
     let values = serde_json::Deserializer::from_reader(reader)
         .into_iter::>();
@@ -1036,7 +1036,7 @@ fn documents_from_jsonl(reader:
impl io::Read) -> anyhow::Result> { fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = - milli::documents::DocumentsBuilder::new(&mut writer)?; + milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?; let json: serde_json::Value = serde_json::from_reader(reader)?; documents.add_documents(json)?; @@ -1048,7 +1048,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = - milli::documents::DocumentsBuilder::new(&mut writer)?; + milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?; let mut records = csv::Reader::from_reader(reader); let iter = records.deserialize::>(); diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index b93844f4cd..ad23715184 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,22 +1,23 @@ use std::io; +use bimap::BiHashMap; use byteorder::{BigEndian, WriteBytesExt}; use serde::ser::Serialize; use super::serde::DocumentsSerilializer; -use super::{ByteCounter, DocumentsMetadata, Error, AdditionIndex}; +use super::{ByteCounter, DocumentsMetadata, Error}; +use crate::FieldId; pub struct DocumentsBuilder { serializer: DocumentsSerilializer, } impl DocumentsBuilder { - pub fn new(writer: W) -> Result { + pub fn new(writer: W, index: BiHashMap) -> Result { let mut writer = ByteCounter::new(writer); // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; - let index = AdditionIndex::new(); let serializer = DocumentsSerilializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true }; diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index f594a9b688..1fa7884d49 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -16,12 +16,10 @@ pub use reader::DocumentsReader; use crate::FieldId; -type AdditionIndex = BiHashMap; - #[derive(Debug, Serialize, Deserialize)] struct DocumentsMetadata { count: usize, - index: AdditionIndex, + index: BiHashMap, } pub struct ByteCounter { @@ -91,7 +89,7 @@ macro_rules! 
documents { let documents = serde_json::json!($data); let mut writer = std::io::Cursor::new(Vec::new()); let mut builder = - crate::documents::DocumentsBuilder::new(&mut writer).unwrap(); + crate::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new()).unwrap(); builder.add_documents(documents).unwrap(); builder.finish().unwrap(); @@ -122,7 +120,7 @@ mod test { let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); - let mut builder = DocumentsBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); builder.add_documents(json).unwrap(); @@ -151,7 +149,7 @@ mod test { let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); - let mut builder = DocumentsBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); builder.add_documents(doc1).unwrap(); builder.add_documents(doc2).unwrap(); @@ -180,7 +178,7 @@ mod test { let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); - let mut builder = DocumentsBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); builder.add_documents(docs).unwrap(); @@ -203,7 +201,7 @@ mod test { let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); - let mut builder = DocumentsBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); let docs = json!([[ { "toto": false }, diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 844086f6a2..1f41134542 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -2,10 +2,11 @@ use std::io; use std::io::{BufReader, Read}; use std::mem::size_of; +use bimap::BiHashMap; use byteorder::{BigEndian, ReadBytesExt}; use obkv::KvReader; -use super::{DocumentsMetadata, Error, AdditionIndex}; +use super::{DocumentsMetadata, Error}; use crate::FieldId; pub struct DocumentsReader { @@ -38,9 +39,9 @@ impl DocumentsReader { /// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a /// reference to the addition index. - pub fn next_document_with_index( - &mut self, - ) -> io::Result)>> { + pub fn next_document_with_index<'a>( + &'a mut self, + ) -> io::Result, KvReader<'a, FieldId>)>> { if self.seen_documents < self.metadata.count { let doc_len = self.reader.read_u32::()?; self.buffer.resize(doc_len as usize, 0); @@ -55,7 +56,7 @@ impl DocumentsReader { } /// Return the fields index for the documents batch. 
- pub fn index(&self) -> &AdditionIndex { + pub fn index(&self) -> &BiHashMap { &self.metadata.index } diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 78d676df83..d513d38407 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -1,17 +1,18 @@ use std::convert::TryInto; use std::{fmt, io}; +use bimap::BiHashMap; use byteorder::{BigEndian, WriteBytesExt}; use obkv::KvWriter; use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer}; -use super::{AdditionIndex, ByteCounter, Error}; +use super::{ByteCounter, Error}; use crate::FieldId; pub struct DocumentsSerilializer { pub writer: ByteCounter, pub buffer: Vec, - pub index: AdditionIndex, + pub index: BiHashMap, pub count: usize, pub allow_seq: bool, } @@ -224,7 +225,7 @@ impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> { pub struct MapSerializer<'a, W> { map: KvWriter>, FieldId>, - index: &'a mut AdditionIndex, + index: &'a mut BiHashMap, writer: W, buffer: Vec, } @@ -248,7 +249,7 @@ impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?; self.writer.write_u32::(data_len).map_err(Error::Io)?; - self.writer.write_all(data).map_err(Error::Io)?; + self.writer.write_all(&data).map_err(Error::Io)?; Ok(()) } @@ -276,7 +277,7 @@ impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { } struct FieldSerializer<'a> { - index: &'a mut AdditionIndex, + index: &'a mut BiHashMap, } impl<'a> serde::Serializer for FieldSerializer<'a> { diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 769be1fb5a..b0a084c3cb 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -65,7 +65,7 @@ impl FieldsIdsMap { } /// Iterate over the ids in the order of the ids. - pub fn ids(&self) -> impl Iterator + '_ { + pub fn ids<'a>(&'a self) -> impl Iterator + 'a { self.ids_names.keys().copied() } diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs index 65dbd505b5..1e66427caa 100644 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -34,11 +34,11 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { let mut buffer = [0u8; 32]; - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - let len = if *level != 0 { + // Write the globally ordered floats. + let bytes = f64_into_bytes(*left)?; + buffer[..8].copy_from_slice(&bytes[..]); + let bytes = f64_into_bytes(*right)?; buffer[8..16].copy_from_slice(&bytes[..]); @@ -51,6 +51,10 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { 32 // length } else { + // Write the globally ordered floats. + let bytes = f64_into_bytes(*left)?; + buffer[..8].copy_from_slice(&bytes[..]); + // Then the f64 values just to be able to read them back. 
let bytes = left.to_be_bytes(); buffer[8..16].copy_from_slice(&bytes[..]); diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs index d1605e6ef9..22031c4746 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -34,7 +34,7 @@ where type EItem = (&'a str, C::EItem); fn bytes_encode((string, value): &'a Self::EItem) -> Option> { - let value_bytes = C::bytes_encode(value)?; + let value_bytes = C::bytes_encode(&value)?; let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); encode_prefix_string(string, &mut bytes).ok()?; diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs index 90ba09ae2e..337433c2bc 100644 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -66,14 +66,14 @@ where bytes.extend_from_slice(left.as_bytes()); bytes.extend_from_slice(right.as_bytes()); - let value_bytes = C::bytes_encode(value)?; + let value_bytes = C::bytes_encode(&value)?; bytes.extend_from_slice(&value_bytes[..]); Some(Cow::Owned(bytes)) } None => { bytes.push(0); - let value_bytes = C::bytes_encode(value)?; + let value_bytes = C::bytes_encode(&value)?; bytes.extend_from_slice(&value_bytes[..]); Some(Cow::Owned(bytes)) } diff --git a/milli/src/index.rs b/milli/src/index.rs index c04caa32b9..24c96a0d87 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -218,7 +218,7 @@ impl Index { /// Writes the documents primary key, this is the field name that is used to store the id. pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { self.set_updated_at(wtxn, &Utc::now())?; - self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, primary_key) + self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key) } /// Deletes the primary key of the documents, this can be done to reset indexes settings. @@ -741,7 +741,7 @@ impl Index { let kv = self .documents .get(rtxn, &BEU32::new(id))? - .ok_or(UserError::UnknownInternalDocumentId { document_id: id })?; + .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?; documents.push((id, kv)); } @@ -795,7 +795,7 @@ impl Index { wtxn: &mut RwTxn, time: &DateTime, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>>(wtxn, main_key::UPDATED_AT_KEY, time) + self.main.put::<_, Str, SerdeJson>>(wtxn, main_key::UPDATED_AT_KEY, &time) } } diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index bf015c5fc4..6d50c1bb51 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -115,7 +115,7 @@ impl<'t> Criterion for AscDesc<'t> { let mut candidates = match (&self.query_tree, candidates) { (_, Some(candidates)) => candidates, (Some(qt), None) => { - let context = CriteriaBuilder::new(self.rtxn, self.index)?; + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; resolve_query_tree(&context, qt, params.wdcache)? 
} (None, None) => self.index.documents_ids(self.rtxn)?, diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 25a26c2f6c..6e0bb40d5b 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -288,7 +288,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { for query in queries { match &query.kind { QueryKind::Exact { word, .. } => { - if !query.prefix || ctx.in_prefix_cache(word) { + if !query.prefix || ctx.in_prefix_cache(&word) { let word = Cow::Borrowed(query.kind.word()); if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? @@ -296,7 +296,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { inner.push(word_level_iterator); } } else { - for (word, _) in word_derivations(word, true, 0, ctx.words_fst(), wdcache)? + for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { let word = Cow::Owned(word.to_owned()); if let Some(word_level_iterator) = @@ -309,7 +309,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { } QueryKind::Tolerant { typo, word } => { for (word, _) in - word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)? + word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { let word = Cow::Owned(word.to_owned()); if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index dfd6dad0fc..2a883de674 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -132,11 +132,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { } fn word_docids(&self, word: &str) -> heed::Result> { - self.index.word_docids.get(self.rtxn, word) + self.index.word_docids.get(self.rtxn, &word) } fn word_prefix_docids(&self, word: &str) -> heed::Result> { - self.index.word_prefix_docids.get(self.rtxn, word) + self.index.word_prefix_docids.get(self.rtxn, &word) } fn word_pair_proximity_docids( @@ -282,7 +282,7 @@ impl<'t> CriteriaBuilder<'t> { let mut criterion = Box::new(Initial::new(query_tree, filtered_candidates)) as Box; - for name in self.index.criteria(self.rtxn)? { + for name in self.index.criteria(&self.rtxn)? { criterion = match name { Name::Words => Box::new(Words::new(self, criterion)), Name::Typo => Box::new(Typo::new(self, criterion)), @@ -291,14 +291,14 @@ impl<'t> CriteriaBuilder<'t> { for asc_desc in sort_criteria { criterion = match asc_desc { AscDescName::Asc(field) => Box::new(AscDesc::asc( - self.index, - self.rtxn, + &self.index, + &self.rtxn, criterion, field.to_string(), )?), AscDescName::Desc(field) => Box::new(AscDesc::desc( - self.index, - self.rtxn, + &self.index, + &self.rtxn, criterion, field.to_string(), )?), @@ -312,10 +312,10 @@ impl<'t> CriteriaBuilder<'t> { Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => { - Box::new(AscDesc::asc(self.index, self.rtxn, criterion, field)?) + Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) } Name::Desc(field) => { - Box::new(AscDesc::desc(self.index, self.rtxn, criterion, field)?) + Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) } }; } @@ -418,25 +418,25 @@ fn query_docids( ) -> Result { match &query.kind { QueryKind::Exact { word, .. 
} => { - if query.prefix && ctx.in_prefix_cache(word) { - Ok(ctx.word_prefix_docids(word)?.unwrap_or_default()) + if query.prefix && ctx.in_prefix_cache(&word) { + Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default()) } else if query.prefix { - let words = word_derivations(word, true, 0, ctx.words_fst(), wdcache)?; + let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { - let current_docids = ctx.word_docids(word)?.unwrap_or_default(); + let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); docids |= current_docids; } Ok(docids) } else { - Ok(ctx.word_docids(word)?.unwrap_or_default()) + Ok(ctx.word_docids(&word)?.unwrap_or_default()) } } QueryKind::Tolerant { typo, word } => { - let words = word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)?; + let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { - let current_docids = ctx.word_docids(word)?.unwrap_or_default(); + let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); docids |= current_docids; } Ok(docids) @@ -480,7 +480,8 @@ fn query_pair_proximity_docids( } } (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); + let l_words = + word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { @@ -504,17 +505,17 @@ fn query_pair_proximity_docids( } } (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_derivations(right, prefix, *typo, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], r_words, proximity) + let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } ( QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }, ) => { let l_words = - word_derivations(left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); - let r_words = word_derivations(right, prefix, *r_typo, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &l_words, r_words, proximity) + word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); + let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; + all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) } } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index d2a08af6de..f884de160c 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -100,7 +100,7 @@ impl<'t> Criterion for Proximity<'t> { // use set theory based algorithm resolve_candidates( self.ctx, - query_tree, + &query_tree, self.proximity, &mut self.candidates_cache, params.wdcache, @@ -528,7 +528,7 @@ fn resolve_plane_sweep_candidates( match kind { QueryKind::Exact { word, .. 
} => { if *prefix { - let iter = word_derivations(word, true, 0, words_positions) + let iter = word_derivations(word, true, 0, &words_positions) .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } else if let Some(positions) = words_positions.get(word) { @@ -536,7 +536,7 @@ fn resolve_plane_sweep_candidates( } } QueryKind::Tolerant { typo, word } => { - let iter = word_derivations(word, *prefix, *typo, words_positions) + let iter = word_derivations(word, *prefix, *typo, &words_positions) .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index c06cf14def..97a9b4e4be 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -69,7 +69,7 @@ impl<'t> Criterion for Typo<'t> { let fst = self.ctx.words_fst(); let new_query_tree = match self.typos { typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree( - fst, + &fst, query_tree.clone(), self.typos, params.wdcache, @@ -78,7 +78,7 @@ impl<'t> Criterion for Typo<'t> { // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, // we keep the altered query tree *query_tree = alterate_query_tree( - fst, + &fst, query_tree.clone(), self.typos, params.wdcache, @@ -199,7 +199,7 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) } // Because Phrases don't allow typos, no alteration can be done. - Phrase(_words) => Ok(()), + Phrase(_words) => return Ok(()), Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index b67b7f6b4f..ccc6c06173 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -53,7 +53,10 @@ impl<'t> Criterion for Words<'t> { None => None, }; - let bucket_candidates = self.bucket_candidates.as_mut().map(take); + let bucket_candidates = match self.bucket_candidates.as_mut() { + Some(bucket_candidates) => Some(take(bucket_candidates)), + None => None, + }; return Ok(Some(CriterionResult { query_tree: Some(query_tree), diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index ca4b96e6bd..6010a4410e 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -29,6 +29,7 @@ mod test { use std::collections::HashSet; use std::io::Cursor; + use bimap::BiHashMap; use once_cell::sync::Lazy; use rand::seq::SliceRandom; use rand::Rng; @@ -48,7 +49,7 @@ mod test { let num_docs = rng.gen_range(10..30); let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentsBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 4edea7f717..a92797e901 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -404,7 +404,7 @@ impl FilterCondition { GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), Equal(number, string) => { let (_original_value, string_docids) = - strings_db.get(rtxn, &(field_id, string))?.unwrap_or_default(); + strings_db.get(rtxn, &(field_id, 
&string))?.unwrap_or_default(); let number_docids = match number { Some(n) => { let n = Included(*n); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index b594eac660..56002b2e3b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -139,7 +139,7 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed()); let matching_words = match query_tree.as_ref() { - Some(query_tree) => MatchingWords::from_query_tree(query_tree), + Some(query_tree) => MatchingWords::from_query_tree(&query_tree), None => MatchingWords::default(), }; diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 5ab06a8266..653bc7dd27 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -1,4 +1,4 @@ -use std::iter::Chain; +use std::iter::{Chain, FromIterator}; use std::ops::RangeInclusive; use roaring::bitmap::{IntoIter, RoaringBitmap}; @@ -11,7 +11,7 @@ impl AvailableDocumentsIds { pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { match docids.max() { Some(last_id) => { - let mut available: RoaringBitmap = (0..last_id).collect(); + let mut available = RoaringBitmap::from_iter(0..last_id); available -= docids; let iter = match last_id.checked_add(1) { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index f5c4d2495f..74975e952f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -79,10 +79,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let primary_key = self.index.primary_key(self.wtxn)?.ok_or( + let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| { InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, key: Some(main_key::PRIMARY_KEY_KEY), + } })?; // If we can't find the id of the primary key it means that the database @@ -195,7 +196,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We create an iterator to be able to get the content and delete the word docids. // It's faster to acquire a cursor to get and delete or put, as we avoid traversing // the LMDB B-Tree two times but only once. - let mut iter = word_docids.prefix_iter_mut(self.wtxn, word)?; + let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; if let Some((key, mut docids)) = iter.next().transpose()? { if key == word.as_ref() { let previous_len = docids.len(); @@ -468,7 +469,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( // level key. We must then parse the value using the appropriate codec. let (group, mut docids) = FacetStringZeroBoundsValueCodec::::bytes_decode(val) - .ok_or(SerializationError::Decoding { db_name })?; + .ok_or_else(|| SerializationError::Decoding { db_name })?; let previous_len = docids.len(); docids -= to_remove; @@ -480,7 +481,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( let val = &(group, docids); let value_bytes = FacetStringZeroBoundsValueCodec::::bytes_encode(val) - .ok_or(SerializationError::Encoding { db_name })?; + .ok_or_else(|| SerializationError::Encoding { db_name })?; // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(&key, &value_bytes)? }; @@ -490,7 +491,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( // The key corresponds to a level zero facet string. 
let (original_value, mut docids) = FacetStringLevelZeroValueCodec::bytes_decode(val) - .ok_or(SerializationError::Decoding { db_name })?; + .ok_or_else(|| SerializationError::Decoding { db_name })?; let previous_len = docids.len(); docids -= to_remove; @@ -501,7 +502,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( let key = key.to_owned(); let val = &(original_value, docids); let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) - .ok_or(SerializationError::Encoding { db_name })?; + .ok_or_else(|| SerializationError::Encoding { db_name })?; // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(&key, &value_bytes)? }; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 44fd007222..9b7d6d42c2 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -153,8 +153,8 @@ fn clear_field_number_levels<'t>( db.delete_range(wtxn, &range).map(drop) } -fn compute_facet_number_levels( - rtxn: &heed::RoTxn, +fn compute_facet_number_levels<'t>( + rtxn: &'t heed::RoTxn, db: heed::Database, compression_type: CompressionType, compression_level: Option, @@ -228,7 +228,7 @@ fn write_number_entry( ) -> Result<()> { let key = (field_id, level, left, right); let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(ids).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) } @@ -272,8 +272,8 @@ fn clear_field_string_levels<'t>( db.remap_key_type::().delete_range(wtxn, &range).map(drop) } -fn compute_facet_string_levels( - rtxn: &heed::RoTxn, +fn compute_facet_string_levels<'t>( + rtxn: &'t heed::RoTxn, db: heed::Database, compression_type: CompressionType, compression_level: Option, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4b2729417e..b5b5abdbd1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -40,12 +40,6 @@ pub struct DocumentAdditionResult { pub nb_documents: usize, } -#[derive(Debug, Copy, Clone)] -pub enum WriteMethod { - Append, - GetMergePut, -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[non_exhaustive] pub enum IndexDocumentsMethod { @@ -58,6 +52,12 @@ pub enum IndexDocumentsMethod { UpdateDocuments, } +#[derive(Debug, Copy, Clone)] +pub enum WriteMethod { + Append, + GetMergePut, +} + pub struct IndexDocuments<'t, 'u, 'i, 'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, @@ -143,7 +143,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let update_id = self.update_id; let progress_callback = |step| progress_callback(step, update_id); let transform = Transform { - rtxn: self.wtxn, + rtxn: &self.wtxn, index: self.index, log_every_n: self.log_every_n, chunk_compression_type: self.chunk_compression_type, @@ -421,6 +421,7 @@ mod tests { use std::io::Cursor; use big_s::S; + use bimap::BiHashMap; use heed::EnvOpenOptions; use super::*; @@ -855,7 +856,7 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentsBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); builder.add_documents(big_object).unwrap(); builder.finish().unwrap(); cursor.set_position(0); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8b1729575b..beac58b3e9 
100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -66,10 +66,10 @@ fn create_fields_mapping( // we sort by id here to ensure a deterministic mapping of the fields, that preserves // the original ordering. .sorted_by_key(|(&id, _)| id) - .map(|(field, name)| match fields_id_map.id(name) { + .map(|(field, name)| match fields_id_map.id(&name) { Some(id) => Ok((*field, id)), None => fields_id_map - .insert(name) + .insert(&name) .ok_or(Error::UserError(UserError::AttributeLimitReached)) .map(|id| (*field, id)), }) @@ -163,9 +163,9 @@ impl Transform<'_, '_> { } }, Value::Number(number) => number.to_string(), - document_id => { + content => { return Err(UserError::InvalidDocumentId { - document_id, + document_id: content.clone(), } .into()) } @@ -179,7 +179,7 @@ impl Transform<'_, '_> { let mut json = Map::new(); for (key, value) in document.iter() { let key = addition_index.get_by_left(&key).cloned(); - let value = serde_json::from_slice::(value).ok(); + let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { json.insert(k, v); @@ -199,7 +199,7 @@ impl Transform<'_, '_> { // Insertion in a obkv need to be done with keys ordered. For now they are ordered // according to the document addition key order, so we sort it according to the // fieldids map keys order. - field_buffer.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); + field_buffer.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2)); // The last step is to build the new new obkv document, and insert it in the sorter. let mut writer = obkv::KvWriter::new(&mut obkv_buffer); @@ -289,7 +289,7 @@ impl Transform<'_, '_> { replaced_documents_ids.insert(docid); let key = BEU32::new(docid); - let base_obkv = self.index.documents.get(self.rtxn, &key)?.ok_or( + let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index f061f5f21b..5ccbacde24 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -197,12 +197,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let cb = |step| cb(step, update_id); // if the settings are set before any document update, we don't need to do anything, and // will set the primary key during the first document addition. - if self.index.number_of_documents(self.wtxn)? == 0 { + if self.index.number_of_documents(&self.wtxn)? == 0 { return Ok(()); } let transform = Transform { - rtxn: self.wtxn, + rtxn: &self.wtxn, index: self.index, log_every_n: self.log_every_n, chunk_compression_type: self.chunk_compression_type, @@ -215,13 +215,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // There already has been a document addition, the primary key should be set by now. let primary_key = - self.index.primary_key(self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; + self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; // We remap the documents fields based on the new `FieldsIdsMap`. let output = transform.remap_index_documents( primary_key.to_string(), old_fields_ids_map, - fields_ids_map, + fields_ids_map.clone(), )?; // We clear the full database (words-fst, documents ids and documents content). 
@@ -260,7 +260,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_distinct_field(&mut self) -> Result { match self.distinct_field { Setting::Set(ref attr) => { - self.index.put_distinct_field(self.wtxn, attr)?; + self.index.put_distinct_field(self.wtxn, &attr)?; } Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; @@ -286,11 +286,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; + new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; } for (_, name) in old_fields_ids_map.iter() { - new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; + new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_searchable_fields(self.wtxn, &names)?; @@ -440,7 +440,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_primary_key(&mut self) -> Result<()> { match self.primary_key { Setting::Set(ref primary_key) => { - if self.index.number_of_documents(self.wtxn)? == 0 { + if self.index.number_of_documents(&self.wtxn)? == 0 { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -451,7 +451,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } Setting::Reset => { - if self.index.number_of_documents(self.wtxn)? == 0 { + if self.index.number_of_documents(&self.wtxn)? == 0 { self.index.delete_primary_key(self.wtxn)?; Ok(()) } else { @@ -468,8 +468,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { { self.index.set_updated_at(self.wtxn, &Utc::now())?; - let old_faceted_fields = self.index.faceted_fields(self.wtxn)?; - let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?; + let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; self.update_displayed()?; self.update_filterable()?; @@ -481,7 +481,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, // an Asc/Desc criterion or a filtered attribute as be added or removed. - let new_faceted_fields = self.index.faceted_fields(self.wtxn)?; + let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?; let faceted_updated = old_faceted_fields != new_faceted_fields; let stop_words_updated = self.update_stop_words()?; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index b34b13b6f3..0af51fbb20 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -109,7 +109,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { // iter over all lines of the DB where the key is prefixed by the current prefix. let mut iter = db .remap_key_type::() - .prefix_iter(self.wtxn, prefix_bytes)? + .prefix_iter(self.wtxn, &prefix_bytes)? .remap_key_type::(); while let Some(((_word, level, left, right), data)) = iter.next().transpose()? 
{ // if level is 0, we push the line in the sorter @@ -262,7 +262,7 @@ fn write_level_entry( ) -> Result<()> { let key = (word, level, left, right); let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(ids).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) } diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 1141009405..eaaacc26f1 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -1,6 +1,7 @@ +use std::iter::FromIterator; use std::str; -use fst::{Streamer, set::OpBuilder}; +use fst::Streamer; use crate::{Index, Result, SmallString32}; @@ -50,6 +51,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { #[logging_timer::time("WordsPrefixesFst::{}")] pub fn execute(self) -> Result<()> { let words_fst = self.index.words_fst(&self.wtxn)?; + let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); for n in 1..=self.max_prefix_length { let mut current_prefix = SmallString32::new(); @@ -87,7 +89,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { } // We merge all of the previously computed prefixes into on final set. - let op: OpBuilder = prefix_fsts.iter().collect(); + let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); let mut builder = fst::SetBuilder::memory(); builder.extend_stream(op.r#union())?; let prefix_fst = builder.into_set(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c9942bb947..0bea481c09 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -6,8 +6,9 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::update::{Settings, UpdateBuilder}; +use milli::update::{IndexDocuments, Settings, UpdateBuilder}; use milli::documents::{DocumentsBuilder, DocumentsReader}; +use milli::update::{IndexDocuments, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -58,7 +59,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.enable_autogenerate_docids(); let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = - DocumentsBuilder::new(&mut cursor).unwrap(); + DocumentsBuilder::new(&mut cursor, bimap::BiHashMap::new()).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { documents_builder.add_documents(doc.unwrap()).unwrap();
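The change repeated across cli, http-ui, the distinct and search tests, and the documents module in this revert is the DocumentsBuilder constructor: it takes the target writer together with an explicit bimap::BiHashMap field index again, instead of creating an AdditionIndex internally. A minimal sketch of a call site with the patch applied follows; the Vec<u8> return type and the final writer.into_inner() are assumptions, since the generic parameters are not visible in the hunks above, and the empty BiHashMap is simply what the call sites in this patch pass.

    fn documents_from_json(reader: impl std::io::Read) -> anyhow::Result<Vec<u8>> {
        // Buffer that receives the serialized document batch.
        let mut writer = std::io::Cursor::new(Vec::new());

        // The caller now supplies the FieldId <-> field-name index up front,
        // as shown in the hunks above (previously the builder created it itself).
        let mut builder =
            milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?;

        // Feed the JSON payload, then let the builder write its metadata footer.
        let json: serde_json::Value = serde_json::from_reader(reader)?;
        builder.add_documents(json)?;
        builder.finish()?;

        // Assumed: the batch is returned as the underlying byte buffer.
        Ok(writer.into_inner())
    }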