diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 550e7f13d4..1c9520f7ed 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -53,9 +53,24 @@ pub type Attribute = u32; pub type DocumentId = u32; pub type FieldId = u16; pub type Position = u32; +pub type RelativePosition = u16; pub type FieldDistribution = BTreeMap; pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>; +pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; + +/// Convert an absolute word position into a relative position. +/// Return the field id of the attribute related to the absolute position (its upper 16 bits) +/// and the relative position in the attribute (its lower 16 bits). +pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) { + ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16) +} + +/// Compute the absolute word position from the field id of the attribute (placed in the upper 16 bits) and the relative position in the attribute (placed in the lower 16 bits). +pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { + (field_id as u32) << 16 | (relative as u32) +} + /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], @@ -187,4 +202,26 @@ mod tests { // the distance of hard separators is clamped to 8 anyway. assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . 
"); } + + #[test] + fn test_relative_position_conversion() { + assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000)); + assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF)); + assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000)); + assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00)); + assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF)); + assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678)); + assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF)); + } + + #[test] + fn test_absolute_position_conversion() { + assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000)); + assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF)); + assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000)); + assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00)); + assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF)); + assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678)); + assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF)); + } } diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs index 083e5a9777..62f4901192 100644 --- a/milli/src/proximity.rs +++ b/milli/src/proximity.rs @@ -1,8 +1,7 @@ use std::cmp; -use crate::{Attribute, Position}; +use crate::{relative_from_absolute_position, Position}; -pub const ONE_ATTRIBUTE: u32 = 1000; pub const MAX_DISTANCE: u32 = 8; pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { @@ -14,19 +13,15 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { } pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { - let (lhs_attr, lhs_index) = extract_position(lhs); - let (rhs_attr, rhs_index) = extract_position(rhs); + let (lhs_attr, lhs_index) = relative_from_absolute_position(lhs); + let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs); if lhs_attr != rhs_attr { 
MAX_DISTANCE } else { - index_proximity(lhs_index, rhs_index) + index_proximity(lhs_index as u32, rhs_index as u32) } } -pub fn extract_position(position: Position) -> (Attribute, Position) { - (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE) -} - pub fn path_proximity(path: &[Position]) -> u32 { path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::() } diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 1e4d4e7a25..d415f45f37 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -10,7 +10,7 @@ use crate::search::criteria::{ resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; -use crate::{Result, TreeLevel}; +use crate::{absolute_from_relative_position, FieldId, Result, TreeLevel}; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -181,7 +181,7 @@ fn resolve_state( ctx.field_id_word_count_docids(id, query_len)? 
{ let mut attribute_candidates_array = - attribute_start_with_docids(ctx, id as u32, query)?; + attribute_start_with_docids(ctx, id, query)?; attribute_candidates_array.push(attribute_allowed_docids); candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -199,8 +199,7 @@ fn resolve_state( let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { - let attribute_candidates_array = - attribute_start_with_docids(ctx, id as u32, query)?; + let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -290,13 +289,13 @@ fn resolve_state( fn attribute_start_with_docids( ctx: &dyn Context, - attribute_id: u32, + attribute_id: FieldId, query: &[ExactQueryPart], ) -> heed::Result> { let lowest_level = TreeLevel::min_value(); let mut attribute_candidates_array = Vec::new(); // start from attribute first position - let mut pos = attribute_id * 1000; + let mut pos = absolute_from_relative_position(attribute_id, 0); for part in query { use ExactQueryPart::*; match part { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ca65f08748..df19125c68 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -10,8 +10,7 @@ use serde_json::Value; use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::proximity::ONE_ATTRIBUTE; -use crate::{FieldId, Result}; +use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; /// Extracts the word and positions where this word appear and /// prefixes it by the document id. 
@@ -63,7 +62,7 @@ pub fn extract_docid_word_positions( if let Some(field) = json_to_string(&value, &mut field_buffer) { let analyzed = analyzer.analyze(field); let tokens = process_tokens(analyzed.tokens()) - .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE); + .take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE); for (index, token) in tokens { let token = token.text().trim(); @@ -71,10 +70,10 @@ pub fn extract_docid_word_positions( key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(token.as_bytes()); - let position: u32 = index + let position: u16 = index .try_into() .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let position = field_id as u32 * ONE_ATTRIBUTE + position; + let position = absolute_from_relative_position(field_id, position); docid_word_positions_sorter .insert(&key_buffer, &position.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f9577243f4..b1677f4d7e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -884,6 +884,44 @@ mod tests { wtxn.commit().unwrap(); } + #[test] + fn index_more_than_1000_positions_in_a_field() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(50 * 1024 * 1024); // 50 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut big_object = HashMap::new(); + big_object.insert(S("id"), "wow"); + let content: String = + (0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap(); + big_object.insert("content".to_string(), &content); + + let mut cursor = Cursor::new(Vec::new()); + + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + builder.add_documents(big_object).unwrap(); + builder.finish().unwrap(); + cursor.set_position(0); + let content = DocumentBatchReader::from_reader(cursor).unwrap(); + + let builder = 
IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some()); + } + #[test] fn index_documents_with_zeroes() { let path = tempfile::tempdir().unwrap();