Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Remove limit of 1000 position per attribute
Browse files Browse the repository at this point in the history
Instead of using an arbitrary limit, we encode the absolute position in a u32,
using the high (most significant) u16 for the field id and the low (least significant) u16 for the relative position in the attribute.
  • Loading branch information
ManyTheFish committed Sep 22, 2021
1 parent ad3befa commit 131c0c9
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 24 deletions.
37 changes: 37 additions & 0 deletions milli/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,24 @@ pub type Attribute = u32;
pub type DocumentId = u32;
pub type FieldId = u16;
pub type Position = u32;
pub type RelativePosition = u16;
pub type FieldDistribution = BTreeMap<String, u64>;
pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>;

/// Number of distinct relative positions available inside a single attribute:
/// one more than `u16::MAX`, i.e. `0x1_0000`.
pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;

/// Splits an absolute word position into its two components.
///
/// The upper 16 bits of `absolute` carry the field id of the attribute and the
/// lower 16 bits carry the position of the word relative to that attribute.
///
/// Returns `(field_id, relative_position)`.
pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) {
    let field_id = (absolute >> 16) as FieldId;
    // Narrowing to 16 bits keeps only the low half, exactly like masking with 0xFFFF.
    let relative = absolute as RelativePosition;
    (field_id, relative)
}

// Compute the absolute word position with the field id of the attribute and relative position in the attribute.
pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
(field_id as u32) << 16 | (relative as u32)
}

/// Transform a raw obkv store into a JSON Object.
pub fn obkv_to_json(
displayed_fields: &[FieldId],
Expand Down Expand Up @@ -187,4 +202,26 @@ mod tests {
// the distance of hard separators is clamped to 8 anyway.
assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . ");
}

#[test]
fn test_relative_position_conversion() {
    // Each case pairs an absolute position with the expected
    // `(field id, relative position)` it must decompose into.
    let cases: &[(u32, (u16, u16))] = &[
        (0x00000000, (0x0000, 0x0000)),
        (0x0000FFFF, (0x0000, 0xFFFF)),
        (0xFFFF0000, (0xFFFF, 0x0000)),
        (0xFF00FF00, (0xFF00, 0xFF00)),
        (0xFF0000FF, (0xFF00, 0x00FF)),
        (0x12345678, (0x1234, 0x5678)),
        (0xFFFFFFFF, (0xFFFF, 0xFFFF)),
    ];

    for &(absolute, expected) in cases.iter() {
        assert_eq!(expected, relative_from_absolute_position(absolute));
    }
}

#[test]
fn test_absolute_position_conversion() {
    // Each case pairs a `(field id, relative position)` input with the
    // absolute position it must be combined into.
    let cases: &[((u16, u16), u32)] = &[
        ((0x0000, 0x0000), 0x00000000),
        ((0x0000, 0xFFFF), 0x0000FFFF),
        ((0xFFFF, 0x0000), 0xFFFF0000),
        ((0xFF00, 0xFF00), 0xFF00FF00),
        ((0xFF00, 0x00FF), 0xFF0000FF),
        ((0x1234, 0x5678), 0x12345678),
        ((0xFFFF, 0xFFFF), 0xFFFFFFFF),
    ];

    for &((field_id, relative), expected) in cases.iter() {
        assert_eq!(expected, absolute_from_relative_position(field_id, relative));
    }
}
}
13 changes: 4 additions & 9 deletions milli/src/proximity.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use std::cmp;

use crate::{Attribute, Position};
use crate::{relative_from_absolute_position, Position};

pub const ONE_ATTRIBUTE: u32 = 1000;
pub const MAX_DISTANCE: u32 = 8;

pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
Expand All @@ -14,19 +13,15 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
}

/// Computes the proximity between two absolute word positions.
///
/// Each position is split into its attribute and its index inside that attribute.
/// When the two attributes differ the result is `MAX_DISTANCE`; otherwise it is
/// `index_proximity` applied to the two in-attribute indexes.
pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 {
let (lhs_attr, lhs_index) = extract_position(lhs);
let (rhs_attr, rhs_index) = extract_position(rhs);
let (lhs_attr, lhs_index) = relative_from_absolute_position(lhs);
let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs);
if lhs_attr != rhs_attr {
MAX_DISTANCE
} else {
index_proximity(lhs_index, rhs_index)
index_proximity(lhs_index as u32, rhs_index as u32)
}
}

/// Splits `position` into `(attribute, index)`.
///
/// The attribute is the quotient of `position` by `ONE_ATTRIBUTE` and the
/// index is the remainder of that same division.
pub fn extract_position(position: Position) -> (Attribute, Position) {
    let attribute = position / ONE_ATTRIBUTE;
    let index_in_attribute = position % ONE_ATTRIBUTE;
    (attribute, index_in_attribute)
}

/// Sums the proximity of every pair of consecutive positions in `path`.
///
/// A path with fewer than two positions contains no pair and yields `0`.
pub fn path_proximity(path: &[Position]) -> u32 {
    let mut total: u32 = 0;
    for pair in path.windows(2) {
        total += positions_proximity(pair[0], pair[1]);
    }
    total
}
11 changes: 5 additions & 6 deletions milli/src/search/criteria/exactness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::search::criteria::{
resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
};
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::{Result, TreeLevel};
use crate::{absolute_from_relative_position, FieldId, Result, TreeLevel};

pub struct Exactness<'t> {
ctx: &'t dyn Context<'t>,
Expand Down Expand Up @@ -181,7 +181,7 @@ fn resolve_state(
ctx.field_id_word_count_docids(id, query_len)?
{
let mut attribute_candidates_array =
attribute_start_with_docids(ctx, id as u32, query)?;
attribute_start_with_docids(ctx, id, query)?;
attribute_candidates_array.push(attribute_allowed_docids);
candidates |= intersection_of(attribute_candidates_array.iter().collect());
}
Expand All @@ -199,8 +199,7 @@ fn resolve_state(
let mut candidates = RoaringBitmap::new();
let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids {
let attribute_candidates_array =
attribute_start_with_docids(ctx, id as u32, query)?;
let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?;
candidates |= intersection_of(attribute_candidates_array.iter().collect());
}

Expand Down Expand Up @@ -290,13 +289,13 @@ fn resolve_state(

fn attribute_start_with_docids(
ctx: &dyn Context,
attribute_id: u32,
attribute_id: FieldId,
query: &[ExactQueryPart],
) -> heed::Result<Vec<RoaringBitmap>> {
let lowest_level = TreeLevel::min_value();
let mut attribute_candidates_array = Vec::new();
// start from attribute first position
let mut pos = attribute_id * 1000;
let mut pos = absolute_from_relative_position(attribute_id, 0);
for part in query {
use ExactQueryPart::*;
match part {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ use serde_json::Value;

use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
use crate::error::{InternalError, SerializationError};
use crate::proximity::ONE_ATTRIBUTE;
use crate::{FieldId, Result};
use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};

/// Extracts the word and positions where this word appear and
/// prefixes it by the document id.
Expand Down Expand Up @@ -63,18 +62,18 @@ pub fn extract_docid_word_positions<R: io::Read>(
if let Some(field) = json_to_string(&value, &mut field_buffer) {
let analyzed = analyzer.analyze(field);
let tokens = process_tokens(analyzed.tokens())
.take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE);
.take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE);

for (index, token) in tokens {
let token = token.text().trim();
if !token.is_empty() {
key_buffer.truncate(mem::size_of::<u32>());
key_buffer.extend_from_slice(token.as_bytes());

let position: u32 = index
let position: u16 = index
.try_into()
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let position = field_id as u32 * ONE_ATTRIBUTE + position;
let position = absolute_from_relative_position(field_id, position);
docid_word_positions_sorter
.insert(&key_buffer, &position.to_ne_bytes())?;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ use super::helpers::{
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::proximity::extract_position;
use crate::{DocumentId, FieldId, Result};
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};

/// Extracts the field id word count and the documents ids where
/// this field id with this amount of words appear.
Expand Down Expand Up @@ -53,8 +52,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
}

for position in read_u32_ne_bytes(value) {
let (field_id, position) = extract_position(position);
let word_count = position + 1;
let (field_id, position) = relative_from_absolute_position(position);
let word_count = position as u32 + 1;

let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
*value = cmp::max(*value, word_count);
Expand Down
38 changes: 38 additions & 0 deletions milli/src/update/index_documents/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,44 @@ mod tests {
wtxn.commit().unwrap();
}

// Indexes a single document whose `content` field holds one word for every
// value in `0..=u16::MAX` (65,536 words), then checks that words taken from
// the start, the middle and the very end of the field can be found in
// `word_docids`.
#[test]
fn index_more_than_1000_positions_in_a_field() {
    let path = tempfile::tempdir().unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(50 * 1024 * 1024); // 50 MiB
    let index = Index::new(options, &path).unwrap();

    let mut wtxn = index.write_txn().unwrap();

    let mut big_object = HashMap::new();
    big_object.insert(S("id"), "wow");
    // Space-separated decimal numbers "0 1 2 ... 65535": one word per value.
    let content: String =
        (0..=u16::MAX).map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap();
    big_object.insert("content".to_string(), &content);

    let mut cursor = Cursor::new(Vec::new());

    let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
    builder.add_documents(big_object).unwrap();
    builder.finish().unwrap();
    // Rewind so the reader starts from the beginning of the written batch.
    cursor.set_position(0);
    let content = DocumentBatchReader::from_reader(cursor).unwrap();

    let builder = IndexDocuments::new(&mut wtxn, &index, 0);
    builder.execute(content, |_, _| ()).unwrap();

    wtxn.commit().unwrap();

    let mut rtxn = index.read_txn().unwrap();

    // Probe words from across the field, including the last one (65535).
    assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some());
    assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some());
    assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some());
    assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some());
    assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some());
    assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some());
}

#[test]
fn index_documents_with_zeroes() {
let path = tempfile::tempdir().unwrap();
Expand Down

0 comments on commit 131c0c9

Please sign in to comment.