From 16b641c2941aa2097754344bb927c806784623ad Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 9 Sep 2021 14:30:30 +0200 Subject: [PATCH] fix review comments --- .gitignore | 3 - Cargo.lock | 228 ++++++++++++++++++ Cargo.toml | 3 +- cli/Cargo.toml | 1 + cli/src/main.rs | 10 +- http-ui/src/main.rs | 10 +- milli/src/documents/builder.rs | 36 ++- milli/src/documents/mod.rs | 31 +-- milli/src/documents/reader.rs | 22 +- milli/src/documents/serde.rs | 15 +- milli/src/search/distinct/mod.rs | 9 +- milli/src/update/index_documents/mod.rs | 13 +- milli/src/update/index_documents/transform.rs | 24 +- milli/src/update/settings.rs | 2 +- milli/tests/search/mod.rs | 9 +- 15 files changed, 332 insertions(+), 84 deletions(-) diff --git a/.gitignore b/.gitignore index e793156a28..107b5bb36a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,6 @@ /target /Cargo.lock -/cli/target -/cli/Cargo.lock - # datasets *.csv *.mmdb diff --git a/Cargo.lock b/Cargo.lock index 675a5a729b..2e8f31d64a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61f2b7f93d2c7d2b08263acaa4a363b3e276806c68af6134c44f523bf1aacd" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -23,6 +32,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi 0.3.9", +] + [[package]] name = "anyhow" version = "1.0.43" @@ -110,6 +128,21 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +[[package]] +name = "backtrace" +version = "0.3.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7a905d892734eea339e896738c14b9afce22b5318f64b951e70bf3844419b01" +dependencies = [ + "addr2line", + "cc", + "cfg-if 1.0.0", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + [[package]] name = "base64" version = "0.12.3" @@ -249,6 +282,7 @@ version = "4.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063197e6eb4b775b64160dedde7a0986bb2836cce140e9492e9e96f28e18bcd8" dependencies = [ + "serde", "utf8-width", ] @@ -358,10 +392,60 @@ version = "2.33.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" dependencies = [ + "ansi_term", + "atty", "bitflags", + "strsim", "term_size", "textwrap", "unicode-width", + "vec_map", +] + +[[package]] +name = "cli" +version = "0.1.0" +dependencies = [ + "bimap", + "byte-unit", + "color-eyre", + "csv", + "eyre", + "heed", + "indicatif", + "jemallocator", + "milli", + "serde", + "serde_json", + "stderrlog", + "structopt", +] + +[[package]] +name = "color-eyre" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f1885697ee8a177096d42f158922251a41973117f6d8a234cee94b9509157b7" +dependencies = [ + "backtrace", + "color-spantrace", + "eyre", + "indenter", + "once_cell", + "owo-colors", + "tracing-error", +] + +[[package]] +name = "color-spantrace" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6eee477a4a8a72f4addd4de416eb56d54bc307b284d6601bafdee1f4ea462d1" +dependencies = [ + "once_cell", + "owo-colors", + "tracing-core", + "tracing-error", ] [[package]] @@ -375,6 +459,19 @@ dependencies = [ "syn 1.0.75", ] +[[package]] +name = "console" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3993e6445baa160675931ec041a5e03ca84b9c6e32a056150d3aa2bdda0a1f45" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "terminal_size", + "winapi 0.3.9", +] + [[package]] name = "convert_case" version = "0.4.0" @@ -562,6 +659,12 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "encoding_rs" version = "0.8.28" @@ -571,6 +674,16 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "eyre" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "221239d1d5ea86bf5d6f91c9d6bc3646ffe471b08ff9b0f91c44f115ac969d2b" +dependencies = [ + "indenter", + "once_cell", +] + [[package]] name = "fake-simd" version = "0.1.2" @@ -783,6 +896,12 @@ dependencies = [ "wasi 0.10.0+wasi-snapshot-preview1", ] +[[package]] +name = "gimli" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a01e0497841a3b2db4f8afa483cce65f7e96a3498bd6c541734792aeac8fe7" + [[package]] name = "glob" version = "0.3.0" @@ -1128,6 +1247,12 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indenter" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" + [[package]] name = "indexmap" version = "1.7.0" @@ -1138,6 +1263,18 @@ dependencies = [ "hashbrown 0.11.2", ] +[[package]] +name = "indicatif" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" +dependencies = [ + "console", + "lazy_static", + "number_prefix", + "regex", +] + [[package]] name = "infos" version = "0.12.0" @@ -1635,6 +1772,21 @@ dependencies = [ "libc", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "object" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39f37e50073ccad23b6d09bcb5b263f4e76d3bb6038e4a3c08e52162ffa8abc2" +dependencies = [ + "memchr", +] + [[package]] name = "obkv" version = "0.2.0" @@ -1674,6 +1826,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "owo-colors" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2386b4ebe91c2f7f51082d4cefa145d030e33a1842a96b12e4885cc3c01f7a55" + [[package]] name = "page_size" version = "0.4.2" @@ -2175,6 +2333,12 @@ dependencies = [ "retain_mut", ] +[[package]] +name = "rustc-demangle" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" + [[package]] name = "rustc_version" version = "0.4.0" @@ -2346,6 +2510,15 @@ dependencies = [ "opaque-debug 0.3.0", ] +[[package]] +name = "sharded-slab" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" +dependencies = [ + "lazy_static", +] + [[package]] name = "signal-hook-registry" version = "1.4.0" @@ -2441,6 +2614,12 @@ dependencies = [ "thread_local", ] +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + [[package]] name = "structopt" version = "0.3.23" @@ -2547,6 +2726,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "terminal_size" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -2729,9 +2918,21 @@ dependencies = [ "cfg-if 1.0.0", "log", "pin-project-lite 0.2.7", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-attributes" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42e6fa53307c8a17e4ccd4dc81cf5ec38db9209f59b222210375b54ee40d1e2" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.75", +] + [[package]] name = "tracing-core" version = "0.1.19" @@ -2741,6 +2942,16 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "tracing-error" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4d7c0b83d4a500748fa5879461652b361edf5c9d51ede2a2ac03875ca185e24" +dependencies = [ + "tracing", + "tracing-subscriber", +] + [[package]] name = "tracing-futures" version = "0.2.5" @@ -2751,6 +2962,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "tracing-subscriber" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9cbe87a2fa7e35900ce5de20220a582a9483a7063811defce79d7cbd59d4cfe" +dependencies = [ + "sharded-slab", + "thread_local", + "tracing-core", +] + [[package]] name = "try-lock" version = "0.2.3" @@ -2900,6 +3122,12 @@ dependencies = [ "getrandom 0.2.3", ] +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "version_check" version = "0.9.3" diff --git a/Cargo.toml b/Cargo.toml index 46b759b8bd..b78989f505 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,6 @@ [workspace] -members = ["milli", "http-ui", "benchmarks", "infos", "helpers"] +members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "cli"] default-members = ["milli"] -exclude = ["cli"] [profile.dev] opt-level = 3 diff --git a/cli/Cargo.toml b/cli/Cargo.toml index ebb98344b1..24fb214b9c 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -2,6 +2,7 @@ name = "cli" version = "0.1.0" edition = "2018" +description = "A CLI to interact with a milli index" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/cli/src/main.rs b/cli/src/main.rs index 9d766c2f94..d38b1fc3be 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -86,7 +86,7 @@ impl FromStr for DocumentAdditionFormat { #[derive(Debug, StructOpt)] struct DocumentAddition { - #[structopt(short, long, default_value = "json")] + #[structopt(short, long, default_value = "json", possible_values = &["csv", "jsonl", "json"])] format: DocumentAdditionFormat, /// Path to the update file, if not present, will read from stdin. #[structopt(short, long)] @@ -117,7 +117,7 @@ impl DocumentAddition { DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, }; - let reader = milli::documents::DocumentsReader::from_reader(Cursor::new(documents))?; + let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?; println!("Adding {} documents to the index.", reader.len()); @@ -200,7 +200,7 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa fn documents_from_jsonl(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = - milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?; + milli::documents::DocumentBatchBuilder::new(&mut writer)?; let values = serde_json::Deserializer::from_reader(reader) .into_iter::>(); @@ -216,7 +216,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { fn documents_from_json(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = - milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?; + milli::documents::DocumentBatchBuilder::new(&mut writer)?; let json: serde_json::Value = serde_json::from_reader(reader)?; documents.add_documents(json)?; @@ -228,7 +228,7 @@ fn documents_from_json(reader: impl Read) -> Result> { fn documents_from_csv(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = - milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?; + milli::documents::DocumentBatchBuilder::new(&mut writer)?; let mut records = csv::Reader::from_reader(reader); let iter = records.deserialize::>(); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 4ad9595ff1..7482841ce3 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -19,7 +19,7 @@ use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; -use milli::documents::DocumentsReader; +use milli::documents::DocumentBatchReader; use milli::update::UpdateIndexingStep::*; use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder}; use milli::{obkv_to_json, CompressionType, FilterCondition, Index, MatchingWords, SearchResult}; @@ -377,7 +377,7 @@ async fn main() -> anyhow::Result<()> { otherwise => panic!("invalid update format {:?}", otherwise), }; - let documents = DocumentsReader::from_reader(Cursor::new(documents))?; + let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; let result = builder.execute(documents, |indexing_step, update_id| { let (current, total) = match indexing_step { @@ -1020,7 +1020,7 @@ async fn main() -> anyhow::Result<()> { fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = - milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?; + milli::documents::DocumentBatchBuilder::new(&mut writer)?; let values = serde_json::Deserializer::from_reader(reader) .into_iter::>(); @@ -1036,7 +1036,7 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = - milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?; + milli::documents::DocumentBatchBuilder::new(&mut writer)?; let json: serde_json::Value = serde_json::from_reader(reader)?; documents.add_documents(json)?; @@ -1048,7 +1048,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = - milli::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new())?; + milli::documents::DocumentBatchBuilder::new(&mut writer)?; let mut records = csv::Reader::from_reader(reader); let iter = records.deserialize::>(); diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index ad23715184..ba1319eff6 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,25 +1,41 @@ use std::io; -use bimap::BiHashMap; use byteorder::{BigEndian, WriteBytesExt}; use serde::ser::Serialize; -use super::serde::DocumentsSerilializer; -use super::{ByteCounter, DocumentsMetadata, Error}; -use crate::FieldId; +use super::serde::DocumentSerializer; +use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; -pub struct DocumentsBuilder { - serializer: DocumentsSerilializer, +/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary +/// format used by milli. +/// +/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to +/// iterate other the documents. +/// +/// ## example: +/// ``` +/// use milli::documents::DocumentBatchBuilder; +/// use serde_json::json; +/// use std::io::Cursor; +/// +/// let mut writer = Cursor::new(Vec::new()); +/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); +/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap(); +/// builder.finish().unwrap(); +/// ``` +pub struct DocumentBatchBuilder { + serializer: DocumentSerializer, } -impl DocumentsBuilder { - pub fn new(writer: W, index: BiHashMap) -> Result { +impl DocumentBatchBuilder { + pub fn new(writer: W) -> Result { + let index = DocumentsBatchIndex::new(); let mut writer = ByteCounter::new(writer); // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; let serializer = - DocumentsSerilializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true }; + DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true }; Ok(Self { serializer }) } @@ -33,7 +49,7 @@ impl DocumentsBuilder { /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. pub fn finish(self) -> Result<(), Error> { - let DocumentsSerilializer { + let DocumentSerializer { writer: ByteCounter { mut writer, count: offset }, index, count, diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 1fa7884d49..f74bb1c7fc 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -2,8 +2,8 @@ mod builder; /// The documents module defines an intermediary document format that milli uses for indexation, and /// provides an API to easily build and read such documents. /// -/// The `DocumentBuilder` interface allows to write batches of documents to a writer, that can -/// later be read by milli using the `DocumentsReader` interface. +/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can +/// later be read by milli using the `DocumentBatchReader` interface. mod reader; mod serde; @@ -11,15 +11,18 @@ use std::{fmt, io}; use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; -pub use builder::DocumentsBuilder; -pub use reader::DocumentsReader; +pub use builder::DocumentBatchBuilder; +pub use reader::DocumentBatchReader; use crate::FieldId; +/// A bidirectional map that links field ids to their name in a document batch. +pub type DocumentsBatchIndex = BiHashMap; + #[derive(Debug, Serialize, Deserialize)] struct DocumentsMetadata { count: usize, - index: BiHashMap, + index: DocumentsBatchIndex, } pub struct ByteCounter { @@ -89,13 +92,13 @@ macro_rules! documents { let documents = serde_json::json!($data); let mut writer = std::io::Cursor::new(Vec::new()); let mut builder = - crate::documents::DocumentsBuilder::new(&mut writer, bimap::BiHashMap::new()).unwrap(); + crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); builder.add_documents(documents).unwrap(); builder.finish().unwrap(); writer.set_position(0); - crate::documents::DocumentsReader::from_reader(writer).unwrap() + crate::documents::DocumentBatchReader::from_reader(writer).unwrap() }}; } @@ -120,14 +123,14 @@ mod test { let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); - let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); builder.add_documents(json).unwrap(); builder.finish().unwrap(); let mut documents = - DocumentsReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); + DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); assert_eq!(documents.index().iter().count(), 5); @@ -149,7 +152,7 @@ mod test { let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); - let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); builder.add_documents(doc1).unwrap(); builder.add_documents(doc2).unwrap(); @@ -157,7 +160,7 @@ mod test { builder.finish().unwrap(); let mut documents = - DocumentsReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); + DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); assert_eq!(documents.index().iter().count(), 2); @@ -178,14 +181,14 @@ mod test { let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); - let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); builder.add_documents(docs).unwrap(); builder.finish().unwrap(); let mut documents = - DocumentsReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); + DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); assert_eq!(documents.index().iter().count(), 2); @@ -201,7 +204,7 @@ mod test { let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); - let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let docs = json!([[ { "toto": false }, diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 1f41134542..14d7c8cebb 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -2,22 +2,26 @@ use std::io; use std::io::{BufReader, Read}; use std::mem::size_of; -use bimap::BiHashMap; use byteorder::{BigEndian, ReadBytesExt}; use obkv::KvReader; -use super::{DocumentsMetadata, Error}; +use super::{DocumentsBatchIndex, DocumentsMetadata, Error}; use crate::FieldId; -pub struct DocumentsReader { +/// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with +/// a `DocumentsBatchWriter`. +/// +/// The documents are returned in the form of `obkv::Reader` where each field is identified with a +/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. +pub struct DocumentBatchReader { reader: BufReader, metadata: DocumentsMetadata, buffer: Vec, seen_documents: usize, } -impl DocumentsReader { - /// Construct a DocumentsReader from a reader. +impl DocumentBatchReader { + /// Construct a `DocumentsReader` from a reader. /// /// It first retrieves the index, then moves to the first document. Subsequent calls to /// `next_document` advance the document reader until all the documents have been read. @@ -41,7 +45,7 @@ impl DocumentsReader { /// reference to the addition index. pub fn next_document_with_index<'a>( &'a mut self, - ) -> io::Result, KvReader<'a, FieldId>)>> { + ) -> io::Result)>> { if self.seen_documents < self.metadata.count { let doc_len = self.reader.read_u32::()?; self.buffer.resize(doc_len as usize, 0); @@ -56,7 +60,7 @@ impl DocumentsReader { } /// Return the fields index for the documents batch. - pub fn index(&self) -> &BiHashMap { + pub fn index(&self) -> &DocumentsBatchIndex { &self.metadata.index } @@ -64,4 +68,8 @@ impl DocumentsReader { pub fn len(&self) -> usize { self.metadata.count } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } } diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index d513d38407..cd5617c52c 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -1,23 +1,22 @@ use std::convert::TryInto; use std::{fmt, io}; -use bimap::BiHashMap; use byteorder::{BigEndian, WriteBytesExt}; use obkv::KvWriter; use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer}; -use super::{ByteCounter, Error}; +use super::{ByteCounter, DocumentsBatchIndex, Error}; use crate::FieldId; -pub struct DocumentsSerilializer { +pub struct DocumentSerializer { pub writer: ByteCounter, pub buffer: Vec, - pub index: BiHashMap, + pub index: DocumentsBatchIndex, pub count: usize, pub allow_seq: bool, } -impl<'a, W: io::Write> Serializer for &'a mut DocumentsSerilializer { +impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer { type Ok = (); type Error = Error; @@ -203,7 +202,7 @@ impl<'a, W: io::Write> Serializer for &'a mut DocumentsSerilializer { } pub struct SeqSerializer<'a, W> { - serializer: &'a mut DocumentsSerilializer, + serializer: &'a mut DocumentSerializer, } impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> { @@ -225,7 +224,7 @@ impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> { pub struct MapSerializer<'a, W> { map: KvWriter>, FieldId>, - index: &'a mut BiHashMap, + index: &'a mut DocumentsBatchIndex, writer: W, buffer: Vec, } @@ -277,7 +276,7 @@ impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { } struct FieldSerializer<'a> { - index: &'a mut BiHashMap, + index: &'a mut DocumentsBatchIndex, } impl<'a> serde::Serializer for FieldSerializer<'a> { diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 6010a4410e..3020c091fa 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -29,14 +29,13 @@ mod test { use std::collections::HashSet; use std::io::Cursor; - use bimap::BiHashMap; use once_cell::sync::Lazy; use rand::seq::SliceRandom; use rand::Rng; use roaring::RoaringBitmap; use serde_json::{json, Value}; - use crate::documents::{DocumentsBuilder, DocumentsReader}; + use crate::documents::{DocumentBatchReader, DocumentBatchBuilder}; use crate::index::tests::TempIndex; use crate::index::Index; use crate::update::{IndexDocumentsMethod, UpdateBuilder}; @@ -49,7 +48,7 @@ mod test { let num_docs = rng.gen_range(10..30); let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -93,13 +92,13 @@ mod test { let mut addition = builder.index_documents(&mut txn, &index); addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = crate::documents::DocumentsReader::from_reader(Cursor::new(&*JSON)).unwrap(); + let reader = crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); addition.execute(reader, |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); - let documents = DocumentsReader::from_reader(Cursor::new(&*JSON)).unwrap(); + let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); let map = (0..documents.len() as u32).collect(); txn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b5b5abdbd1..c240432a83 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -25,7 +25,7 @@ pub use self::helpers::{ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::{Index, Result}; -use crate::documents::DocumentsReader; +use crate::documents::DocumentBatchReader; use crate::update::{ Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordsLevelPositions, WordsPrefixesFst, @@ -126,7 +126,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { #[logging_timer::time("IndexDocuments::{}")] pub fn execute( self, - reader: DocumentsReader, + reader: DocumentBatchReader, progress_callback: F, ) -> Result where @@ -134,7 +134,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { F: Fn(UpdateIndexingStep, u64) + Sync, { // Early return when there is no document to add - if reader.len() == 0 { + if reader.is_empty() { return Ok(DocumentAdditionResult { nb_documents: 0 }); } @@ -421,11 +421,10 @@ mod tests { use std::io::Cursor; use big_s::S; - use bimap::BiHashMap; use heed::EnvOpenOptions; use super::*; - use crate::documents::DocumentsBuilder; + use crate::documents::{DocumentBatchBuilder}; use crate::update::DeleteDocuments; use crate::HashMap; @@ -856,11 +855,11 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentsBuilder::new(&mut cursor, BiHashMap::new()).unwrap(); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); builder.add_documents(big_object).unwrap(); builder.finish().unwrap(); cursor.set_position(0); - let content = DocumentsReader::from_reader(cursor).unwrap(); + let content = DocumentBatchReader::from_reader(cursor).unwrap(); let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index beac58b3e9..c58408f160 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -19,7 +19,7 @@ use super::IndexDocumentsMethod; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::documents::DocumentsReader; +use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32}; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -53,22 +53,22 @@ pub struct Transform<'t, 'i> { pub autogenerate_docids: bool, } -/// Create the mapping between the field ids present in a document addition and the one in the -/// current fields id map. +/// Create a mapping between the field ids found in the document batch and the one that were +/// already present in the index. /// -/// If new fields are present in the addition, they are added to the provided field ids map. +/// If new fields are present in the addition, they are added to the index field ids map. fn create_fields_mapping( - fields_id_map: &mut FieldsIdsMap, - addition_fields_map: &bimap::BiHashMap, + index_field_map: &mut FieldsIdsMap, + batch_field_map: &DocumentsBatchIndex, ) -> Result> { - addition_fields_map + batch_field_map .iter() // we sort by id here to ensure a deterministic mapping of the fields, that preserves // the original ordering. .sorted_by_key(|(&id, _)| id) - .map(|(field, name)| match fields_id_map.id(&name) { + .map(|(field, name)| match index_field_map.id(&name) { Some(id) => Ok((*field, id)), - None => fields_id_map + None => index_field_map .insert(&name) .ok_or(Error::UserError(UserError::AttributeLimitReached)) .map(|id| (*field, id)), @@ -86,7 +86,7 @@ fn find_primary_key(index: &bimap::BiHashMap) -> Option<&str> { impl Transform<'_, '_> { pub fn read_documents( self, - mut reader: DocumentsReader, + mut reader: DocumentBatchReader, progress_callback: F, ) -> Result where @@ -146,7 +146,7 @@ impl Transform<'_, '_> { // We need to make sure that every document has a primary key. After we have remapped // all the fields in the document, we try to find the primary key value. If we can find - // it, transform it into a string and validate it, and then update it in the the + // it, transform it into a string and validate it, and then update it in the // document. If none is found, and we were told to generate missing document ids, then // we create the missing field, and update the new document. let external_id = match field_buffer.iter_mut().find(|(id, _)| *id == primary_key_id) { @@ -201,7 +201,7 @@ impl Transform<'_, '_> { // fieldids map keys order. field_buffer.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2)); - // The last step is to build the new new obkv document, and insert it in the sorter. + // The last step is to build the new obkv document, and insert it in the sorter. let mut writer = obkv::KvWriter::new(&mut obkv_buffer); for (k, v) in field_buffer.iter() { writer.insert(*k, v)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5ccbacde24..4aa79f6e35 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -722,7 +722,7 @@ mod tests { let count = index .facet_id_f64_docids .remap_key_type::() - // The faceted field id is 2u16 + // The faceted field id is 1u16 .prefix_iter(&rtxn, &[0, 1, 0]) .unwrap() .count(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 0bea481c09..af8b31e97e 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -6,9 +6,8 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::update::{IndexDocuments, Settings, UpdateBuilder}; -use milli::documents::{DocumentsBuilder, DocumentsReader}; -use milli::update::{IndexDocuments, Settings}; +use milli::update::{Settings, UpdateBuilder}; +use milli::documents::{DocumentBatchReader, DocumentBatchBuilder}; use milli::{AscDesc, Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -59,7 +58,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.enable_autogenerate_docids(); let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = - DocumentsBuilder::new(&mut cursor, bimap::BiHashMap::new()).unwrap(); + DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { documents_builder.add_documents(doc.unwrap()).unwrap(); @@ -69,7 +68,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { cursor.set_position(0); // index documents - let content = DocumentsReader::from_reader(cursor).unwrap(); + let content = DocumentBatchReader::from_reader(cursor).unwrap(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap();