Skip to content

Commit

Permalink
rustdoc-search: compressed bitmap to sort, then load desc
Browse files Browse the repository at this point in the history
This adds a bit more data than "pure sharding" by
including information about which items have no description
at all. This way, it can sort the results, then truncate,
then finally download the description.

With the "e" bitmap: 2380KiB

Without the "e" bitmap: 2364KiB
  • Loading branch information
notriddle committed Mar 22, 2024
1 parent e860b9c commit 28db4cc
Show file tree
Hide file tree
Showing 5 changed files with 395 additions and 37 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4741,6 +4741,8 @@ version = "0.0.0"
dependencies = [
"arrayvec",
"askama",
"base64",
"byteorder",
"expect-test",
"indexmap",
"itertools 0.12.1",
Expand Down
2 changes: 2 additions & 0 deletions src/librustdoc/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ path = "lib.rs"
[dependencies]
arrayvec = { version = "0.7", default-features = false }
askama = { version = "0.12", default-features = false, features = ["config"] }
base64 = "0.21.7"
byteorder = "1.5"
itertools = "0.12"
indexmap = "2"
minifier = "0.3.0"
Expand Down
233 changes: 226 additions & 7 deletions src/librustdoc/html/render/search_index.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, VecDeque};

use base64::prelude::*;
use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
use rustc_middle::ty::TyCtxt;
use rustc_span::def_id::DefId;
Expand All @@ -21,14 +22,14 @@ use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, Re
///
/// The `index` is a JSON-encoded list of names and other information.
///
/// The desc has newlined descriptions, split up by size into 1MiB shards.
/// The desc has newlined descriptions, split up by size into 128KiB shards.
/// For example, `(4, "foo\nbar\nbaz\nquux")`.
pub(crate) struct SerializedSearchIndex {
pub(crate) index: String,
pub(crate) desc: Vec<(usize, String)>,
}

const DESC_INDEX_SHARD_LEN: usize = 1024 * 1024;
const DESC_INDEX_SHARD_LEN: usize = 128 * 1024;

/// Builds the search index from the collected metadata
pub(crate) fn build_index<'tcx>(
Expand Down Expand Up @@ -342,6 +343,8 @@ pub(crate) fn build_index<'tcx>(
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
// for information on the format.
descindex: String,
// A list of items with no description. This is eventually turned into a bitmap.
emptydesc: Vec<u32>,
}

struct Paths {
Expand Down Expand Up @@ -456,7 +459,8 @@ pub(crate) fn build_index<'tcx>(
}

if item.deprecation.is_some() {
deprecated.push(index);
// bitmasks always use 1-indexing for items, with 0 as the crate itself
deprecated.push(u32::try_from(index + 1).unwrap());
}
}

Expand All @@ -473,21 +477,37 @@ pub(crate) fn build_index<'tcx>(
crate_data.serialize_field("i", &parents)?;
crate_data.serialize_field("f", &functions)?;
crate_data.serialize_field("D", &self.descindex)?;
crate_data.serialize_field("c", &deprecated)?;
crate_data.serialize_field("p", &paths)?;
crate_data.serialize_field("b", &self.associated_item_disambiguators)?;
let mut buf = Vec::new();
let mut strbuf = String::new();
write_bitmap_to_bytes(&deprecated, &mut buf).unwrap();
BASE64_STANDARD.encode_string(&buf, &mut strbuf);
crate_data.serialize_field("c", &strbuf)?;
strbuf.clear();
buf.clear();
write_bitmap_to_bytes(&self.emptydesc, &mut buf).unwrap();
BASE64_STANDARD.encode_string(&buf, &mut strbuf);
crate_data.serialize_field("e", &strbuf)?;
if has_aliases {
crate_data.serialize_field("a", &self.aliases)?;
}
crate_data.end()
}
}

let desc = {
let (emptydesc, desc) = {
let mut emptydesc = Vec::new();
let mut result = Vec::new();
let mut set = String::new();
let mut len: usize = 0;
let mut itemindex: u32 = 0;
for desc in std::iter::once(&crate_doc).chain(crate_items.iter().map(|item| &item.desc)) {
if desc == "" {
emptydesc.push(itemindex);
itemindex += 1;
continue;
}
if set.len() >= DESC_INDEX_SHARD_LEN {
result.push((len, std::mem::replace(&mut set, String::new())));
len = 0;
Expand All @@ -496,9 +516,10 @@ pub(crate) fn build_index<'tcx>(
}
set.push_str(&desc);
len += 1;
itemindex += 1;
}
result.push((len, std::mem::replace(&mut set, String::new())));
result
(emptydesc, result)
};

let descindex = {
Expand All @@ -509,7 +530,10 @@ pub(crate) fn build_index<'tcx>(
descindex
};

assert_eq!(crate_items.len() + 1, desc.iter().map(|(len, _)| *len).sum::<usize>());
assert_eq!(
crate_items.len() + 1,
desc.iter().map(|(len, _)| *len).sum::<usize>() + emptydesc.len()
);

// The index, which is actually used to search, is JSON
// It uses `JSON.parse(..)` to actually load, since JSON
Expand All @@ -523,6 +547,7 @@ pub(crate) fn build_index<'tcx>(
aliases: &aliases,
associated_item_disambiguators: &associated_item_disambiguators,
descindex,
emptydesc,
})
.expect("failed serde conversion")
// All these `replace` calls are because we have to go through JS string for JSON content.
Expand Down Expand Up @@ -571,6 +596,200 @@ pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
}
}

// checked against roaring-rs in
// https://gitlab.com/notriddle/roaring-test
pub fn write_bitmap_to_bytes(domain: &[u32], mut out: impl std::io::Write) -> std::io::Result<()> {
// https://arxiv.org/pdf/1603.06549.pdf
let mut keys = Vec::<u16>::new();
let mut containers = Vec::<Container>::new();
enum Container {
/// number of ones, bits
Bits(Box<[u64; 1024]>),
/// list of entries
Array(Vec<u16>),
/// list of (start, len-1)
Run(Vec<(u16, u16)>),
}
impl Container {
fn popcount(&self) -> u32 {
match self {
Container::Bits(bits) => bits.iter().copied().map(|x| x.count_ones()).sum(),
Container::Array(array) => {
array.len().try_into().expect("array can't be bigger than 2**32")
}
Container::Run(runs) => {
runs.iter().copied().map(|(_, lenm1)| u32::from(lenm1) + 1).sum()
}
}
}
fn push(&mut self, value: u16) {
match self {
Container::Bits(bits) => bits[value as usize >> 6] |= 1 << (value & 0x3F),
Container::Array(array) => {
array.push(value);
if array.len() >= 4096 {
let array = std::mem::replace(array, Vec::new());
*self = Container::Bits(Box::new([0; 1024]));
for value in array {
self.push(value);
}
}
}
Container::Run(runs) => {
if let Some(r) = runs.last_mut()
&& r.0 + r.1 + 1 == value
{
r.1 += 1;
} else {
runs.push((value, 0));
}
}
}
}
fn try_make_run(&mut self) -> bool {
match self {
Container::Bits(bits) => {
let mut r: u64 = 0;
for (i, chunk) in bits.iter().copied().enumerate() {
let next_chunk =
i.checked_add(1).and_then(|i| bits.get(i)).copied().unwrap_or(0);
r += !chunk & u64::from((chunk << 1).count_ones());
r += !next_chunk & u64::from((chunk >> 63).count_ones());
}
if (2 + 4 * r) < 8192 {
let bits = std::mem::replace(bits, Box::new([0; 1024]));
*self = Container::Run(Vec::new());
for (i, bits) in bits.iter().copied().enumerate() {
if bits == 0 {
continue;
}
for j in 0..64 {
let value = (u16::try_from(i).unwrap() << 6) | j;
if bits & (1 << j) != 0 {
self.push(value);
}
}
}
true
} else {
false
}
}
Container::Array(array) if array.len() <= 5 => false,
Container::Array(array) => {
let mut r = 0;
let mut prev = None;
for value in array.iter().copied() {
if value.checked_sub(1) != prev {
r += 1;
}
prev = Some(value);
}
if 2 + 4 * r < 2 * array.len() + 2 {
let array = std::mem::replace(array, Vec::new());
*self = Container::Run(Vec::new());
for value in array {
self.push(value);
}
true
} else {
false
}
}
Container::Run(_) => true,
}
}
}
let mut key: u16;
let mut domain_iter = domain.into_iter().copied().peekable();
let mut has_run = false;
while let Some(entry) = domain_iter.next() {
key = (entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit");
let value: u16 = (entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit");
let mut container = Container::Array(vec![value]);
while let Some(entry) = domain_iter.peek().copied() {
let entry_key: u16 =
(entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit");
if entry_key != key {
break;
}
domain_iter.next().expect("peeking just succeeded");
container
.push((entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit"));
}
keys.push(key);
has_run = container.try_make_run() || has_run;
containers.push(container);
}
// https://github.com/RoaringBitmap/RoaringFormatSpec
use byteorder::{WriteBytesExt, LE};
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u32 = 12347;
const NO_OFFSET_THRESHOLD: u32 = 4;
let size: u32 = containers.len().try_into().unwrap();
let start_offset = if has_run {
out.write_u32::<LE>(SERIAL_COOKIE | ((size - 1) << 16))?;
for set in containers.chunks(8) {
let mut b = 0;
for (i, container) in set.iter().enumerate() {
if matches!(container, &Container::Run(..)) {
b |= 1 << i;
}
}
out.write_u8(b)?;
}
if size < NO_OFFSET_THRESHOLD {
4 + 4 * size + ((size + 7) / 8)
} else {
4 + 8 * size + ((size + 7) / 8)
}
} else {
out.write_u32::<LE>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
out.write_u32::<LE>(containers.len().try_into().unwrap())?;
4 + 4 + 4 * size + 4 * size
};
for (&key, container) in keys.iter().zip(&containers) {
// descriptive header
let key: u32 = key.into();
let count: u32 = container.popcount() - 1;
out.write_u32::<LE>((count << 16) | key)?;
}
if !has_run || size >= NO_OFFSET_THRESHOLD {
// offset header
let mut starting_offset = start_offset;
for container in &containers {
out.write_u32::<LE>(starting_offset)?;
starting_offset += match container {
Container::Bits(_) => 8192u32,
Container::Array(array) => u32::try_from(array.len()).unwrap() * 2,
Container::Run(runs) => 2 + u32::try_from(runs.len()).unwrap() * 4,
};
}
}
for container in &containers {
match container {
Container::Bits(bits) => {
for chunk in bits.iter() {
out.write_u64::<LE>(*chunk)?;
}
}
Container::Array(array) => {
for value in array.iter() {
out.write_u16::<LE>(*value)?;
}
}
Container::Run(runs) => {
out.write_u16::<LE>((runs.len()).try_into().unwrap())?;
for (start, lenm1) in runs.iter().copied() {
out.write_u16::<LE>(start)?;
out.write_u16::<LE>(lenm1)?;
}
}
}
}
Ok(())
}

pub(crate) fn get_function_type_for_search<'tcx>(
item: &clean::Item,
tcx: TyCtxt<'tcx>,
Expand Down
Loading

0 comments on commit 28db4cc

Please sign in to comment.