rustdoc-search: shard the search result descriptions #122614

Merged · 7 commits · Apr 2, 2024

Changes from all commits
2 changes: 2 additions & 0 deletions Cargo.lock
@@ -4741,6 +4741,8 @@ version = "0.0.0"
dependencies = [
"arrayvec",
"askama",
"base64",
"byteorder",
"expect-test",
"indexmap",
"itertools 0.12.1",
2 changes: 1 addition & 1 deletion src/ci/docker/host-x86_64/mingw-check/Dockerfile
@@ -56,7 +56,7 @@ ENV SCRIPT python3 ../x.py --stage 2 test src/tools/expand-yaml-anchors && \
/scripts/validate-error-codes.sh && \
reuse --include-submodules lint && \
# Runs checks to ensure that there are no ES5 issues in our JS code.
es-check es6 ../src/librustdoc/html/static/js/*.js && \
es-check es8 ../src/librustdoc/html/static/js/*.js && \
eslint -c ../src/librustdoc/html/static/.eslintrc.js ../src/librustdoc/html/static/js/*.js && \
eslint -c ../src/tools/rustdoc-js/.eslintrc.js ../src/tools/rustdoc-js/tester.js && \
eslint -c ../src/tools/rustdoc-gui/.eslintrc.js ../src/tools/rustdoc-gui/tester.js
2 changes: 2 additions & 0 deletions src/librustdoc/Cargo.toml
@@ -9,6 +9,8 @@ path = "lib.rs"
[dependencies]
arrayvec = { version = "0.7", default-features = false }
askama = { version = "0.12", default-features = false, features = ["config"] }
base64 = "0.21.7"
byteorder = "1.5"
itertools = "0.12"
indexmap = "2"
minifier = "0.3.0"
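The two new dependencies line up with the new `search_index::encode` module used later in this diff: its `bitmap_to_string` helper turns the deprecated-item and empty-description ID lists into compact strings, and base64 plus byteorder are presumably what it uses under the hood (the encode module itself is not shown here). Purely as an illustration, a helper in that spirit might look like this; the actual on-disk bitmap layout rustdoc uses may be a compressed bitmap rather than a plain ID list:

```rust
use base64::Engine as _;
use byteorder::{LittleEndian, WriteBytesExt};

// Hypothetical stand-in for encode::bitmap_to_string: serialize each set ID as
// a little-endian u32 and base64-encode the byte buffer.
fn bitmap_to_string_sketch(ids: &[u32]) -> String {
    let mut bytes = Vec::with_capacity(ids.len() * 4);
    for &id in ids {
        bytes.write_u32::<LittleEndian>(id).expect("writing to a Vec cannot fail");
    }
    base64::engine::general_purpose::STANDARD.encode(&bytes)
}

fn main() {
    // 1-indexed item IDs, matching how the `deprecated` list is built below.
    println!("{}", bitmap_to_string_sketch(&[1, 5, 9]));
}
```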
33 changes: 4 additions & 29 deletions src/librustdoc/html/render/mod.rs
@@ -184,40 +184,15 @@ pub(crate) enum RenderTypeId {

impl RenderTypeId {
pub fn write_to_string(&self, string: &mut String) {
// (sign, value)
let (sign, id): (bool, u32) = match &self {
let id: i32 = match &self {
// 0 is a sentinel, everything else is one-indexed
// concrete type
RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()),
RenderTypeId::Index(idx) if *idx >= 0 => (idx + 1isize).try_into().unwrap(),
// generic type parameter
RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()),
RenderTypeId::Index(idx) => (*idx).try_into().unwrap(),
_ => panic!("must convert render types to indexes before serializing"),
};
// zig-zag encoding
let value: u32 = (id << 1) | (if sign { 1 } else { 0 });
// Self-terminating hex use capital letters for everything but the
// least significant digit, which is lowercase. For example, decimal 17
// would be `` Aa `` if zig-zag encoding weren't used.
//
// Zig-zag encoding, however, stores the sign bit as the last bit.
// This means, in the last hexit, 1 is actually `c`, -1 is `b`
// (`a` is the imaginary -0), and, because all the bits are shifted
// by one, `` A` `` is actually 8 and `` Aa `` is -8.
//
// https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html
// describes the encoding in more detail.
let mut shift: u32 = 28;
let mut mask: u32 = 0xF0_00_00_00;
while shift < 32 {
let hexit = (value & mask) >> shift;
if hexit != 0 || shift == 0 {
let hex =
char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
string.push(hex);
}
shift = shift.wrapping_sub(4);
mask = mask >> 4;
}
search_index::encode::write_vlqhex_to_string(id, string);
}
}

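The deleted comments above describe the encoding that now lives in `search_index::encode::write_vlqhex_to_string`: zig-zag the sign into the low bit, then emit each hex digit as a letter, using an uppercase range starting at `@` for all but the last digit and a lowercase range starting at the backtick character for the final one, so every number is self-terminating. The encode module is not part of this section, so the following is only a sketch reconstructing that description (in particular, the sign-bit polarity and the handling of interior zero hexits are assumptions):

```rust
// Sketch of a self-terminating VLQ-hex encoder matching the comments removed
// above; not necessarily identical to rustdoc's encode::write_vlqhex_to_string.
fn write_vlqhex_to_string(n: i32, string: &mut String) {
    // Zig-zag: shift the magnitude left by one and store the sign in the low
    // bit (assumed polarity: negative numbers set the bit).
    let value: u32 = if n >= 0 { (n as u32) << 1 } else { ((-n) as u32) << 1 | 1 };
    let mut shift: u32 = 28;
    let mut started = false;
    while shift < 32 {
        let hexit = ((value >> shift) & 0xF) as u8;
        // Skip leading zero hexits, but always emit the final (lowercase) one.
        if hexit != 0 || started || shift == 0 {
            started = true;
            let base = if shift == 0 { b'`' } else { b'@' };
            string.push(char::from(base + hexit));
        }
        shift = shift.wrapping_sub(4);
    }
}

fn main() {
    let mut s = String::new();
    write_vlqhex_to_string(17, &mut s);
    // 17 zig-zags to 0x22, so this prints "Bb"; without zig-zag it would be
    // the "Aa" mentioned in the removed comment.
    println!("{s}");
}
```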
107 changes: 94 additions & 13 deletions src/librustdoc/html/render/search_index.rs
@@ -1,3 +1,5 @@
pub(crate) mod encode;

use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, VecDeque};

@@ -17,12 +19,46 @@ use crate::html::format::join_with_double_colon;
use crate::html::markdown::short_markdown_summary;
use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, RenderTypeId};

use encode::{bitmap_to_string, write_vlqhex_to_string};

/// The serialized search index, with the item descriptions sharded out of it.
///
/// The `index` field is a JSON-encoded list of names and other information.
///
/// The `desc` field holds newline-separated descriptions, split by size into 128KiB shards.
Review comment (Member):

Would be nice if you added some explanation of why 128KiB was picked and not another size.

Reply (Contributor, PR author):
There is no single, optimal size for these shards, because it depends on
configuration values that we can't predict or control, such as the version
of HTTP used (HTTP/1.1 would work better with larger files, while HTTP/2
and 3 are more agnostic), transport compression (gzip, zstd, etc), whether
the search query is going to produce a large number of results or a small
number, the bandwidth delay product of the network...

Gzipping some standard library descriptions to guess what transport
compression will do, the compressed file sizes can be as small as 4.9KiB
or as large as 18KiB (ignoring the final 1.9KiB shard of leftovers).
A "reasonable" range for files is for them to be bigger than 1KiB,
since that's about the amount of data that can be transferred in a
single TCP packet, and 64KiB, the maximum amount of data that
TCP can transfer in a single round trip without extensions.

/// For example, `(4, "foo\nbar\nbaz\nquux")`.
///
/// There is no single optimal size for these shards, because it depends on
/// configuration values that we can't predict or control, such as the version
/// of HTTP used (HTTP/1.1 works better with larger files, while HTTP/2
/// and 3 are more agnostic), transport compression (gzip, zstd, etc.), whether
/// the search query is going to produce a large number of results or a small
/// number, the bandwidth-delay product of the network...
///
/// Gzipping some standard library descriptions to guess what transport
/// compression will do, the compressed file sizes can be as small as 4.9KiB
/// or as large as 18KiB (ignoring the final 1.9KiB shard of leftovers).
/// A "reasonable" size range runs from about 1KiB, since that's roughly the
/// amount of data that can be transferred in a single TCP packet [1], up to
/// 64KiB, the maximum amount of data that TCP can transfer in a single round
/// trip without extensions [2][3].
///
/// [1]: https://en.wikipedia.org/wiki/Maximum_transmission_unit#MTUs_for_common_media
/// [2]: https://en.wikipedia.org/wiki/Sliding_window_protocol#Basic_concept
/// [3]: https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/description-tcp-features
pub(crate) struct SerializedSearchIndex {
pub(crate) index: String,
(A review conversation on this line was marked as resolved by GuillaumeGomez.)
pub(crate) desc: Vec<(usize, String)>,
}

const DESC_INDEX_SHARD_LEN: usize = 128 * 1024;
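To make the shard format concrete, here is a small worked example (a sketch with made-up descriptions, assuming everything fits into a single shard well under the 128KiB limit):

```rust
fn main() {
    // Descriptions are visited in order: the crate docs first (index 0), then
    // each item (1-indexed). Empty descriptions never occupy a line in a shard;
    // their indices go into `empty_desc` instead.
    //
    //   0: "Top-level crate docs"   1: ""   2: "First item"   3: "Second item"
    let desc: Vec<(usize, String)> = vec![
        // (number of descriptions in this shard, newline-joined descriptions)
        (3, "Top-level crate docs\nFirst item\nSecond item".to_string()),
    ];
    let empty_desc: Vec<u32> = vec![1]; // entry 1 had no docs

    // The invariant asserted later in build_index: every entry (items + crate)
    // is accounted for either by a shard or by the empty list.
    assert_eq!(4, desc.iter().map(|(len, _)| *len).sum::<usize>() + empty_desc.len());
}
```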

/// Builds the search index from the collected metadata
pub(crate) fn build_index<'tcx>(
krate: &clean::Crate,
cache: &mut Cache,
tcx: TyCtxt<'tcx>,
) -> String {
) -> SerializedSearchIndex {
let mut itemid_to_pathid = FxHashMap::default();
let mut primitives = FxHashMap::default();
let mut associated_types = FxHashMap::default();
@@ -318,7 +354,6 @@ pub(crate) fn build_index<'tcx>(
.collect::<Vec<_>>();

struct CrateData<'a> {
doc: String,
items: Vec<&'a IndexItem>,
paths: Vec<(ItemType, Vec<Symbol>)>,
// The String is alias name and the vec is the list of the elements with this alias.
@@ -327,6 +362,11 @@ aliases: &'a BTreeMap<String, Vec<usize>>,
aliases: &'a BTreeMap<String, Vec<usize>>,
// Used when a type has more than one impl with an associated item with the same name.
associated_item_disambiguators: &'a Vec<(usize, String)>,
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
// for information on the format.
desc_index: String,
// A list of items with no description. This is eventually turned into a bitmap.
empty_desc: Vec<u32>,
}

struct Paths {
@@ -408,7 +448,6 @@ pub(crate) fn build_index<'tcx>(
let mut names = Vec::with_capacity(self.items.len());
let mut types = String::with_capacity(self.items.len());
let mut full_paths = Vec::with_capacity(self.items.len());
let mut descriptions = Vec::with_capacity(self.items.len());
let mut parents = Vec::with_capacity(self.items.len());
let mut functions = String::with_capacity(self.items.len());
let mut deprecated = Vec::with_capacity(self.items.len());
@@ -431,7 +470,6 @@ pub(crate) fn build_index<'tcx>(
parents.push(item.parent_idx.map(|x| x + 1).unwrap_or(0));

names.push(item.name.as_str());
descriptions.push(&item.desc);

if !item.path.is_empty() {
full_paths.push((index, &item.path));
@@ -443,7 +481,8 @@ }
}

if item.deprecation.is_some() {
deprecated.push(index);
// bitmasks always use 1-indexing for items, with 0 as the crate itself
deprecated.push(u32::try_from(index + 1).unwrap());
}
}

@@ -454,42 +493,84 @@
let has_aliases = !self.aliases.is_empty();
let mut crate_data =
serializer.serialize_struct("CrateData", if has_aliases { 9 } else { 8 })?;
crate_data.serialize_field("doc", &self.doc)?;
crate_data.serialize_field("t", &types)?;
crate_data.serialize_field("n", &names)?;
// Serialize as an array of item indices and full paths
crate_data.serialize_field("q", &full_paths)?;
crate_data.serialize_field("d", &descriptions)?;
crate_data.serialize_field("i", &parents)?;
crate_data.serialize_field("f", &functions)?;
crate_data.serialize_field("c", &deprecated)?;
crate_data.serialize_field("D", &self.desc_index)?;
crate_data.serialize_field("p", &paths)?;
crate_data.serialize_field("b", &self.associated_item_disambiguators)?;
crate_data.serialize_field("c", &bitmap_to_string(&deprecated))?;
crate_data.serialize_field("e", &bitmap_to_string(&self.empty_desc))?;
if has_aliases {
crate_data.serialize_field("a", &self.aliases)?;
}
crate_data.end()
}
}
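Compared to the old serializer, the inline `doc` and `d` (descriptions) fields are gone: descriptions now live in the shard files, with `D` carrying the shard lengths and `e` the empty-description bitmap, while `c` switches from an index array to a bitmap string. For orientation, the per-crate payload ends up as a two-element JSON array of crate name plus this `CrateData` object; the sketch below uses placeholder values only, since the real strings depend on the encode module:

```rust
fn main() {
    // Illustrative shape only; every value here is a placeholder.
    //   t = item types        n = names                 q = (index, full path)
    //   i = parent indices    f = function signatures   D = vlqhex shard lengths
    //   p = paths table       b = assoc.-item disambiguators
    //   c = deprecated bitmap e = empty-description bitmap
    //   a = aliases (serialized only when non-empty)
    let example_index = r#"["example_crate",{
        "t":"...","n":["Foo","bar"],"q":[[0,"example_crate::Foo"]],"i":[0,1],
        "f":"...","D":"...","p":[[1,"Foo"]],"b":[],"c":"...","e":"...",
        "a":{"foo_alias":[1]}}]"#;
    println!("{example_index}");
}
```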

// Collect the index into a string
format!(
let (empty_desc, desc) = {
let mut empty_desc = Vec::new();
let mut result = Vec::new();
let mut set = String::new();
let mut len: usize = 0;
let mut item_index: u32 = 0;
for desc in std::iter::once(&crate_doc).chain(crate_items.iter().map(|item| &item.desc)) {
if desc == "" {
empty_desc.push(item_index);
item_index += 1;
continue;
}
if set.len() >= DESC_INDEX_SHARD_LEN {
result.push((len, std::mem::replace(&mut set, String::new())));
len = 0;
} else if len != 0 {
set.push('\n');
}
set.push_str(&desc);
len += 1;
item_index += 1;
}
result.push((len, std::mem::replace(&mut set, String::new())));
(empty_desc, result)
};

let desc_index = {
let mut desc_index = String::with_capacity(desc.len() * 4);
for &(len, _) in desc.iter() {
write_vlqhex_to_string(len.try_into().unwrap(), &mut desc_index);
}
desc_index
};

assert_eq!(
crate_items.len() + 1,
desc.iter().map(|(len, _)| *len).sum::<usize>() + empty_desc.len()
);
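The shard lengths in `desc_index` are presumably what lets the search frontend work out which shard file holds a given description without downloading all of them. That lookup happens in JavaScript and is not part of this section; purely as an illustration of the arithmetic, in the same terms as the Rust above:

```rust
// Given the per-shard description counts, map the position of an item within
// the sequence of non-empty descriptions to (shard number, line in that shard).
// Items flagged in the empty-description bitmap never reach this lookup.
fn find_desc(shard_lens: &[usize], pos: usize) -> Option<(usize, usize)> {
    let mut start = 0;
    for (shard, &len) in shard_lens.iter().enumerate() {
        if pos < start + len {
            return Some((shard, pos - start));
        }
        start += len;
    }
    None
}

fn main() {
    // Hypothetical shard sizes; real ones depend on the 128KiB threshold.
    let shard_lens = [4096, 4096, 100];
    assert_eq!(find_desc(&shard_lens, 5000), Some((1, 904)));
}
```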

// The index, which is actually used to search, is JSON
(A review conversation on this line was marked as resolved by notriddle.)
// It uses `JSON.parse(..)` to actually load, since JSON
// parses faster than the full JavaScript syntax.
let index = format!(
r#"["{}",{}]"#,
krate.name(tcx),
serde_json::to_string(&CrateData {
doc: crate_doc,
items: crate_items,
paths: crate_paths,
aliases: &aliases,
associated_item_disambiguators: &associated_item_disambiguators,
desc_index,
empty_desc,
})
.expect("failed serde conversion")
// All these `replace` calls are because we have to go through JS string for JSON content.
.replace('\\', r"\\")
.replace('\'', r"\'")
// We need to escape double quotes for the JSON.
.replace("\\\"", "\\\\\"")
)
);
SerializedSearchIndex { index, desc }
}
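The trailing `replace` calls make sense given the comment above: the JSON is embedded in a JavaScript string literal (a single-quoted one, judging by the `\'` escaping) and then handed to `JSON.parse`, so backslashes and quotes have to survive one extra layer of unescaping. A quick self-contained illustration with a made-up payload:

```rust
fn main() {
    // Hypothetical serde_json output containing an apostrophe, an escaped
    // quote, and an escaped backslash.
    let json = r#"["it's a \"quoted\" back\\slash"]"#;
    let escaped = json
        .replace('\\', r"\\")
        .replace('\'', r"\'")
        .replace("\\\"", "\\\\\"");
    // `escaped` is what gets written into the generated JS. A JS engine reading
    // it back out of a single-quoted string literal, then through JSON.parse,
    // recovers the original array element: it's a "quoted" back\slash
    println!("{escaped}");
}
```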

pub(crate) fn get_function_type_for_search<'tcx>(