rustdoc-search: shard the search result descriptions #122614

Merged · 7 commits · Apr 2, 2024

Changes from all commits
2 changes: 2 additions & 0 deletions Cargo.lock
@@ -4741,6 +4741,8 @@ version = "0.0.0"
dependencies = [
"arrayvec",
"askama",
"base64",
"byteorder",
"expect-test",
"indexmap",
"itertools 0.12.1",
2 changes: 1 addition & 1 deletion src/ci/docker/host-x86_64/mingw-check/Dockerfile
@@ -56,7 +56,7 @@ ENV SCRIPT python3 ../x.py --stage 2 test src/tools/expand-yaml-anchors && \
/scripts/validate-error-codes.sh && \
reuse --include-submodules lint && \
# Runs checks to ensure that there are no ES5 issues in our JS code.
es-check es6 ../src/librustdoc/html/static/js/*.js && \
es-check es8 ../src/librustdoc/html/static/js/*.js && \
eslint -c ../src/librustdoc/html/static/.eslintrc.js ../src/librustdoc/html/static/js/*.js && \
eslint -c ../src/tools/rustdoc-js/.eslintrc.js ../src/tools/rustdoc-js/tester.js && \
eslint -c ../src/tools/rustdoc-gui/.eslintrc.js ../src/tools/rustdoc-gui/tester.js
2 changes: 2 additions & 0 deletions src/librustdoc/Cargo.toml
@@ -9,6 +9,8 @@ path = "lib.rs"
[dependencies]
arrayvec = { version = "0.7", default-features = false }
askama = { version = "0.12", default-features = false, features = ["config"] }
base64 = "0.21.7"
byteorder = "1.5"
itertools = "0.12"
indexmap = "2"
minifier = "0.3.0"
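The two new dependencies line up with the new `search_index::encode` module used later in this diff: its `bitmap_to_string` helper turns the deprecated-item and empty-description ID lists into compact strings, and base64 plus byteorder are presumably what it uses under the hood (the encode module itself is not shown here). Purely as an illustration, a helper in that spirit might look like this; the actual on-disk bitmap layout rustdoc uses may be a compressed bitmap rather than a plain ID list:

```rust
use base64::Engine as _;
use byteorder::{LittleEndian, WriteBytesExt};

// Hypothetical stand-in for encode::bitmap_to_string: serialize each set ID as
// a little-endian u32 and base64-encode the byte buffer.
fn bitmap_to_string_sketch(ids: &[u32]) -> String {
    let mut bytes = Vec::with_capacity(ids.len() * 4);
    for &id in ids {
        bytes.write_u32::<LittleEndian>(id).expect("writing to a Vec cannot fail");
    }
    base64::engine::general_purpose::STANDARD.encode(&bytes)
}

fn main() {
    // 1-indexed item IDs, matching how the `deprecated` list is built below.
    println!("{}", bitmap_to_string_sketch(&[1, 5, 9]));
}
```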
33 changes: 4 additions & 29 deletions src/librustdoc/html/render/mod.rs
@@ -184,40 +184,15 @@ pub(crate) enum RenderTypeId {

impl RenderTypeId {
pub fn write_to_string(&self, string: &mut String) {
// (sign, value)
let (sign, id): (bool, u32) = match &self {
let id: i32 = match &self {
// 0 is a sentinel, everything else is one-indexed
// concrete type
RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()),
RenderTypeId::Index(idx) if *idx >= 0 => (idx + 1isize).try_into().unwrap(),
// generic type parameter
RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()),
RenderTypeId::Index(idx) => (*idx).try_into().unwrap(),
_ => panic!("must convert render types to indexes before serializing"),
};
// zig-zag encoding
let value: u32 = (id << 1) | (if sign { 1 } else { 0 });
// Self-terminating hex use capital letters for everything but the
// least significant digit, which is lowercase. For example, decimal 17
// would be `` Aa `` if zig-zag encoding weren't used.
//
// Zig-zag encoding, however, stores the sign bit as the last bit.
// This means, in the last hexit, 1 is actually `c`, -1 is `b`
// (`a` is the imaginary -0), and, because all the bits are shifted
// by one, `` A` `` is actually 8 and `` Aa `` is -8.
//
// https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html
// describes the encoding in more detail.
let mut shift: u32 = 28;
let mut mask: u32 = 0xF0_00_00_00;
while shift < 32 {
let hexit = (value & mask) >> shift;
if hexit != 0 || shift == 0 {
let hex =
char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
string.push(hex);
}
shift = shift.wrapping_sub(4);
mask = mask >> 4;
}
search_index::encode::write_vlqhex_to_string(id, string);
}
}

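The deleted comments above describe the encoding that now lives in `search_index::encode::write_vlqhex_to_string`: zig-zag the sign into the low bit, then emit each hex digit as a letter, using an uppercase range starting at `@` for all but the last digit and a lowercase range starting at the backtick character for the final one, so every number is self-terminating. The encode module is not part of this section, so the following is only a sketch reconstructing that description (in particular, the sign-bit polarity and the handling of interior zero hexits are assumptions):

```rust
// Sketch of a self-terminating VLQ-hex encoder matching the comments removed
// above; not necessarily identical to rustdoc's encode::write_vlqhex_to_string.
fn write_vlqhex_to_string(n: i32, string: &mut String) {
    // Zig-zag: shift the magnitude left by one and store the sign in the low
    // bit (assumed polarity: negative numbers set the bit).
    let value: u32 = if n >= 0 { (n as u32) << 1 } else { ((-n) as u32) << 1 | 1 };
    let mut shift: u32 = 28;
    let mut started = false;
    while shift < 32 {
        let hexit = ((value >> shift) & 0xF) as u8;
        // Skip leading zero hexits, but always emit the final (lowercase) one.
        if hexit != 0 || started || shift == 0 {
            started = true;
            let base = if shift == 0 { b'`' } else { b'@' };
            string.push(char::from(base + hexit));
        }
        shift = shift.wrapping_sub(4);
    }
}

fn main() {
    let mut s = String::new();
    write_vlqhex_to_string(17, &mut s);
    // 17 zig-zags to 0x22, so this prints "Bb"; without zig-zag it would be
    // the "Aa" mentioned in the removed comment.
    println!("{s}");
}
```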
107 changes: 94 additions & 13 deletions src/librustdoc/html/render/search_index.rs
@@ -1,3 +1,5 @@
pub(crate) mod encode;

use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, VecDeque};

@@ -17,12 +19,46 @@ use crate::html::format::join_with_double_colon;
use crate::html::markdown::short_markdown_summary;
use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, RenderTypeId};

use encode::{bitmap_to_string, write_vlqhex_to_string};

/// The serialized search index, with the item descriptions sharded out of it.
///
/// The `index` field is a JSON-encoded list of names and other information.
///
/// The `desc` field holds newline-separated descriptions, split by size into 128KiB shards.
Review comment (Member):

Would be nice if you added some explanation of why 128KiB was picked and not another size.

Reply (Contributor, PR author):
There is no single, optimal size for these shards, because it depends on
configuration values that we can't predict or control, such as the version
of HTTP used (HTTP/1.1 would work better with larger files, while HTTP/2
and 3 are more agnostic), transport compression (gzip, zstd, etc), whether
the search query is going to produce a large number of results or a small
number, the bandwidth delay product of the network...

Gzipping some standard library descriptions to guess what transport
compression will do, the compressed file sizes can be as small as 4.9KiB
or as large as 18KiB (ignoring the final 1.9KiB shard of leftovers).
A "reasonable" range for files is for them to be bigger than 1KiB,
since that's about the amount of data that can be transferred in a
single TCP packet, and 64KiB, the maximum amount of data that
TCP can transfer in a single round trip without extensions.

/// For example, `(4, "foo\nbar\nbaz\nquux")`.
///
/// There is no single optimal size for these shards, because it depends on
/// configuration values that we can't predict or control, such as the version
/// of HTTP used (HTTP/1.1 works better with larger files, while HTTP/2
/// and 3 are more agnostic), transport compression (gzip, zstd, etc.), whether
/// the search query is going to produce a large number of results or a small
/// number, the bandwidth-delay product of the network...
///
/// Gzipping some standard library descriptions to guess what transport
/// compression will do, the compressed file sizes can be as small as 4.9KiB
/// or as large as 18KiB (ignoring the final 1.9KiB shard of leftovers).
/// A "reasonable" size range runs from about 1KiB, since that's roughly the
/// amount of data that can be transferred in a single TCP packet [1], up to
/// 64KiB, the maximum amount of data that TCP can transfer in a single round
/// trip without extensions [2][3].
///
/// [1]: https://en.wikipedia.org/wiki/Maximum_transmission_unit#MTUs_for_common_media
/// [2]: https://en.wikipedia.org/wiki/Sliding_window_protocol#Basic_concept
/// [3]: https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/description-tcp-features
pub(crate) struct SerializedSearchIndex {
pub(crate) index: String,
(A review conversation on this line was marked as resolved by GuillaumeGomez.)
pub(crate) desc: Vec<(usize, String)>,
}

const DESC_INDEX_SHARD_LEN: usize = 128 * 1024;
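To make the shard format concrete, here is a small worked example (a sketch with made-up descriptions, assuming everything fits into a single shard well under the 128KiB limit):

```rust
fn main() {
    // Descriptions are visited in order: the crate docs first (index 0), then
    // each item (1-indexed). Empty descriptions never occupy a line in a shard;
    // their indices go into `empty_desc` instead.
    //
    //   0: "Top-level crate docs"   1: ""   2: "First item"   3: "Second item"
    let desc: Vec<(usize, String)> = vec![
        // (number of descriptions in this shard, newline-joined descriptions)
        (3, "Top-level crate docs\nFirst item\nSecond item".to_string()),
    ];
    let empty_desc: Vec<u32> = vec![1]; // entry 1 had no docs

    // The invariant asserted later in build_index: every entry (items + crate)
    // is accounted for either by a shard or by the empty list.
    assert_eq!(4, desc.iter().map(|(len, _)| *len).sum::<usize>() + empty_desc.len());
}
```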

/// Builds the search index from the collected metadata
pub(crate) fn build_index<'tcx>(
krate: &clean::Crate,
cache: &mut Cache,
tcx: TyCtxt<'tcx>,
) -> String {
) -> SerializedSearchIndex {
let mut itemid_to_pathid = FxHashMap::default();
let mut primitives = FxHashMap::default();
let mut associated_types = FxHashMap::default();
@@ -318,7 +354,6 @@ pub(crate) fn build_index<'tcx>(
.collect::<Vec<_>>();

struct CrateData<'a> {
doc: String,
items: Vec<&'a IndexItem>,
paths: Vec<(ItemType, Vec<Symbol>)>,
// The String is alias name and the vec is the list of the elements with this alias.
@@ -327,6 +362,11 @@ aliases: &'a BTreeMap<String, Vec<usize>>,
aliases: &'a BTreeMap<String, Vec<usize>>,
// Used when a type has more than one impl with an associated item with the same name.
associated_item_disambiguators: &'a Vec<(usize, String)>,
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
// for information on the format.
desc_index: String,
// A list of items with no description. This is eventually turned into a bitmap.
empty_desc: Vec<u32>,
}

struct Paths {
@@ -408,7 +448,6 @@ pub(crate) fn build_index<'tcx>(
let mut names = Vec::with_capacity(self.items.len());
let mut types = String::with_capacity(self.items.len());
let mut full_paths = Vec::with_capacity(self.items.len());
let mut descriptions = Vec::with_capacity(self.items.len());
let mut parents = Vec::with_capacity(self.items.len());
let mut functions = String::with_capacity(self.items.len());
let mut deprecated = Vec::with_capacity(self.items.len());
@@ -431,7 +470,6 @@ pub(crate) fn build_index<'tcx>(
parents.push(item.parent_idx.map(|x| x + 1).unwrap_or(0));

names.push(item.name.as_str());
descriptions.push(&item.desc);

if !item.path.is_empty() {
full_paths.push((index, &item.path));
@@ -443,7 +481,8 @@ }
}

if item.deprecation.is_some() {
deprecated.push(index);
// bitmasks always use 1-indexing for items, with 0 as the crate itself
deprecated.push(u32::try_from(index + 1).unwrap());
}
}

@@ -454,42 +493,84 @@
let has_aliases = !self.aliases.is_empty();
let mut crate_data =
serializer.serialize_struct("CrateData", if has_aliases { 9 } else { 8 })?;
crate_data.serialize_field("doc", &self.doc)?;
crate_data.serialize_field("t", &types)?;
crate_data.serialize_field("n", &names)?;
// Serialize as an array of item indices and full paths
crate_data.serialize_field("q", &full_paths)?;
crate_data.serialize_field("d", &descriptions)?;
crate_data.serialize_field("i", &parents)?;
crate_data.serialize_field("f", &functions)?;
crate_data.serialize_field("c", &deprecated)?;
crate_data.serialize_field("D", &self.desc_index)?;
crate_data.serialize_field("p", &paths)?;
crate_data.serialize_field("b", &self.associated_item_disambiguators)?;
crate_data.serialize_field("c", &bitmap_to_string(&deprecated))?;
crate_data.serialize_field("e", &bitmap_to_string(&self.empty_desc))?;
if has_aliases {
crate_data.serialize_field("a", &self.aliases)?;
}
crate_data.end()
}
}
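Compared to the old serializer, the inline `doc` and `d` (descriptions) fields are gone: descriptions now live in the shard files, with `D` carrying the shard lengths and `e` the empty-description bitmap, while `c` switches from an index array to a bitmap string. For orientation, the per-crate payload ends up as a two-element JSON array of crate name plus this `CrateData` object; the sketch below uses placeholder values only, since the real strings depend on the encode module:

```rust
fn main() {
    // Illustrative shape only; every value here is a placeholder.
    //   t = item types        n = names                 q = (index, full path)
    //   i = parent indices    f = function signatures   D = vlqhex shard lengths
    //   p = paths table       b = assoc.-item disambiguators
    //   c = deprecated bitmap e = empty-description bitmap
    //   a = aliases (serialized only when non-empty)
    let example_index = r#"["example_crate",{
        "t":"...","n":["Foo","bar"],"q":[[0,"example_crate::Foo"]],"i":[0,1],
        "f":"...","D":"...","p":[[1,"Foo"]],"b":[],"c":"...","e":"...",
        "a":{"foo_alias":[1]}}]"#;
    println!("{example_index}");
}
```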

// Collect the index into a string
format!(
let (empty_desc, desc) = {
let mut empty_desc = Vec::new();
let mut result = Vec::new();
let mut set = String::new();
let mut len: usize = 0;
let mut item_index: u32 = 0;
for desc in std::iter::once(&crate_doc).chain(crate_items.iter().map(|item| &item.desc)) {
if desc == "" {
empty_desc.push(item_index);
item_index += 1;
continue;
}
if set.len() >= DESC_INDEX_SHARD_LEN {
result.push((len, std::mem::replace(&mut set, String::new())));
len = 0;
} else if len != 0 {
set.push('\n');
}
set.push_str(&desc);
len += 1;
item_index += 1;
}
result.push((len, std::mem::replace(&mut set, String::new())));
(empty_desc, result)
};

let desc_index = {
let mut desc_index = String::with_capacity(desc.len() * 4);
for &(len, _) in desc.iter() {
write_vlqhex_to_string(len.try_into().unwrap(), &mut desc_index);
}
desc_index
};

assert_eq!(
crate_items.len() + 1,
desc.iter().map(|(len, _)| *len).sum::<usize>() + empty_desc.len()
);
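The shard lengths in `desc_index` are presumably what lets the search frontend work out which shard file holds a given description without downloading all of them. That lookup happens in JavaScript and is not part of this section; purely as an illustration of the arithmetic, in the same terms as the Rust above:

```rust
// Given the per-shard description counts, map the position of an item within
// the sequence of non-empty descriptions to (shard number, line in that shard).
// Items flagged in the empty-description bitmap never reach this lookup.
fn find_desc(shard_lens: &[usize], pos: usize) -> Option<(usize, usize)> {
    let mut start = 0;
    for (shard, &len) in shard_lens.iter().enumerate() {
        if pos < start + len {
            return Some((shard, pos - start));
        }
        start += len;
    }
    None
}

fn main() {
    // Hypothetical shard sizes; real ones depend on the 128KiB threshold.
    let shard_lens = [4096, 4096, 100];
    assert_eq!(find_desc(&shard_lens, 5000), Some((1, 904)));
}
```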

// The index, which is actually used to search, is JSON
(A review conversation on this line was marked as resolved by notriddle.)
// It uses `JSON.parse(..)` to actually load, since JSON
// parses faster than the full JavaScript syntax.
let index = format!(
r#"["{}",{}]"#,
krate.name(tcx),
serde_json::to_string(&CrateData {
doc: crate_doc,
items: crate_items,
paths: crate_paths,
aliases: &aliases,
associated_item_disambiguators: &associated_item_disambiguators,
desc_index,
empty_desc,
})
.expect("failed serde conversion")
// All these `replace` calls are because we have to go through JS string for JSON content.
.replace('\\', r"\\")
.replace('\'', r"\'")
// We need to escape double quotes for the JSON.
.replace("\\\"", "\\\\\"")
)
);
SerializedSearchIndex { index, desc }
}
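The trailing `replace` calls make sense given the comment above: the JSON is embedded in a JavaScript string literal (a single-quoted one, judging by the `\'` escaping) and then handed to `JSON.parse`, so backslashes and quotes have to survive one extra layer of unescaping. A quick self-contained illustration with a made-up payload:

```rust
fn main() {
    // Hypothetical serde_json output containing an apostrophe, an escaped
    // quote, and an escaped backslash.
    let json = r#"["it's a \"quoted\" back\\slash"]"#;
    let escaped = json
        .replace('\\', r"\\")
        .replace('\'', r"\'")
        .replace("\\\"", "\\\\\"");
    // `escaped` is what gets written into the generated JS. A JS engine reading
    // it back out of a single-quoted string literal, then through JSON.parse,
    // recovers the original array element: it's a "quoted" back\slash
    println!("{escaped}");
}
```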

pub(crate) fn get_function_type_for_search<'tcx>(