Skip to content

Commit

Permalink
Reduce the size of static data in std_unicode::tables.
Browse files Browse the repository at this point in the history
`BoolTrie` works well for sets of code points spread out through
most of Unicode’s range, but it uses a lot of space for sets
with few, mostly low, code points.

This switches a few of its instances to a similar but simpler trie
data structure.

 ## Before

`size_of::<BoolTrie>()` is 1552, which is added to
`t.r3.len() * 8 + t.r5.len() + t.r6.len() * 8`:

* `Cc_table`: 1632
* `White_Space_table`: 1656
* `Pattern_White_Space_table`: 1640
* Total: 4928 bytes

 ## After

`size_of::<SmallBoolTrie>()` is 32, which is added to
`t.r1.len() + t.r2.len() * 8`:

* `Cc_table`: 51
* `White_Space_table`: 273
* `Pattern_White_Space_table`: 193
* Total: 517 bytes

 ## Difference

Every Rust program with `std` statically linked should be about 4 KB smaller.
  • Loading branch information
SimonSapin committed Jan 3, 2017
1 parent 90c7c05 commit 3b208d2
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 220 deletions.
13 changes: 7 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@ __pycache__/
/obj/
/rt/
/rustllvm/
/src/libunicode/DerivedCoreProperties.txt
/src/libunicode/EastAsianWidth.txt
/src/libunicode/HangulSyllableType.txt
/src/libunicode/PropList.txt
/src/libunicode/Scripts.txt
/src/libunicode/UnicodeData.txt
/src/libstd_unicode/DerivedCoreProperties.txt
/src/libstd_unicode/DerivedNormalizationProps.txt
/src/libstd_unicode/PropList.txt
/src/libstd_unicode/ReadMe.txt
/src/libstd_unicode/Scripts.txt
/src/libstd_unicode/SpecialCasing.txt
/src/libstd_unicode/UnicodeData.txt
/stage[0-9]+/
/target
/test/
Expand Down
64 changes: 58 additions & 6 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.

import fileinput, re, os, sys, operator
import fileinput, re, os, sys, operator, math

preamble = '''// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
Expand Down Expand Up @@ -359,7 +359,23 @@ def emit_trie_lookup_range_table(f):
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
trie_range_leaf(c, r.r6[leaf as usize])
}
}\n
}
pub struct SmallBoolTrie {
r1: &'static [u8], // first level
r2: &'static [u64], // leaves
}
impl SmallBoolTrie {
fn lookup(&self, c: char) -> bool {
let c = c as usize;
match self.r1.get(c >> 6) {
Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
None => false,
}
}
}
""")

def compute_trie(rawdata, chunksize):
Expand Down Expand Up @@ -429,13 +445,49 @@ def emit_bool_trie(f, name, t_data, is_pub=True):

f.write(" };\n\n")

def emit_small_bool_trie(f, name, t_data, is_pub=True):
    """Write a Rust `SmallBoolTrie` constant describing a set of code points.

    f      -- writable file-like object receiving the generated Rust source.
    name   -- identifier of the emitted Rust `const`.
    t_data -- iterable of (lo, hi) code point ranges; both ends are inclusive.
              Must be non-empty (`max()` raises ValueError otherwise).
    is_pub -- when true, the constant is emitted with a `pub ` prefix.
    """
    # One 64-bit bitmap per block of 64 code points, covering everything up
    # to the highest code point present in t_data.
    last_chunk = max(hi // 64 for (lo, hi) in t_data)
    n_chunks = last_chunk + 1
    chunks = [0] * n_chunks
    for (lo, hi) in t_data:
        for cp in range(lo, hi + 1):
            # cp <= max(hi), so cp // 64 is always a valid index here.
            chunks[cp // 64] |= 1 << (cp & 63)

    pub_string = "pub " if is_pub else ""
    f.write(" %sconst %s: &'static super::SmallBoolTrie = &super::SmallBoolTrie {\n"
            % (pub_string, name))

    # NOTE(review): compute_trie is defined elsewhere in this file; it is
    # presumably deduplicating the bitmaps into (index table, unique leaves)
    # -- confirm against its definition.
    (r1, r2) = compute_trie(chunks, 1)

    # r1: first level of the trie, written as decimal integers.
    f.write(" r1: &[\n")
    data = ','.join(str(node) for node in r1)
    format_table_content(f, data, 12)
    f.write("\n ],\n")

    # r2: leaves, written as 16-digit hexadecimal 64-bit words.
    f.write(" r2: &[\n")
    data = ','.join('0x%016x' % node for node in r2)
    format_table_content(f, data, 12)
    f.write("\n ],\n")

    f.write(" };\n\n")

def emit_property_module(f, mod, tbl, emit):
    """Write a Rust `pub mod` with one lookup table and accessor per category.

    f    -- writable file-like object receiving the generated Rust source.
    mod  -- name of the Rust module to emit.
    tbl  -- mapping from category name to its code point range data.
    emit -- iterable of category names to emit; processed in sorted order.

    Each category is emitted exactly once. The categories listed in
    small_tries use the compact `SmallBoolTrie` representation; all others
    use the general `BoolTrie`.
    """
    # Categories that get the SmallBoolTrie layout instead of BoolTrie.
    small_tries = ["Cc", "White_Space", "Pattern_White_Space"]

    f.write("pub mod %s {\n" % mod)
    for cat in sorted(emit):
        if cat in small_tries:
            emit_small_bool_trie(f, "%s_table" % cat, tbl[cat])
            f.write(" pub fn %s(c: char) -> bool {\n" % cat)
            f.write(" %s_table.lookup(c)\n" % cat)
            f.write(" }\n\n")
        else:
            emit_bool_trie(f, "%s_table" % cat, tbl[cat])
            f.write(" pub fn %s(c: char) -> bool {\n" % cat)
            f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
            f.write(" }\n\n")
    f.write("}\n\n")

def emit_conversions_module(f, to_upper, to_lower, to_title):
Expand Down
Loading

0 comments on commit 3b208d2

Please sign in to comment.