Skip to content

Commit

Permalink
run rustfmt
Browse files Browse the repository at this point in the history
  • Loading branch information
rustfmt authored and progval committed Mar 3, 2023
1 parent d500740 commit 4abb9f4
Show file tree
Hide file tree
Showing 7 changed files with 205 additions and 120 deletions.
6 changes: 4 additions & 2 deletions generator/src/formatting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ use std::io::prelude::*;
static LINE_LIMIT: usize = 95;

pub struct Context {
pub out: Box<dyn Write+'static>
pub out: Box<dyn Write + 'static>,
}

impl Context {
pub fn write_array<T, F>(&mut self, name: &str, ty: &str, elements: &[T], format: F)
where F: Fn(&T) -> String{
where
F: Fn(&T) -> String,
{
w!(self, "pub static {}: &'static [{}] = &[", name, ty);

let mut width = LINE_LIMIT;
Expand Down
167 changes: 109 additions & 58 deletions generator/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use std::{cmp, char};
use std::collections::{HashMap, hash_map};
use std::fs::{File, self};
use std::io::{BufReader, BufWriter, self};
use std::collections::{hash_map, HashMap};
use std::fs::{self, File};
use std::io::prelude::*;
use std::io::{self, BufReader, BufWriter};
use std::iter::repeat;
use std::path::Path;
use std::{char, cmp};

use crate::formatting::Context;

Expand All @@ -30,15 +30,19 @@ fn get_table_data() -> (Vec<(char, String)>, Vec<(char, char)>) {
($line: expr) => {{
let line = $line;
let mut splits = line.split(';');
let cp = splits.next().and_then(|s| u32::from_str_radix(s, 16).ok())
let cp = splits
.next()
.and_then(|s| u32::from_str_radix(s, 16).ok())
.unwrap_or_else(|| panic!("invalid {}", line));
let c = match char::from_u32(cp) {
None => continue,
Some(c) => c,
};
let name = splits.next().unwrap_or_else(|| panic!("missing name {}", line));
let name = splits
.next()
.unwrap_or_else(|| panic!("missing name {}", line));
(c, name)
}}
}};
}

let r = BufReader::new(File::open(Path::new(IN_FILE)).unwrap());
Expand All @@ -50,7 +54,7 @@ fn get_table_data() -> (Vec<(char, String)>, Vec<(char, char)>) {
loop {
let l = match iter.next() {
Some(l) => l,
None => break
None => break,
};

let (cp, name) = extract!(l.trim());
Expand All @@ -62,8 +66,7 @@ fn get_table_data() -> (Vec<(char, String)>, Vec<(char, char)>) {
// should be CJK Ideograph ..., Last
let line2 = iter.next().expect("unclosed ideograph range");
let (cp2, name2) = extract!(line2.trim());
assert_eq!(&*name.replace("First", "Last"),
&name2[1..name2.len() - 1]);
assert_eq!(&*name.replace("First", "Last"), &name2[1..name2.len() - 1]);

cjk_ideograph_ranges.push((cp, cp2));
} else if name.starts_with("Hangul Syllable") {
Expand Down Expand Up @@ -92,9 +95,9 @@ fn write_cjk_ideograph_ranges(ctxt: &mut Context, ranges: &[(char, char)]) {
/// Construct a huge string storing the text data, and return it,
/// along with information about the position and frequency of the
/// constituent words of the input.
fn create_lexicon_and_offsets(mut codepoint_names: Vec<(char, String)>) -> (String,
Vec<(usize, Vec<u8>,
usize)>) {
fn create_lexicon_and_offsets(
mut codepoint_names: Vec<(char, String)>,
) -> (String, Vec<(usize, Vec<u8>, usize)>) {
codepoint_names.sort_by(|a, b| a.1.len().cmp(&b.1.len()).reverse());

// a trie of all the suffixes of the data,
Expand All @@ -107,7 +110,7 @@ fn create_lexicon_and_offsets(mut codepoint_names: Vec<(char, String)>) -> (Stri
for &(_, ref name) in codepoint_names.iter() {
for n in util::split(name, SPLITTERS) {
if n.len() == 1 && SPLITTERS.contains(&n.as_bytes()[0]) {
continue
continue;
}

let (already, previous_was_exact) = t.insert(n.bytes(), None, false);
Expand All @@ -132,15 +135,23 @@ fn create_lexicon_and_offsets(mut codepoint_names: Vec<(char, String)>) -> (Stri
// once we've found a string that's already
// been inserted, we know all suffixes will've
// been inserted too.
break
break;
}
}
}
}
}
let words: Vec<_> = t.iter().map(|(a, b, c)| (a, b, c.expect("unset offset?"))).collect();
println!("Lexicon: # words {}, byte size {}, with {} ({} bytes) non-exact matches",
words.len(), output.len(), substring_overlaps, substring_o_bytes);
let words: Vec<_> = t
.iter()
.map(|(a, b, c)| (a, b, c.expect("unset offset?")))
.collect();
println!(
"Lexicon: # words {}, byte size {}, with {} ({} bytes) non-exact matches",
words.len(),
output.len(),
substring_overlaps,
substring_o_bytes
);
(output, words)
}

Expand Down Expand Up @@ -173,8 +184,8 @@ fn bin_data(dat: &[u32]) -> (Vec<u32>, Vec<u32>, usize) {
t1.push((index >> shift) as u32)
}

let my_size = t1.len() * util::smallest_type(t1.iter().copied()) +
t2.len() * util::smallest_type(t2.iter().copied());
let my_size = t1.len() * util::smallest_type(t1.iter().copied())
+ t2.len() * util::smallest_type(t2.iter().copied());
println!("binning: shift {}, size {}", shift, my_size);
if my_size < smallest {
data = (t1, t2, shift);
Expand All @@ -187,7 +198,10 @@ fn bin_data(dat: &[u32]) -> (Vec<u32>, Vec<u32>, usize) {
let (ref t1, ref t2, shift) = data;
let mask = (1 << shift) - 1;
for (i, &elem) in dat.iter().enumerate() {
assert_eq!(elem, t2[((t1[i >> shift] << shift) + (i as u32 & mask)) as usize])
assert_eq!(
elem,
t2[((t1[i >> shift] << shift) + (i as u32 & mask)) as usize]
)
}
}

Expand Down Expand Up @@ -252,7 +266,9 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, String)>
previous_len = len;
}

assert!(word_encodings.insert(word, vec![hi as u32, lo as u32]).is_none());
assert!(word_encodings
.insert(word, vec![hi as u32, lo as u32])
.is_none());
}
// don't forget the last one.
lexicon_ordered_lengths.push((lexicon_offsets.len(), previous_len));
Expand All @@ -273,7 +289,9 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, String)>

let mut last_len = 0;
for w in util::split(name, SPLITTERS) {
let data = word_encodings.get(w.as_bytes()).expect(concat!("option on ", line!()));
let data = word_encodings
.get(w.as_bytes())
.expect(concat!("option on ", line!()));
last_len = data.len();
// info!("{}: '{}' {}", name, w, data);

Expand All @@ -290,40 +308,62 @@ fn write_codepoint_maps(ctxt: &mut Context, codepoint_names: Vec<(char, String)>
// compress the offsets, hopefully collapsing all the 0's.
let (t1, t2, shift) = bin_data(&phrasebook_offsets);

w!(ctxt, "pub const MAX_NAME_LENGTH: usize = {};\n", longest_name);
w!(
ctxt,
"pub const MAX_NAME_LENGTH: usize = {};\n",
longest_name
);
ctxt.write_plain_string("LEXICON", &lexicon_string);
ctxt.write_debugs("LEXICON_OFFSETS", "u16", &lexicon_offsets);
ctxt.write_debugs("LEXICON_SHORT_LENGTHS", "u8",
&lexicon_short_lengths);
ctxt.write_debugs("LEXICON_ORDERED_LENGTHS", "(usize, u8)",
&lexicon_ordered_lengths);
ctxt.write_debugs("LEXICON_SHORT_LENGTHS", "u8", &lexicon_short_lengths);
ctxt.write_debugs(
"LEXICON_ORDERED_LENGTHS",
"(usize, u8)",
&lexicon_ordered_lengths,
);
w!(ctxt, "pub static PHRASEBOOK_SHORT: u8 = {};\n", short);
ctxt.write_debugs("PHRASEBOOK", "u8",
&phrasebook);
w!(ctxt, "pub static PHRASEBOOK_OFFSET_SHIFT: usize = {};\n", shift);
ctxt.write_debugs("PHRASEBOOK_OFFSETS1",
&util::smallest_u(t1.iter().copied()),
&t1);
ctxt.write_debugs("PHRASEBOOK_OFFSETS2",
&util::smallest_u(t2.iter().copied()),
&t2);
ctxt.write_debugs("PHRASEBOOK", "u8", &phrasebook);
w!(
ctxt,
"pub static PHRASEBOOK_OFFSET_SHIFT: usize = {};\n",
shift
);
ctxt.write_debugs(
"PHRASEBOOK_OFFSETS1",
&util::smallest_u(t1.iter().copied()),
&t1,
);
ctxt.write_debugs(
"PHRASEBOOK_OFFSETS2",
&util::smallest_u(t2.iter().copied()),
&t2,
);
}

fn main() {
let mut opts = getopts::Options::new();
opts.optflag("p", "phf", "compute the name -> codepoint PHF");
opts.optopt("l", "phf-lambda", "the lambda to use for PHF", "N");
opts.optopt("t", "phf-tries", "the number of attempts when computing PHF", "N");
opts.optopt(
"t",
"phf-tries",
"the number of attempts when computing PHF",
"N",
);
opts.optflag("s", "silent", "don't write anything to files");
opts.optopt("", "truncate", "only handle the first N", "N");
opts.optflag("h", "help", "print this message");
let matches = match opts.parse(std::env::args().skip(1)) {
Ok(m) => m, Err(f) => panic!("{}", f.to_string()),
Ok(m) => m,
Err(f) => panic!("{}", f.to_string()),
};

if matches.opt_present("h") {
println!("{}", opts.usage("generate compressed codepoint <-> name tables"));
return
println!(
"{}",
opts.usage("generate compressed codepoint <-> name tables")
);
return;
}
let do_phf = matches.opt_present("phf");
let file = if matches.opt_present("s") {
Expand All @@ -336,39 +376,50 @@ fn main() {

let mut ctxt = Context {
out: match file {
Some(p) => Box::new(BufWriter::new(File::create(&p.with_extension("tmp")).unwrap()))
as Box<dyn Write>,
None => Box::new(io::sink()) as Box<dyn Write>
}
Some(p) => Box::new(BufWriter::new(
File::create(&p.with_extension("tmp")).unwrap(),
)) as Box<dyn Write>,
None => Box::new(io::sink()) as Box<dyn Write>,
},
};
ctxt.out.write(b"// autogenerated by generator.rs\n").unwrap();
ctxt.out
.write(b"// autogenerated by generator.rs\n")
.unwrap();

let lambda = matches.opt_str("phf-lambda");
let tries = matches.opt_str("phf-tries");

let (mut codepoint_names, cjk) = get_table_data();
match matches.opt_str("truncate").map(
|s| s.parse().ok().expect("truncate should be an integer")) {
match matches
.opt_str("truncate")
.map(|s| s.parse().ok().expect("truncate should be an integer"))
{
Some(n) => codepoint_names.truncate(n),
None => {}
}

if do_phf {
let (n, disps, data) =
phf::create_phf(&codepoint_names,
lambda.map(|s| s.parse().ok().expect("invalid -l")).unwrap_or(3),
tries.map(|s| s.parse().ok().expect("invalid -t")).unwrap_or(2));

let (n, disps, data) = phf::create_phf(
&codepoint_names,
lambda
.map(|s| s.parse().ok().expect("invalid -l"))
.unwrap_or(3),
tries
.map(|s| s.parse().ok().expect("invalid -t"))
.unwrap_or(2),
);

w!(ctxt, "pub static NAME2CODE_N: u64 = {};\n", n);
ctxt.write_debugs("NAME2CODE_DISP",
"(u16, u16)",
&disps);
ctxt.write_debugs("NAME2CODE_DISP", "(u16, u16)", &disps);

ctxt.write_debugs("NAME2CODE_CODE", "char", &data);
} else {
if lambda.is_some() { println!("-l/--phf-lambda only applies with --phf") }
if tries.is_some() { println!("-t/--phf-tries only applies with --phf") }
if lambda.is_some() {
println!("-l/--phf-lambda only applies with --phf")
}
if tries.is_some() {
println!("-t/--phf-tries only applies with --phf")
}

write_cjk_ideograph_ranges(&mut ctxt, &cjk);
ctxt.out.write(b"\n").unwrap();
Expand Down
Loading

0 comments on commit 4abb9f4

Please sign in to comment.