From 6589c3bbb3c7d12d689e90ef93225d31d43ab15f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BF=A0=20/=20green?= Date: Sat, 25 Jan 2025 15:00:24 +0900 Subject: [PATCH] feat(mangler): reuse variable names (#8562) Changed the mangler to reuse variable names where possible. This will reduce the code size as shorter variable names can be used in more places. But requires global information and limits parallelism in a single file and requires more memory. --------- Co-authored-by: Boshen Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- Cargo.lock | 1 + Cargo.toml | 1 + crates/oxc_mangler/Cargo.toml | 1 + crates/oxc_mangler/src/lib.rs | 204 ++++++++++++------ crates/oxc_minifier/src/lib.rs | 11 +- crates/oxc_minifier/tests/mangler/mod.rs | 15 ++ .../tests/mangler/snapshots/mangler.snap | 140 +++++++++++- crates/oxc_semantic/src/lib.rs | 8 +- napi/minify/test/minify.test.ts | 2 +- tasks/benchmark/benches/minifier.rs | 9 +- tasks/minsize/minsize.snap | 24 +-- 11 files changed, 317 insertions(+), 99 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a6756661b6ce8..32e2183f08aee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1833,6 +1833,7 @@ dependencies = [ name = "oxc_mangler" version = "0.48.0" dependencies = [ + "fixedbitset", "itertools", "oxc_allocator", "oxc_ast", diff --git a/Cargo.toml b/Cargo.toml index ee8568d694859..974c802bf7723 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -149,6 +149,7 @@ encoding_rs = "0.8.35" encoding_rs_io = "0.1.7" env_logger = { version = "0.11.5", default-features = false } fast-glob = "0.4.0" +fixedbitset = "0.5.7" flate2 = "1.0.35" futures = "0.3.31" globset = "0.4.15" diff --git a/crates/oxc_mangler/Cargo.toml b/crates/oxc_mangler/Cargo.toml index f13471dc27ce2..dd3988f77123a 100644 --- a/crates/oxc_mangler/Cargo.toml +++ b/crates/oxc_mangler/Cargo.toml @@ -27,5 +27,6 @@ oxc_index = { workspace = true } oxc_semantic = { workspace = true } oxc_span = { workspace = true } +fixedbitset = { 
workspace = true } itertools = { workspace = true } rustc-hash = { workspace = true } diff --git a/crates/oxc_mangler/src/lib.rs b/crates/oxc_mangler/src/lib.rs index 0082d319c7c17..0dc5fab99de5c 100644 --- a/crates/oxc_mangler/src/lib.rs +++ b/crates/oxc_mangler/src/lib.rs @@ -1,12 +1,14 @@ +use std::iter; use std::ops::Deref; +use fixedbitset::FixedBitSet; use itertools::Itertools; use rustc_hash::FxHashSet; use oxc_allocator::{Allocator, Vec}; use oxc_ast::ast::{Declaration, Program, Statement}; use oxc_index::Idx; -use oxc_semantic::{ReferenceId, ScopeTree, SemanticBuilder, SymbolId, SymbolTable}; +use oxc_semantic::{ReferenceId, ScopeTree, Semantic, SemanticBuilder, SymbolId, SymbolTable}; use oxc_span::Atom; #[derive(Default, Debug, Clone, Copy)] @@ -22,17 +24,19 @@ type Slot = usize; /// See: /// * [esbuild](https://github.com/evanw/esbuild/blob/v0.24.0/docs/architecture.md#symbol-minification) /// -/// This algorithm is targeted for better gzip compression. +/// This algorithm is based on the implementation of esbuild and additionally implements improved name reuse functionality. +/// It targets better gzip compression. /// -/// Visually, a slot is the index position for binding identifiers: +/// A slot is a placeholder for binding identifiers that share the same name. +/// Visually, it is the index position for binding identifiers: /// /// ```javascript -/// function slot0(slot2, slot3, slot4) { +/// function slot0(slot1, slot2, slot3) { /// slot2 = 1; /// } -/// function slot1(slot2, slot3) { -/// function slot4() { -/// slot2 = 1; +/// function slot1(slot0) { +/// function slot2() { +/// slot0 = 1; /// } /// } /// ``` @@ -40,24 +44,73 @@ type Slot = usize; /// The slot number for a new scope starts after the maximum slot of the parent scope. 
/// /// Occurrences of slots and their corresponding newly assigned short identifiers are: -/// - slot2: 4 - a -/// - slot3: 2 - b -/// - slot4: 2 - c -/// - slot0: 1 - d -/// - slot1: 1 - e +/// - slot2: 3 - a +/// - slot0: 2 - b +/// - slot1: 2 - c +/// - slot3: 1 - d /// /// After swapping out the mangled names: /// /// ```javascript -/// function d(a, b, c) { +/// function b(c, a, d) { /// a = 1; /// } -/// function e(a, b) { -/// function c() { -/// a = 1; +/// function c(b) { +/// function a() { +/// b = 1; /// } /// } /// ``` +/// +/// ## Name Reuse Calculation +/// +/// This improvement was inspired by [evanw/esbuild#2614](https://github.com/evanw/esbuild/pull/2614). +/// +/// For better compression, we shadow the variables where possible to reuse the same name. +/// For example, the following code: +/// ```javascript +/// var top_level_a = 0; +/// var top_level_b = 1; +/// function foo() { +/// var foo_a = 1; +/// console.log(top_level_b, foo_a); +/// } +/// function bar() { +/// var bar_a = 1; +/// console.log(top_level_b, bar_a); +/// } +/// console.log(top_level_a, foo(), bar()) +/// ``` +/// `top_level_a` is declared in the root scope, but is not used in function `foo` and function `bar`. +/// Therefore, we can reuse the same name for `top_level_a` and `foo_a` and `bar_a`. +/// +/// To calculate whether the variable name can be reused in the descendant scopes, +/// this mangler introduces a concept of symbol liveness and slot liveness. +/// Symbol liveness is a subtree of the scope tree that contains the declared scope of the symbol and +/// all the scopes that the symbol is used in. It is a subtree, so any scopes that are between the declared scope and the used scope +/// are also included. This is to ensure that the symbol is not shadowed by a different symbol before the use in the descendant scope. 
+/// +/// For the example above, the liveness of each symbol is: +/// - `top_level_a`: root_scope +/// - `top_level_b`: root_scope -> foo, root_scope -> bar +/// - `foo_a`: root_scope -> foo +/// - `bar_a`: root_scope -> bar +/// - `foo`: root_scope +/// - `bar`: root_scope +/// +/// Slot liveness is the same as symbol liveness, but it is a subforest (multiple subtrees) of the scope tree that can contain +/// multiple symbol liveness. +/// +/// Now that we have the liveness of each symbol, we want to assign symbols to a minimal number of slots. +/// This is a graph coloring problem where the node of the graph is the symbol and the edge of the graph indicates whether +/// the symbols have a common live scope and the color of the node is the slot. +/// This mangler uses a greedy algorithm to assign symbols to slots to achieve that. +/// In other words, it assigns symbols to the first slot that does not live in the liveness of the symbol. +/// For the example above, each symbol is assigned to the following slots: +/// - slot 0: `top_level_a`, `foo_a`, `bar_a` +/// - slot 1: `top_level_b` +/// - slot 2: `foo` +/// - slot 3: `bar` #[derive(Default)] pub struct Mangler { symbol_table: SymbolTable, @@ -88,22 +141,20 @@ impl Mangler { #[must_use] pub fn build(self, program: &Program<'_>) -> Mangler { - let semantic = SemanticBuilder::new().build(program).semantic; - let (symbol_table, scope_tree) = semantic.into_symbol_table_and_scope_tree(); - self.build_with_symbols_and_scopes(symbol_table, &scope_tree, program) + let semantic = + SemanticBuilder::new().with_scope_tree_child_ids(true).build(program).semantic; + self.build_with_semantic(semantic, program) } + /// # Panics + /// + /// Panics if `child_ids` has not been generated for the scope tree. 
#[must_use] - pub fn build_with_symbols_and_scopes( - self, - symbol_table: SymbolTable, - scope_tree: &ScopeTree, - program: &Program<'_>, - ) -> Mangler { + pub fn build_with_semantic(self, semantic: Semantic<'_>, program: &Program<'_>) -> Mangler { if self.options.debug { - self.build_with_symbols_and_scopes_impl(symbol_table, scope_tree, program, debug_name) + self.build_with_symbols_and_scopes_impl(semantic, program, debug_name) } else { - self.build_with_symbols_and_scopes_impl(symbol_table, scope_tree, program, base54) + self.build_with_symbols_and_scopes_impl(semantic, program, base54) } } @@ -112,11 +163,14 @@ impl Mangler { G: Fn(usize) -> InlineString, >( mut self, - symbol_table: SymbolTable, - scope_tree: &ScopeTree, + semantic: Semantic<'_>, program: &Program<'_>, generate_name: G, ) -> Mangler { + let (mut symbol_table, scope_tree, ast_nodes) = semantic.into_symbols_scopes_nodes(); + + assert!(scope_tree.has_child_ids(), "child_id needs to be generated"); + let (exported_names, exported_symbols) = if self.options.top_level { Mangler::collect_exported_symbols(program) } else { @@ -125,59 +179,81 @@ impl Mangler { let allocator = Allocator::default(); - // Mangle the symbol table by computing slots from the scope tree. - // A slot is the occurrence index of a binding identifier inside a scope. - let mut symbol_table = symbol_table; - - // Total number of slots for all scopes - let mut total_number_of_slots: Slot = 0; - // All symbols with their assigned slots. Keyed by symbol id. let mut slots: Vec<'_, Slot> = Vec::with_capacity_in(symbol_table.len(), &allocator); for _ in 0..symbol_table.len() { slots.push(0); } - // Keep track of the maximum slot number for each scope - let mut max_slot_for_scope = Vec::with_capacity_in(scope_tree.len(), &allocator); - for _ in 0..scope_tree.len() { - max_slot_for_scope.push(0); - } + // Stores the lived scope ids for each slot. Keyed by slot number. 
+ let mut slot_liveness: std::vec::Vec<FixedBitSet> = vec![]; - // Walk the scope tree and compute the slot number for each scope let mut tmp_bindings = std::vec::Vec::with_capacity(100); - for scope_id in scope_tree.descendants_from_root() { + let mut reusable_slots = std::vec::Vec::new(); + // Walk down the scope tree and assign a slot number for each symbol. + // It is possible to do this in a loop over the symbol list, + // but walking down the scope tree seems to generate better code. + for scope_id in iter::once(scope_tree.root_scope_id()) + .chain(scope_tree.iter_all_child_ids(scope_tree.root_scope_id())) + { let bindings = scope_tree.get_bindings(scope_id); - - // The current slot number is continued by the maximum slot from the parent scope - let parent_max_slot = scope_tree - .get_parent_id(scope_id) - .map_or(0, |parent_scope_id| max_slot_for_scope[parent_scope_id.index()]); - - let mut slot = parent_max_slot; - - if !bindings.is_empty() { - // Sort `bindings` in declaration order. - tmp_bindings.clear(); - tmp_bindings.extend(bindings.values().copied()); - tmp_bindings.sort_unstable(); - for symbol_id in &tmp_bindings { - slots[symbol_id.index()] = slot; - slot += 1; - } + if bindings.is_empty() { + continue; } - max_slot_for_scope[scope_id.index()] = slot; + let mut slot = slot_liveness.len(); + + reusable_slots.clear(); + reusable_slots.extend( + // Slots that are already assigned to other symbols, but do not live in the current scope. + slot_liveness + .iter() + .enumerate() + .filter(|(_, slot_liveness)| !slot_liveness.contains(scope_id.index())) + .map(|(slot, _)| slot) + .take(bindings.len()), + ); + + // The number of new slots that need to be allocated. 
+ let remaining_count = bindings.len() - reusable_slots.len(); + reusable_slots.extend(slot..slot + remaining_count); + + slot += remaining_count; + if slot_liveness.len() < slot { + slot_liveness.resize_with(slot, || FixedBitSet::with_capacity(scope_tree.len())); + } - if slot > total_number_of_slots { - total_number_of_slots = slot; + // Sort `bindings` in declaration order. + tmp_bindings.clear(); + tmp_bindings.extend(bindings.values().copied()); + tmp_bindings.sort_unstable(); + for (symbol_id, assigned_slot) in + tmp_bindings.iter().zip(reusable_slots.iter().copied()) + { + slots[symbol_id.index()] = assigned_slot; + + // Calculate the scope ids that this symbol is alive in. + let lived_scope_ids = symbol_table + .get_resolved_references(*symbol_id) + .flat_map(|reference| { + let used_scope_id = ast_nodes.get_node(reference.node_id()).scope_id(); + scope_tree.ancestors(used_scope_id).take_while(|s_id| *s_id != scope_id) + }) + .chain(iter::once(scope_id)); + + // Since the slot is now assigned to this symbol, it is alive in all the scopes that this symbol is alive in. 
+ for scope_id in lived_scope_ids { + slot_liveness[assigned_slot].insert(scope_id.index()); + } } } + let total_number_of_slots = slot_liveness.len(); + let frequencies = self.tally_slot_frequencies( &symbol_table, &exported_symbols, - scope_tree, + &scope_tree, total_number_of_slots, &slots, &allocator, diff --git a/crates/oxc_minifier/src/lib.rs b/crates/oxc_minifier/src/lib.rs index 6904b981b53b0..ede3349e6f584 100644 --- a/crates/oxc_minifier/src/lib.rs +++ b/crates/oxc_minifier/src/lib.rs @@ -55,11 +55,12 @@ impl Minifier { Stats::default() }; let mangler = self.options.mangle.map(|options| { - let semantic = SemanticBuilder::new().with_stats(stats).build(program).semantic; - let (symbols, scopes) = semantic.into_symbol_table_and_scope_tree(); - Mangler::default() - .with_options(options) - .build_with_symbols_and_scopes(symbols, &scopes, program) + let semantic = SemanticBuilder::new() + .with_stats(stats) + .with_scope_tree_child_ids(true) + .build(program) + .semantic; + Mangler::default().with_options(options).build_with_semantic(semantic, program) }); MinifierReturn { mangler } } diff --git a/crates/oxc_minifier/tests/mangler/mod.rs b/crates/oxc_minifier/tests/mangler/mod.rs index 3d76bed459fca..8534638354ddb 100644 --- a/crates/oxc_minifier/tests/mangler/mod.rs +++ b/crates/oxc_minifier/tests/mangler/mod.rs @@ -25,6 +25,21 @@ fn mangler() { "var x; function foo(a) { ({ x } = y) }", "import { x } from 's'; export { x }", "function _ (exports) { Object.defineProperty(exports, '__esModule', { value: true }) }", + "function foo(foo_a, foo_b, foo_c) {}; function bar(bar_a, bar_b, bar_c) {}", // foo_a and bar_a can be reused + "function _() { function foo() { var x; foo; } }", // x should not use the same name with foo + "function _() { var x; function foo() { var y; function bar() { x } } }", // y should not shadow x + "function _() { function x(a) {} }", // a can shadow x + "function _() { function x(a) { x } }", // a should not shadow x + "function _() { 
var x; { var y }}", // y should not shadow x + "function _() { var x; { let y }}", // y can shadow x + "function _() { let x; { let y }}", // y can shadow x + "function _() { var x; { const y }}", // y can shadow x + "function _() { let x; { const y }}", // y can shadow x + "function _() { var x; { class Y{} }}", // Y can shadow x + "function _() { let x; { class Y{} }}", // Y can shadow x + "function _() { var x; try { throw 0 } catch (e) { e } }", // e can shadow x + "function _() { var x; try { throw 0 } catch (e) { var e } }", // e can shadow x (not implemented) + "function _() { var x; try { throw 0 } catch { var e } }", // e should not shadow x ]; let top_level_cases = [ "function foo(a) {a}", diff --git a/crates/oxc_minifier/tests/mangler/snapshots/mangler.snap b/crates/oxc_minifier/tests/mangler/snapshots/mangler.snap index 39756b92e6a0c..5056372bf82bf 100644 --- a/crates/oxc_minifier/tests/mangler/snapshots/mangler.snap +++ b/crates/oxc_minifier/tests/mangler/snapshots/mangler.snap @@ -2,23 +2,23 @@ source: crates/oxc_minifier/tests/mangler/mod.rs --- function foo(a) {a} -function foo(b) { - b; +function foo(a) { + a; } function foo(a) { let _ = { x } } -function foo(b) { - let c = { x }; +function foo(a) { + let b = { x }; } function foo(a) { let { x } = y } -function foo(b) { - let { x: c } = y; +function foo(a) { + let { x: b } = y; } var x; function foo(a) { ({ x } = y) } var x; -function foo(c) { +function foo(b) { ({x} = y); } @@ -31,9 +31,131 @@ function _(exports) { Object.defineProperty(exports, "__esModule", { value: true }); } +function foo(foo_a, foo_b, foo_c) {}; function bar(bar_a, bar_b, bar_c) {} +function foo(a, b, c) {} +; +function bar(a, b, c) {} + +function _() { function foo() { var x; foo; } } +function _() { + function a() { + var b; + a; + } +} + +function _() { var x; function foo() { var y; function bar() { x } } } +function _() { + var a; + function b() { + var b; + function c() { + a; + } + } +} + +function _() { function x(a) 
{} } +function _() { + function a(a) {} +} + +function _() { function x(a) { x } } +function _() { + function a(b) { + a; + } +} + +function _() { var x; { var y }} +function _() { + var a; + { + var b; + } +} + +function _() { var x; { let y }} +function _() { + var a; + { + let a; + } +} + +function _() { let x; { let y }} +function _() { + let a; + { + let a; + } +} + +function _() { var x; { const y }} +function _() { + var a; + { + const a; + } +} + +function _() { let x; { const y }} +function _() { + let a; + { + const a; + } +} + +function _() { var x; { class Y{} }} +function _() { + var a; + { + class a {} + } +} + +function _() { let x; { class Y{} }} +function _() { + let a; + { + class a {} + } +} + +function _() { var x; try { throw 0 } catch (e) { e } } +function _() { + var a; + try { + throw 0; + } catch (a) { + a; + } +} + +function _() { var x; try { throw 0 } catch (e) { var e } } +function _() { + var a; + try { + throw 0; + } catch (b) { + var b; + } +} + +function _() { var x; try { throw 0 } catch { var e } } +function _() { + var a; + try { + throw 0; + } catch { + var b; + } +} + function foo(a) {a} -function a(b) { - b; +function a(a) { + a; } export function foo() {}; foo() diff --git a/crates/oxc_semantic/src/lib.rs b/crates/oxc_semantic/src/lib.rs index 3bb10b9fe8652..175a191ce83f7 100644 --- a/crates/oxc_semantic/src/lib.rs +++ b/crates/oxc_semantic/src/lib.rs @@ -90,12 +90,16 @@ pub struct Semantic<'a> { } impl<'a> Semantic<'a> { - /// Extract the [`SymbolTable`] and [`ScopeTree`] from the [`Semantic`] - /// instance, consuming `self`. + /// Extract [`SymbolTable`] and [`ScopeTree`] from [`Semantic`]. pub fn into_symbol_table_and_scope_tree(self) -> (SymbolTable, ScopeTree) { (self.symbols, self.scopes) } + /// Extract [`SymbolTable`], [`ScopeTree`] and [`AstNodes`] from the [`Semantic`]. 
+ pub fn into_symbols_scopes_nodes(self) -> (SymbolTable, ScopeTree, AstNodes<'a>) { + (self.symbols, self.scopes, self.nodes) + } + /// Source code of the JavaScript/TypeScript program being analyzed. pub fn source_text(&self) -> &'a str { self.source_text diff --git a/napi/minify/test/minify.test.ts b/napi/minify/test/minify.test.ts index a9a88a1cb36ce..3841d0c5dba05 100644 --- a/napi/minify/test/minify.test.ts +++ b/napi/minify/test/minify.test.ts @@ -8,7 +8,7 @@ describe('simple', () => { it('matches output', () => { const ret = minify('test.js', code, { sourcemap: true }); expect(ret).toStrictEqual({ - 'code': 'function foo(){var b;b(void 0)}foo();', + 'code': 'function foo(){var a;a(void 0)}foo();', 'map': { 'mappings': 'AAAA,SAAS,KAAM,CAAE,IAAIA,EAAK,SAAc,AAAE,CAAC,KAAK', 'names': [ diff --git a/tasks/benchmark/benches/minifier.rs b/tasks/benchmark/benches/minifier.rs index 25c7b3180418a..b9bd1fb15b217 100644 --- a/tasks/benchmark/benches/minifier.rs +++ b/tasks/benchmark/benches/minifier.rs @@ -58,13 +58,10 @@ fn bench_mangler(criterion: &mut Criterion) { b.iter_with_setup_wrapper(|runner| { allocator.reset(); let program = Parser::new(&allocator, source_text, source_type).parse().program; - let (symbols, scopes) = SemanticBuilder::new() - .build(&program) - .semantic - .into_symbol_table_and_scope_tree(); + let semantic = + SemanticBuilder::new().with_scope_tree_child_ids(true).build(&program).semantic; runner.run(|| { - let _ = - Mangler::new().build_with_symbols_and_scopes(symbols, &scopes, &program); + let _ = Mangler::new().build_with_semantic(semantic, &program); }); }); }); diff --git a/tasks/minsize/minsize.snap b/tasks/minsize/minsize.snap index c49db0d51b92c..0513ae9b85a62 100644 --- a/tasks/minsize/minsize.snap +++ b/tasks/minsize/minsize.snap @@ -1,27 +1,27 @@ | Oxc | ESBuild | Oxc | ESBuild | Original | minified | minified | gzip | gzip | Fixture ------------------------------------------------------------------------------------- -72.14 kB | 
23.67 kB | 23.70 kB | 8.60 kB | 8.54 kB | react.development.js +72.14 kB | 23.61 kB | 23.70 kB | 8.55 kB | 8.54 kB | react.development.js -173.90 kB | 59.79 kB | 59.82 kB | 19.41 kB | 19.33 kB | moment.js +173.90 kB | 59.71 kB | 59.82 kB | 19.26 kB | 19.33 kB | moment.js -287.63 kB | 90.08 kB | 90.07 kB | 32.03 kB | 31.95 kB | jquery.js +287.63 kB | 89.58 kB | 90.07 kB | 31.08 kB | 31.95 kB | jquery.js -342.15 kB | 118.19 kB | 118.14 kB | 44.45 kB | 44.37 kB | vue.js +342.15 kB | 117.76 kB | 118.14 kB | 43.67 kB | 44.37 kB | vue.js -544.10 kB | 71.75 kB | 72.48 kB | 26.15 kB | 26.20 kB | lodash.js +544.10 kB | 71.50 kB | 72.48 kB | 25.92 kB | 26.20 kB | lodash.js -555.77 kB | 272.89 kB | 270.13 kB | 90.90 kB | 90.80 kB | d3.js +555.77 kB | 272.35 kB | 270.13 kB | 88.60 kB | 90.80 kB | d3.js -1.01 MB | 460.16 kB | 458.89 kB | 126.78 kB | 126.71 kB | bundle.min.js +1.01 MB | 458.28 kB | 458.89 kB | 123.94 kB | 126.71 kB | bundle.min.js -1.25 MB | 652.68 kB | 646.76 kB | 163.48 kB | 163.73 kB | three.js +1.25 MB | 650.82 kB | 646.76 kB | 161.51 kB | 163.73 kB | three.js -2.14 MB | 723.85 kB | 724.14 kB | 179.88 kB | 181.07 kB | victory.js +2.14 MB | 719.54 kB | 724.14 kB | 162.47 kB | 181.07 kB | victory.js -3.20 MB | 1.01 MB | 1.01 MB | 331.98 kB | 331.56 kB | echarts.js +3.20 MB | 1.01 MB | 1.01 MB | 325.40 kB | 331.56 kB | echarts.js -6.69 MB | 2.31 MB | 2.31 MB | 491.91 kB | 488.28 kB | antd.js +6.69 MB | 2.30 MB | 2.31 MB | 470.00 kB | 488.28 kB | antd.js -10.95 MB | 3.48 MB | 3.49 MB | 905.29 kB | 915.50 kB | typescript.js +10.95 MB | 3.37 MB | 3.49 MB | 866.68 kB | 915.50 kB | typescript.js