Skip to content

Commit

Permalink
use ConsecutiveStringStorage to dedup serialized literals
Browse files Browse the repository at this point in the history
Summary:
Serialized literals de-duping was quadratic in the total size of the
literals because it performed a linear search when adding every literal.

We can use ConsecutiveStringStorage (with small modifications) instead
to dedup the literals with much better performance. To that end, we need
an additional pass over the LIR to collect all literals, dedup them, and
remember their dedupped offsets.

Reviewed By: tmikov

Differential Revision: D18056232

fbshipit-source-id: 14df6f5c6868339dcb11a663ab2132916bf992f1
  • Loading branch information
Michael Anthony Leon authored and Riccardo Cipolleschi committed Mar 7, 2023
1 parent 1eb8f7e commit 62d58e5
Show file tree
Hide file tree
Showing 14 changed files with 497 additions and 306 deletions.
50 changes: 34 additions & 16 deletions include/hermes/BCGen/HBC/BytecodeGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,16 +270,18 @@ class BytecodeFunctionGenerator : public BytecodeInstructionGenerator {
/// This class is used by the hermes backend.
/// It wraps all data required to generate the module.
class BytecodeModuleGenerator {
public:
/// Offset(s) of a serialized literal. For an object literal this is a
/// (key offset, value offset) pair; for an array literal only the first
/// element is used.
using LiteralOffset = std::pair<uint32_t, uint32_t>;
/// Maps an instruction to the offset(s) of its serialized literal.
using LiteralOffsetMapTy = llvh::DenseMap<const Instruction *, LiteralOffset>;

private:
/// Mapping from Function * to a sequential ID.
AllocationTable<Function *> functionIDMap_{};

/// Mapping from Function * to its BytecodeFunctionGenerator *.
DenseMap<Function *, std::unique_ptr<BytecodeFunctionGenerator>>
functionGenerators_{};

/// Generate literals buffer for object/array.
SerializedLiteralGenerator literalGenerator_;

/// The mapping from strings to ID for strings in the resulting bytecode
/// module.
StringLiteralTable stringTable_{};
Expand Down Expand Up @@ -321,6 +323,11 @@ class BytecodeModuleGenerator {
/// They are stored as chars in order to shorten bytecode size
std::vector<unsigned char> objValBuffer_{};

/// A map from instruction to literal offset in the corresponding buffers:
/// \c arrayBuffer_, \c objKeyBuffer_, \c objValBuffer_.
/// This map is populated before instruction selection.
LiteralOffsetMapTy literalOffsetMap_{};

/// Options controlling bytecode generation.
BytecodeGenerationOptions options_;

Expand All @@ -343,8 +350,7 @@ class BytecodeModuleGenerator {
/// Constructor which enables optimizations if \p optimizationEnabled is set.
BytecodeModuleGenerator(
BytecodeGenerationOptions options = BytecodeGenerationOptions::defaults())
: literalGenerator_(*this, options.optimizationEnabled),
options_(options) {}
: options_(options) {}

/// Add a function to functionIDMap_ if not already exist. Returns the ID.
unsigned addFunction(Function *F);
Expand Down Expand Up @@ -390,6 +396,18 @@ class BytecodeModuleGenerator {
/// \return the index of the bigint in the table.
uint32_t addBigInt(bigint::ParsedBigInt bigint);

/// Set the serialized literal tables that this generator will use. Once set,
/// no further modifications are possible.
/// \param arrayBuffer buffer containing the serialized array literals.
/// \param keyBuffer buffer containing the keys of serialized object literals.
/// \param valBuffer buffer containing the values of serialized object
/// literals.
/// \param offsetMap map from instruction to the offset(s) of its serialized
/// literal in the buffers above.
void initializeSerializedLiterals(
std::vector<unsigned char> &&arrayBuffer,
std::vector<unsigned char> &&keyBuffer,
std::vector<unsigned char> &&valBuffer,
LiteralOffsetMapTy &&offsetMap);

/// Adds a compiled regexp to the module table.
/// \return the index of the regexp in the table.
uint32_t addRegExp(CompiledRegExp *regexp);
Expand All @@ -415,17 +433,6 @@ class BytecodeModuleGenerator {
/// \param stringID the index of the corresponding source in the string table.
void addFunctionSource(uint32_t functionID, uint32_t stringID);

/// Add \p elements to the array buffer.
/// \return the starting offset of the elements.
uint32_t addArrayBuffer(ArrayRef<Literal *> elements);

/// Add to the object buffer using \p keys as the array of keys, and
/// \p vals as the array of values.
/// \return a pair where the first value is the object's offset into the
/// key buffer, and the second value is its offset into the value buffer.
std::pair<uint32_t, uint32_t> addObjectBuffer(
ArrayRef<Literal *> keys,
ArrayRef<Literal *> vals);

/// Serializes the array of literals given into a compact char buffer.
/// The serialization format can be found in:
/// include/hermes/VM/SerializedLiteralParser.h
Expand All @@ -444,6 +451,17 @@ class BytecodeModuleGenerator {
std::vector<unsigned char> &buff,
bool isKeyBuffer);

/// For a given instruction \p inst that has an associated serialized literal,
/// obtain the offset of the literal in the associated buffer.
/// \param inst instruction whose literal offset was recorded in
///   \c literalOffsetMap_.
/// \return the literal's offsets. In case of an object literal, it is a pair
///   of offsets (key and value). In case of an array literal, only the first
///   offset is used.
LiteralOffset serializedLiteralOffsetFor(const Instruction *inst) {
  // A single find() replaces count() + operator[]: one hash lookup instead
  // of two, and a missing key is never default-inserted into the map when
  // asserts are compiled out.
  auto it = literalOffsetMap_.find(inst);
  assert(
      it != literalOffsetMap_.end() &&
      "instruction has no serialized literal");
  // With asserts disabled, a miss yields a value-initialized offset (the
  // value operator[] would have produced) without mutating the map.
  return it != literalOffsetMap_.end() ? it->second : LiteralOffset{};
}

/// \return a BytecodeModule.
std::unique_ptr<BytecodeModule> generate();
};
Expand Down
13 changes: 10 additions & 3 deletions include/hermes/BCGen/HBC/ConsecutiveStringStorage.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,21 @@ class ConsecutiveStringStorage {

/// Construct from a list of unique strings. Note that this is only
/// instantiated for a small number of different \p I types.
template <typename I>
ConsecutiveStringStorage(I begin, I end, bool optimize = false);
/// \param Force8Bit if set to std::true_type, indicates that the input
/// is *not* utf-8 encoded and consists of 8-bit bytes. If set to
/// std::false_type, the input is utf-8 encoded.
template <typename I, typename Force8Bit>
ConsecutiveStringStorage(I begin, I end, Force8Bit, bool optimize);

/// Construct from a list of unique strings.
explicit ConsecutiveStringStorage(
llvh::ArrayRef<llvh::StringRef> strings,
bool optimize = false)
: ConsecutiveStringStorage(strings.begin(), strings.end(), optimize) {}
: ConsecutiveStringStorage(
strings.begin(),
strings.end(),
std::false_type{},
optimize) {}

/// Construct from a table and storage.
ConsecutiveStringStorage(
Expand Down
9 changes: 3 additions & 6 deletions include/hermes/BCGen/HBC/SerializedLiteralGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,9 @@ class SerializedLiteralGenerator {
private:
/// The bytecode module generator.
BytecodeModuleGenerator &BMGen_;
/// Whether to perform de-duplication optimization or not.
bool deDuplicate_;

public:
SerializedLiteralGenerator(BytecodeModuleGenerator &BMGen, bool deDuplicate)
: BMGen_(BMGen), deDuplicate_(deDuplicate) {}
SerializedLiteralGenerator(BytecodeModuleGenerator &BMGen) : BMGen_(BMGen) {}

using TagType = unsigned char;

Expand All @@ -78,10 +75,10 @@ class SerializedLiteralGenerator {

static constexpr unsigned SequenceMax = (1 << 12) - 1;

/// Serialize input \p literals into \p buff.
/// Serialize input \p literals and append into \p buff.
/// \p isKeyBuffer: whether this is generating object literal key buffer or
/// not.
uint32_t serializeBuffer(
void serializeBuffer(
llvh::ArrayRef<Literal *> literals,
std::vector<unsigned char> &buff,
bool isKeyBuffer);
Expand Down
26 changes: 14 additions & 12 deletions lib/BCGen/HBC/BytecodeGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,6 @@ void BytecodeFunctionGenerator::setJumpTable(
jumpTable_ = std::move(jumpTable);
}

/// Serialize \p elements into \c arrayBuffer_ via the literal generator.
/// \return the value reported by the literal generator for the elements.
uint32_t BytecodeModuleGenerator::addArrayBuffer(ArrayRef<Literal *> elements) {
  const uint32_t startOffset = literalGenerator_.serializeBuffer(
      elements, arrayBuffer_, /* isKeyBuffer */ false);
  return startOffset;
}

/// Serialize \p keys into \c objKeyBuffer_ and \p vals into
/// \c objValBuffer_ via the literal generator.
/// \return a pair of (key buffer result, value buffer result) as reported by
///   the literal generator.
std::pair<uint32_t, uint32_t> BytecodeModuleGenerator::addObjectBuffer(
    ArrayRef<Literal *> keys,
    ArrayRef<Literal *> vals) {
  // Keys are serialized first, then values.
  const uint32_t keyOffset = literalGenerator_.serializeBuffer(
      keys, objKeyBuffer_, /* isKeyBuffer */ true);
  const uint32_t valOffset = literalGenerator_.serializeBuffer(
      vals, objValBuffer_, /* isKeyBuffer */ false);
  return {keyOffset, valOffset};
}

std::unique_ptr<BytecodeFunction>
BytecodeFunctionGenerator::generateBytecodeFunction(
Function::DefinitionKind definitionKind,
Expand Down Expand Up @@ -240,6 +228,20 @@ uint32_t BytecodeModuleGenerator::addBigInt(bigint::ParsedBigInt bigint) {
return bigIntTable_.addBigInt(std::move(bigint));
}

/// Take ownership of the pre-built serialized literal buffers and the
/// instruction-to-offset map. Asserts that none of the destination members
/// has been populated yet.
/// \param arrayBuffer moved into \c arrayBuffer_.
/// \param keyBuffer moved into \c objKeyBuffer_.
/// \param valBuffer moved into \c objValBuffer_.
/// \param offsetMap moved into \c literalOffsetMap_.
void BytecodeModuleGenerator::initializeSerializedLiterals(
    std::vector<unsigned char> &&arrayBuffer,
    std::vector<unsigned char> &&keyBuffer,
    std::vector<unsigned char> &&valBuffer,
    hermes::hbc::BytecodeModuleGenerator::LiteralOffsetMapTy &&offsetMap) {
  // All four destinations must still be empty before they are overwritten.
  const bool nothingSetYet = arrayBuffer_.empty() && objKeyBuffer_.empty() &&
      objValBuffer_.empty() && literalOffsetMap_.empty();
  assert(nothingSetYet && "serialized literals already initialized");
  (void)nothingSetYet;

  literalOffsetMap_ = std::move(offsetMap);
  objValBuffer_ = std::move(valBuffer);
  objKeyBuffer_ = std::move(keyBuffer);
  arrayBuffer_ = std::move(arrayBuffer);
}

uint32_t BytecodeModuleGenerator::addRegExp(CompiledRegExp *regexp) {
return regExpTable_.addRegExp(regexp);
}
Expand Down
20 changes: 15 additions & 5 deletions lib/BCGen/HBC/ConsecutiveStringStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -571,8 +571,8 @@ class StringTableBuilder {
/// and end. Note that we do not always copy the underlying string data so
/// the resulting builder must not outlive these strings. In delta
/// optimizing mode, only new strings are added here and packed.
template <typename I>
StringTableBuilder(I begin, I end) {
template <typename I, typename Force8Bit>
StringTableBuilder(I begin, I end, Force8Bit) {
// Generate and store a StringEntry for each string.
// Remember the index of each string in our StringEntry, so that we can
// later output the table in the correct order.
Expand All @@ -586,7 +586,7 @@ class StringTableBuilder {
static_assert(sizeof(str.data()[0]) == 1, "strings must be UTF8");
const unsigned char *begin = (const unsigned char *)str.data();
const unsigned char *end = begin + str.size();
if (isAllASCII(begin, end)) {
if (Force8Bit::value || isAllASCII(begin, end)) {
ArrayRef<unsigned char> astr(begin, end);
asciiStrings_.emplace_back(index, astr);
} else {
Expand Down Expand Up @@ -713,14 +713,15 @@ class StringTableBuilder {
namespace hermes {
namespace hbc {

template <typename I>
template <typename I, typename Force8Bit>
ConsecutiveStringStorage::ConsecutiveStringStorage(
I begin,
I end,
Force8Bit,
bool optimize) {
// Prepare to build our string table.
// Generate storage for our ASCII and u16 strings.
StringTableBuilder builder(begin, end);
StringTableBuilder builder(begin, end, Force8Bit{});
std::vector<unsigned char> asciiStorage;
std::vector<char16_t> u16Storage;
builder.packIntoStorage(&asciiStorage, &u16Storage, optimize);
Expand All @@ -741,16 +742,25 @@ ConsecutiveStringStorage::ConsecutiveStringStorage(
// Explicit instantiation: StringSetVector const iterators, utf-8 encoded
// input (Force8Bit = std::false_type).
template ConsecutiveStringStorage::ConsecutiveStringStorage(
StringSetVector::const_iterator begin,
StringSetVector::const_iterator end,
std::false_type,
bool optimize);

// Explicit instantiation: StringSetVector mutable iterators, utf-8 encoded
// input (Force8Bit = std::false_type).
template ConsecutiveStringStorage::ConsecutiveStringStorage(
StringSetVector::iterator begin,
StringSetVector::iterator end,
std::false_type,
bool optimize);

// Explicit instantiation: ArrayRef<StringRef> iterators, utf-8 encoded input
// (Force8Bit = std::false_type).
template ConsecutiveStringStorage::ConsecutiveStringStorage(
ArrayRef<llvh::StringRef>::const_iterator begin,
ArrayRef<llvh::StringRef>::const_iterator end,
std::false_type,
bool optimize);

// Explicit instantiation: StringSetVector const iterators where the input is
// not utf-8 encoded and consists of 8-bit bytes (Force8Bit = std::true_type).
template ConsecutiveStringStorage::ConsecutiveStringStorage(
StringSetVector::const_iterator begin,
StringSetVector::const_iterator end,
std::true_type,
bool optimize);

uint32_t ConsecutiveStringStorage::getEntryHash(size_t i) const {
Expand Down
Loading

0 comments on commit 62d58e5

Please sign in to comment.