From 62d58e5d6747da8b8ee9e18eb8d0f57469acf201 Mon Sep 17 00:00:00 2001 From: Michael Anthony Leon Date: Fri, 9 Dec 2022 13:03:31 -0800 Subject: [PATCH] use ConsecutiveStringStorage to dedup serialized literals Summary: Serialized literals de-duping was quadratic with the total size of the literals because it performed a linear search when adding every literal. We can use ConsecutiveStringStorage (with small modifications) instead to dedup the literals with much better performance. To that end, we need an additional pass over the LIR to collect all literals, dedup them, and remember their dedupped offsets. Reviewed By: tmikov Differential Revision: D18056232 fbshipit-source-id: 14df6f5c6868339dcb11a663ab2132916bf992f1 --- include/hermes/BCGen/HBC/BytecodeGenerator.h | 50 ++-- .../BCGen/HBC/ConsecutiveStringStorage.h | 13 +- .../BCGen/HBC/SerializedLiteralGenerator.h | 9 +- lib/BCGen/HBC/BytecodeGenerator.cpp | 26 +- lib/BCGen/HBC/ConsecutiveStringStorage.cpp | 20 +- lib/BCGen/HBC/HBC.cpp | 246 +++++++++++++++++- lib/BCGen/HBC/ISel.cpp | 25 +- lib/BCGen/HBC/SerializedLiteralGenerator.cpp | 34 +-- lib/BCGen/HBC/UniquingFilenameTable.cpp | 3 +- test/hermes/array-literal-large.js | 50 ++++ test/hermes/array-literal.js | 23 ++ unittests/BCGen/HBC.cpp | 127 ++++++--- unittests/VMRuntime/CMakeLists.txt | 1 - unittests/VMRuntime/ObjectBufferTest.cpp | 176 ------------- 14 files changed, 497 insertions(+), 306 deletions(-) create mode 100644 test/hermes/array-literal-large.js create mode 100644 test/hermes/array-literal.js delete mode 100644 unittests/VMRuntime/ObjectBufferTest.cpp diff --git a/include/hermes/BCGen/HBC/BytecodeGenerator.h b/include/hermes/BCGen/HBC/BytecodeGenerator.h index 4ced54a4c44..b82fd604553 100644 --- a/include/hermes/BCGen/HBC/BytecodeGenerator.h +++ b/include/hermes/BCGen/HBC/BytecodeGenerator.h @@ -270,6 +270,11 @@ class BytecodeFunctionGenerator : public BytecodeInstructionGenerator { /// This class is used by the hermes backend. /// It wraps all data required to generate the module. class BytecodeModuleGenerator { + public: + using LiteralOffset = std::pair; + using LiteralOffsetMapTy = llvh::DenseMap; + + private: /// Mapping from Function * to a sequential ID. AllocationTable functionIDMap_{}; @@ -277,9 +282,6 @@ class BytecodeModuleGenerator { DenseMap> functionGenerators_{}; - /// Generate literals buffer for object/array. - SerializedLiteralGenerator literalGenerator_; - /// The mapping from strings to ID for strings in the resulting bytecode /// module. StringLiteralTable stringTable_{}; @@ -321,6 +323,11 @@ class BytecodeModuleGenerator { /// They are stored as chars in order to shorten bytecode size std::vector objValBuffer_{}; + /// A map from instruction to literal offset in the corresponding buffers. + /// \c arrayBuffer_, \c objKeyBuffer_, \c objValBuffer_. + /// This map is populated before instruction selection. + LiteralOffsetMapTy literalOffsetMap_{}; + /// Options controlling bytecode generation. BytecodeGenerationOptions options_; @@ -343,8 +350,7 @@ class BytecodeModuleGenerator { /// Constructor which enables optimizations if \p optimizationEnabled is set. BytecodeModuleGenerator( BytecodeGenerationOptions options = BytecodeGenerationOptions::defaults()) - : literalGenerator_(*this, options.optimizationEnabled), - options_(options) {} + : options_(options) {} /// Add a function to functionIDMap_ if not already exist. Returns the ID. unsigned addFunction(Function *F); @@ -390,6 +396,18 @@ class BytecodeModuleGenerator { /// \return the index of the bigint in the table. uint32_t addBigInt(bigint::ParsedBigInt bigint); + /// Set the serialized literal tables that this generator will use. Once set, + /// no further modifications are possible. + /// \param arrayBuffer buffer containing the serialized array literals. + /// \param objBuffer buffer containing the keys of serialized object literals. + /// \param valBuffer buffer containing the values of serialized object + /// literals. + void initializeSerializedLiterals( + std::vector &&arrayBuffer, + std::vector &&keyBuffer, + std::vector &&valBuffer, + LiteralOffsetMapTy &&offsetMap); + /// Adds a compiled regexp to the module table. /// \return the index of the regexp in the table. uint32_t addRegExp(CompiledRegExp *regexp); @@ -415,17 +433,6 @@ class BytecodeModuleGenerator { /// \param stringID the index of the corresponding source in the string table. void addFunctionSource(uint32_t functionID, uint32_t stringID); - /// Returns the starting offset of the elements. - uint32_t addArrayBuffer(ArrayRef elements); - - /// Add to the the object buffer using \keys as the array of keys, and - /// \vals as the array of values. - /// Returns a pair where the first value is the object's offset into the - /// key buffer, and the second value is its offset into the value buffer. - std::pair addObjectBuffer( - ArrayRef keys, - ArrayRef vals); - /// Serializes the array of literals given into a compact char buffer. /// The serialization format can be found in: /// include/hermes/VM/SerializedLiteralParser.h @@ -444,6 +451,17 @@ class BytecodeModuleGenerator { std::vector &buff, bool isKeyBuffer); + /// For a given instruction \p inst that has an associated serialized literal, + /// obtain the offset of the literal in the associated buffer. In case of + /// an object literal, it is a pair of offsets (key and value). In case of + /// array literal, only the first offset is used. + LiteralOffset serializedLiteralOffsetFor(const Instruction *inst) { + assert( + literalOffsetMap_.count(inst) && + "instruction has no serialized literal"); + return literalOffsetMap_[inst]; + } + /// \return a BytecodeModule. std::unique_ptr generate(); }; diff --git a/include/hermes/BCGen/HBC/ConsecutiveStringStorage.h b/include/hermes/BCGen/HBC/ConsecutiveStringStorage.h index cad1c3c2d73..274225b13e5 100644 --- a/include/hermes/BCGen/HBC/ConsecutiveStringStorage.h +++ b/include/hermes/BCGen/HBC/ConsecutiveStringStorage.h @@ -74,14 +74,21 @@ class ConsecutiveStringStorage { /// Construct from a list of unique strings. Note that this is only /// instantiated for a small number of different \p I types. - template - ConsecutiveStringStorage(I begin, I end, bool optimize = false); + /// \param Force8Bit if set to std::true_type, indicates that the input + /// is *not* utf-8 encoded and consists of 8-bit bytes. If set to + /// std::false_type, the input is utf-8 encoded. + template + ConsecutiveStringStorage(I begin, I end, Force8Bit, bool optimize); /// Construct from a list of unique strings. explicit ConsecutiveStringStorage( llvh::ArrayRef strings, bool optimize = false) - : ConsecutiveStringStorage(strings.begin(), strings.end(), optimize) {} + : ConsecutiveStringStorage( + strings.begin(), + strings.end(), + std::false_type{}, + optimize) {} /// Construct from a table and storage. ConsecutiveStringStorage( diff --git a/include/hermes/BCGen/HBC/SerializedLiteralGenerator.h b/include/hermes/BCGen/HBC/SerializedLiteralGenerator.h index b683fd20d66..7b02e122fa8 100644 --- a/include/hermes/BCGen/HBC/SerializedLiteralGenerator.h +++ b/include/hermes/BCGen/HBC/SerializedLiteralGenerator.h @@ -55,12 +55,9 @@ class SerializedLiteralGenerator { private: /// The bytecode module generator. BytecodeModuleGenerator &BMGen_; - /// Whether to perform de-duplication optimization or not. - bool deDuplicate_; public: - SerializedLiteralGenerator(BytecodeModuleGenerator &BMGen, bool deDuplicate) - : BMGen_(BMGen), deDuplicate_(deDuplicate) {} + SerializedLiteralGenerator(BytecodeModuleGenerator &BMGen) : BMGen_(BMGen) {} using TagType = unsigned char; @@ -78,10 +75,10 @@ class SerializedLiteralGenerator { static constexpr unsigned SequenceMax = (1 << 12) - 1; - /// Serialize input \p literals into \p buff. + /// Serialize input \p literals and append into \p buff. /// \p isKeyBuffer: whether this is generating object literal key buffer or /// not. - uint32_t serializeBuffer( + void serializeBuffer( llvh::ArrayRef literals, std::vector &buff, bool isKeyBuffer); diff --git a/lib/BCGen/HBC/BytecodeGenerator.cpp b/lib/BCGen/HBC/BytecodeGenerator.cpp index 40916cb6355..b441adac8ef 100644 --- a/lib/BCGen/HBC/BytecodeGenerator.cpp +++ b/lib/BCGen/HBC/BytecodeGenerator.cpp @@ -80,18 +80,6 @@ void BytecodeFunctionGenerator::setJumpTable( jumpTable_ = std::move(jumpTable); } -uint32_t BytecodeModuleGenerator::addArrayBuffer(ArrayRef elements) { - return literalGenerator_.serializeBuffer(elements, arrayBuffer_, false); -} - -std::pair BytecodeModuleGenerator::addObjectBuffer( - ArrayRef keys, - ArrayRef vals) { - return std::pair{ - literalGenerator_.serializeBuffer(keys, objKeyBuffer_, true), - literalGenerator_.serializeBuffer(vals, objValBuffer_, false)}; -} - std::unique_ptr BytecodeFunctionGenerator::generateBytecodeFunction( Function::DefinitionKind definitionKind, @@ -240,6 +228,20 @@ uint32_t BytecodeModuleGenerator::addBigInt(bigint::ParsedBigInt bigint) { return bigIntTable_.addBigInt(std::move(bigint)); } +void BytecodeModuleGenerator::initializeSerializedLiterals( + std::vector &&arrayBuffer, + std::vector &&keyBuffer, + std::vector &&valBuffer, + hermes::hbc::BytecodeModuleGenerator::LiteralOffsetMapTy &&offsetMap) { + assert( + arrayBuffer_.empty() && objKeyBuffer_.empty() && objValBuffer_.empty() && + literalOffsetMap_.empty() && "serialized literals already initialized"); + arrayBuffer_ = std::move(arrayBuffer); + objKeyBuffer_ = std::move(keyBuffer); + objValBuffer_ = std::move(valBuffer); + literalOffsetMap_ = std::move(offsetMap); +} + uint32_t BytecodeModuleGenerator::addRegExp(CompiledRegExp *regexp) { return regExpTable_.addRegExp(regexp); } diff --git a/lib/BCGen/HBC/ConsecutiveStringStorage.cpp b/lib/BCGen/HBC/ConsecutiveStringStorage.cpp index c599ebf810e..107b568d4b2 100644 --- a/lib/BCGen/HBC/ConsecutiveStringStorage.cpp +++ b/lib/BCGen/HBC/ConsecutiveStringStorage.cpp @@ -571,8 +571,8 @@ class StringTableBuilder { /// and end. Note that we do not always copy the underlying string data so /// the resulting builder must not outlive these strings. In delta /// optimizing mode, only new strings are added here and packed. - template - StringTableBuilder(I begin, I end) { + template + StringTableBuilder(I begin, I end, Force8Bit) { // Generate and store a StringEntry for each string. // Remember the index of each string in our StringEntry, so that we can // later output the table in the correct order. @@ -586,7 +586,7 @@ class StringTableBuilder { static_assert(sizeof(str.data()[0]) == 1, "strings must be UTF8"); const unsigned char *begin = (const unsigned char *)str.data(); const unsigned char *end = begin + str.size(); - if (isAllASCII(begin, end)) { + if (Force8Bit::value || isAllASCII(begin, end)) { ArrayRef astr(begin, end); asciiStrings_.emplace_back(index, astr); } else { @@ -713,14 +713,15 @@ class StringTableBuilder { namespace hermes { namespace hbc { -template +template ConsecutiveStringStorage::ConsecutiveStringStorage( I begin, I end, + Force8Bit, bool optimize) { // Prepare to build our string table. // Generate storage for our ASCII and u16 strings. - StringTableBuilder builder(begin, end); + StringTableBuilder builder(begin, end, Force8Bit{}); std::vector asciiStorage; std::vector u16Storage; builder.packIntoStorage(&asciiStorage, &u16Storage, optimize); @@ -741,16 +742,25 @@ ConsecutiveStringStorage::ConsecutiveStringStorage( template ConsecutiveStringStorage::ConsecutiveStringStorage( StringSetVector::const_iterator begin, StringSetVector::const_iterator end, + std::false_type, bool optimize); template ConsecutiveStringStorage::ConsecutiveStringStorage( StringSetVector::iterator begin, StringSetVector::iterator end, + std::false_type, bool optimize); template ConsecutiveStringStorage::ConsecutiveStringStorage( ArrayRef::const_iterator begin, ArrayRef::const_iterator end, + std::false_type, + bool optimize); + +template ConsecutiveStringStorage::ConsecutiveStringStorage( + StringSetVector::const_iterator begin, + StringSetVector::const_iterator end, + std::true_type, bool optimize); uint32_t ConsecutiveStringStorage::getEntryHash(size_t i) const { diff --git a/lib/BCGen/HBC/HBC.cpp b/lib/BCGen/HBC/HBC.cpp index 0c75f8c26d3..fbf3fd1e8fe 100644 --- a/lib/BCGen/HBC/HBC.cpp +++ b/lib/BCGen/HBC/HBC.cpp @@ -138,7 +138,244 @@ UniquingStringLiteralAccumulator stringAccumulatorFromBCProvider( std::move(css), std::move(isIdentifier)}; } -} // namespace +/// A container that deduplicates byte sequences, while keeping the original +/// insertion order with the duplicates. Note: strings are used as a +/// representation for code reuse and simplicity, but the contents are meant to +/// be interpreted as unsigned bytes. +/// It maintains: +/// - a StringSetVector containing the uniqued strings. +/// - a vector mapping each originally inserted string in order to an index in +/// the StringSetVector. +/// This is a specialized class used only by \c LiteralBufferBuilder. +/// NOTE: We use std::string instead of std::vector for code reuse. +class UniquedStringVector { + public: + /// Append a string. + void push_back(llvh::StringRef str) { + indexInSet_.push_back(set_.insert(str)); + } + + /// \return how many strings the vector contains, in other words, how many + /// times \c push_back() was called. + size_t size() const { + return indexInSet_.size(); + } + + /// \return the begin iterator over the uniqued set of strings in insertion + /// order. + StringSetVector::const_iterator beginSet() const { + return set_.begin(); + } + /// \return the end iterator over the uniqued set of strings in insertion + /// order. + StringSetVector::const_iterator endSet() const { + return set_.end(); + } + + /// \return the index in the uniqued set corresponding to the insertion index + /// \p insertionIndex. + uint32_t indexInSet(size_t insertionIndex) const { + return indexInSet_[insertionIndex]; + } + + private: + /// The uniqued string set in insertion order. + StringSetVector set_{}; + /// Index into the set of each original non-deduplicated string in insertion + /// order. + std::vector indexInSet_{}; +}; + +/// A utility class which collects all serialized literals from a module, +/// optionally deduplicates them, and installs them in the +/// BytecodeModuleGenerator. +class LiteralBufferBuilder { + public: + /// Constructor. + /// \param m the IR module to process. + /// \param shouldVisitFunction a predicate indicating whether a function + /// should be processed or not. (In some cases like segment splitting we + /// want to exclude part of the module.) + /// \param bmGen the BytecodeModuleGenerator to use. + /// \param optimize whether to deduplicate the serialized literals. + LiteralBufferBuilder( + Module *m, + const std::function &shouldVisitFunction, + BytecodeModuleGenerator &bmGen, + bool optimize) + : M_(m), + shouldVisitFunction_(shouldVisitFunction), + bmGen_(bmGen), + optimize_(optimize), + literalGenerator_(bmGen) {} + + /// Do everything: collect the literals, optionally deduplicate them, install + /// them in the BytecodeModuleGenerator. + void generate(); + + private: + /// Traverse the module, skipping functions that should not be visited, + /// and collect all serialized array and object literals and the corresponding + /// instruction. + void traverse(); + + // Serialization handlers for different instructions. + + void serializeLiteralFor(AllocArrayInst *AAI); + void serializeLiteralFor(HBCAllocObjectFromBufferInst *AOFB); + + /// Serialize the the input literals \p elements into the UniquedStringVector + /// \p dest. + /// \p isKeyBuffer: whether this is generating object literal key buffer or + /// not. + void serializeInto( + UniquedStringVector &dest, + llvh::ArrayRef elements, + bool isKeyBuffer); + + private: + /// The IR module to process. + Module *const M_; + /// A predicate indicating whether a function should be processed or not. (In + /// some cases like segment splitting we want to exclude part of the module.) + const std::function &shouldVisitFunction_; + /// The BytecodeModuleGenerator to use. + BytecodeModuleGenerator &bmGen_; + /// Whether to deduplicate the serialized literals. + bool const optimize_; + + /// The stateless generator object. + SerializedLiteralGenerator literalGenerator_; + + /// Temporary buffer to serialize literals into. We keep it around instead + /// of allocating a new one every time. + std::vector tempBuffer_{}; + + /// Each element is a serialized array literal. + UniquedStringVector arrays_{}; + + /// Each element records the instruction whose literal was serialized at the + /// corresponding index in \c arrays_. + std::vector arraysInst_{}; + + /// Each element is the keys portion of a serialized object literal. + UniquedStringVector objKeys_{}; + + /// Each element is the values portion of a serialized object literal. + UniquedStringVector objVals_{}; + + /// Each element records the instruction whose literal was serialized at the + /// corresponding index in \c objKeys_/objVals_. + std::vector objInst_{}; +}; + +void LiteralBufferBuilder::generate() { + traverse(); + + // Construct the serialized storage, optionally optimizing it. + ConsecutiveStringStorage arrayStorage{ + arrays_.beginSet(), arrays_.endSet(), std::true_type{}, optimize_}; + ConsecutiveStringStorage keyStorage{ + objKeys_.beginSet(), objKeys_.endSet(), std::true_type{}, optimize_}; + ConsecutiveStringStorage valStorage{ + objVals_.beginSet(), objVals_.endSet(), std::true_type{}, optimize_}; + + // Populate the offset map. + BytecodeModuleGenerator::LiteralOffsetMapTy literalOffsetMap{}; + + // Visit all array literals. + auto arrayView = const_cast(arrayStorage) + .getStringTableView(); + for (size_t i = 0, e = arraysInst_.size(); i != e; ++i) { + assert( + literalOffsetMap.count(arraysInst_[i]) == 0 && + "instruction literal can't be serialized twice"); + uint32_t arrayIndexInSet = arrays_.indexInSet(i); + literalOffsetMap[arraysInst_[i]] = BytecodeModuleGenerator::LiteralOffset{ + arrayView[arrayIndexInSet].getOffset(), UINT32_MAX}; + } + + // Visit all object literals - they are split in two buffers. + auto keyView = const_cast(keyStorage) + .getStringTableView(); + auto valView = const_cast(valStorage) + .getStringTableView(); + for (size_t i = 0, e = objInst_.size(); i != e; ++i) { + assert( + literalOffsetMap.count(objInst_[i]) == 0 && + "instruction literal can't be serialized twice"); + uint32_t keyIndexInSet = objKeys_.indexInSet(i); + uint32_t valIndexInSet = objVals_.indexInSet(i); + literalOffsetMap[objInst_[i]] = BytecodeModuleGenerator::LiteralOffset{ + keyView[keyIndexInSet].getOffset(), valView[valIndexInSet].getOffset()}; + } + + bmGen_.initializeSerializedLiterals( + arrayStorage.acquireStringStorage(), + keyStorage.acquireStringStorage(), + valStorage.acquireStringStorage(), + std::move(literalOffsetMap)); +} + +void LiteralBufferBuilder::traverse() { + for (auto &F : *M_) { + if (!shouldVisitFunction_(&F)) + continue; + + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *AAI = dyn_cast(&I)) { + serializeLiteralFor(AAI); + } else if (auto *AOFB = dyn_cast(&I)) { + serializeLiteralFor(AOFB); + } + } + } + } +} + +void LiteralBufferBuilder::serializeInto( + UniquedStringVector &dest, + llvh::ArrayRef elements, + bool isKeyBuffer) { + tempBuffer_.clear(); + literalGenerator_.serializeBuffer(elements, tempBuffer_, isKeyBuffer); + dest.push_back( + llvh::StringRef((const char *)tempBuffer_.data(), tempBuffer_.size())); +} + +void LiteralBufferBuilder::serializeLiteralFor(AllocArrayInst *AAI) { + SmallVector elements; + for (unsigned i = 0, e = AAI->getElementCount(); i < e; ++i) { + elements.push_back(cast(AAI->getArrayElement(i))); + } + + serializeInto(arrays_, elements, false); + arraysInst_.push_back(AAI); +} + +void LiteralBufferBuilder::serializeLiteralFor( + HBCAllocObjectFromBufferInst *AOFB) { + unsigned e = AOFB->getKeyValuePairCount(); + if (!e) + return; + + SmallVector objKeys; + SmallVector objVals; + for (unsigned ind = 0; ind != e; ++ind) { + auto keyValuePair = AOFB->getKeyValuePair(ind); + objKeys.push_back(cast(keyValuePair.first)); + objVals.push_back(cast(keyValuePair.second)); + } + + serializeInto(objKeys_, objKeys, true); + serializeInto(objVals_, objVals, false); + assert( + objKeys_.size() == objVals_.size() && + "objKeys_ and objVals_ must be the same size"); + objInst_.push_back(AOFB); +} +}; // namespace std::unique_ptr hbc::generateBytecodeModule( Module *M, @@ -222,6 +459,13 @@ std::unique_ptr hbc::generateBytecodeModule( std::move(strings), options.optimizationEnabled)); } + // Generate the serialized literal buffers. + { + LiteralBufferBuilder litBuilder{ + M, shouldGenerate, BMGen, options.optimizationEnabled}; + litBuilder.generate(); + } + // Add each function to BMGen so that each function has a unique ID. for (auto &F : *M) { if (!shouldGenerate(&F)) { diff --git a/lib/BCGen/HBC/ISel.cpp b/lib/BCGen/HBC/ISel.cpp index 122a75a2b09..425fc682697 100644 --- a/lib/BCGen/HBC/ISel.cpp +++ b/lib/BCGen/HBC/ISel.cpp @@ -783,18 +783,13 @@ void HBCISel::generateAllocArrayInst(AllocArrayInst *Inst, BasicBlock *next) { if (elementCount == 0) { BCFGen_->emitNewArray(dstReg, sizeHint); } else { - SmallVector elements; - for (unsigned i = 0, e = Inst->getElementCount(); i < e; ++i) { - elements.push_back(cast(Inst->getArrayElement(i))); - } - auto bufIndex = - BCFGen_->BMGen_.addArrayBuffer(ArrayRef{elements}); - if (bufIndex <= UINT16_MAX) { + auto bufIndex = BCFGen_->BMGen_.serializedLiteralOffsetFor(Inst); + if (bufIndex.first <= UINT16_MAX) { BCFGen_->emitNewArrayWithBuffer( - encodeValue(Inst), sizeHint, elementCount, bufIndex); + encodeValue(Inst), sizeHint, elementCount, bufIndex.first); } else { BCFGen_->emitNewArrayWithBufferLong( - encodeValue(Inst), sizeHint, elementCount, bufIndex); + encodeValue(Inst), sizeHint, elementCount, bufIndex.first); } } } @@ -843,21 +838,13 @@ void HBCISel::generateHBCAllocObjectFromBufferInst( HBCAllocObjectFromBufferInst *Inst, BasicBlock *next) { auto result = encodeValue(Inst); - int e = Inst->getKeyValuePairCount(); - SmallVector objKeys; - SmallVector objVals; - for (int ind = 0; ind < e; ind++) { - auto keyValuePair = Inst->getKeyValuePair(ind); - objKeys.push_back(cast(keyValuePair.first)); - objVals.push_back(cast(keyValuePair.second)); - } + unsigned e = Inst->getKeyValuePairCount(); // size hint operand of NewObjectWithBuffer opcode is 16-bit. uint32_t sizeHint = std::min((uint32_t)UINT16_MAX, Inst->getSizeHint()->asUInt32()); - auto buffIdxs = BCFGen_->BMGen_.addObjectBuffer( - llvh::ArrayRef{objKeys}, llvh::ArrayRef{objVals}); + auto buffIdxs = BCFGen_->BMGen_.serializedLiteralOffsetFor(Inst); if (buffIdxs.first <= UINT16_MAX && buffIdxs.second <= UINT16_MAX) { BCFGen_->emitNewObjectWithBuffer( result, sizeHint, e, buffIdxs.first, buffIdxs.second); diff --git a/lib/BCGen/HBC/SerializedLiteralGenerator.cpp b/lib/BCGen/HBC/SerializedLiteralGenerator.cpp index cbb5d93b4ea..78a030350d6 100644 --- a/lib/BCGen/HBC/SerializedLiteralGenerator.cpp +++ b/lib/BCGen/HBC/SerializedLiteralGenerator.cpp @@ -43,7 +43,7 @@ void serializeValueToBuffer( } } // namespace -uint32_t SerializedLiteralGenerator::serializeBuffer( +void SerializedLiteralGenerator::serializeBuffer( llvh::ArrayRef literals, std::vector &buff, bool isKeyBuffer) { @@ -57,12 +57,6 @@ uint32_t SerializedLiteralGenerator::serializeBuffer( // they can be added to tempBuff after the tag is finalized std::vector tmpSeqBuffer; - // Store the constructed buffer in a separate vector. - // This vector will be searched for in \buff. If an exact match - // occurs, \buff will not be modified, and the match's offset will be - // returned. - std::vector tempBuff; - // Stores the length of the current type sequence size_t seqLength = 0; @@ -111,9 +105,8 @@ uint32_t SerializedLiteralGenerator::serializeBuffer( if (newTag != lastTag || seqLength == SequenceMax) { if (seqLength > 0) { - appendTagToBuffer(tempBuff, lastTag, seqLength); - tempBuff.insert( - tempBuff.end(), tmpSeqBuffer.begin(), tmpSeqBuffer.end()); + appendTagToBuffer(buff, lastTag, seqLength); + buff.insert(buff.end(), tmpSeqBuffer.begin(), tmpSeqBuffer.end()); tmpSeqBuffer.resize(0); } lastTag = newTag; @@ -159,25 +152,8 @@ uint32_t SerializedLiteralGenerator::serializeBuffer( } } // The last value in the buffer can't get serialized in the loop. - appendTagToBuffer(tempBuff, lastTag, seqLength); - tempBuff.insert(tempBuff.end(), tmpSeqBuffer.begin(), tmpSeqBuffer.end()); - - // If this array buffer has already been added, potentially as a substring of - // another, we can just point there instead. This simple search gives a nice - // little space saving, but at a quadratic cost (fast in practice though). - if (deDuplicate_) { - auto it = - std::search(buff.begin(), buff.end(), tempBuff.begin(), tempBuff.end()); - - if (it != buff.end()) { - return it - buff.begin(); - } - } - - // If it doesn't exist or we don't optimize, append it and return its offset. - uint32_t ret = buff.size(); - buff.insert(buff.end(), tempBuff.begin(), tempBuff.end()); - return ret; + appendTagToBuffer(buff, lastTag, seqLength); + buff.insert(buff.end(), tmpSeqBuffer.begin(), tmpSeqBuffer.end()); } } // namespace hbc diff --git a/lib/BCGen/HBC/UniquingFilenameTable.cpp b/lib/BCGen/HBC/UniquingFilenameTable.cpp index 0805a73ec10..e5d020e9584 100644 --- a/lib/BCGen/HBC/UniquingFilenameTable.cpp +++ b/lib/BCGen/HBC/UniquingFilenameTable.cpp @@ -19,7 +19,8 @@ uint32_t UniquingFilenameTable::addFilename(llvh::StringRef filename) { /* static */ ConsecutiveStringStorage UniquingFilenameTable::toStorage( UniquingFilenameTable table) { auto &filenames = table.filenames_; - return ConsecutiveStringStorage{filenames.begin(), filenames.end()}; + return ConsecutiveStringStorage{ + filenames.begin(), filenames.end(), std::false_type{}, false}; } } // namespace hbc diff --git a/test/hermes/array-literal-large.js b/test/hermes/array-literal-large.js new file mode 100644 index 00000000000..97211ecf80b --- /dev/null +++ b/test/hermes/array-literal-large.js @@ -0,0 +1,50 @@ +/** + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +// RUN: %hermes %s > %t.js && %hermes -O %t.js -gc-sanitize-handles=0 +// REQUIRES: !slow_debug + +var numElems = 0; +var totalElems = 200; +var nextRecurse = 10; + +function makeArrayStr(maxSz){ + var arrStr = '['; + for (let i = 0; i < maxSz; i++) { + if (numElems > totalElems){ + break; + } + numElems++; + if (numElems % nextRecurse == 0){ + rnd = Math.floor(10 + Math.random() * 10); + arrStr += makeArrayStr(rnd) + ','; + } + arrStr += Math.floor(1 + Math.random() * 200) + ','; + } + arrStr += ']'; + return arrStr; +} + +function makeAssignVar(name){ + numElems = 0; + code = 'var ' + name + ' = ' + code += makeArrayStr(totalElems); + code += ';\n'; + return code +} + +function makeTest(){ + var code = ''; + var N = 4000; + for (var i = 0; i < N; i++){ + code += makeAssignVar('arr' + i); + } + return code; +} + +print(makeTest()) diff --git a/test/hermes/array-literal.js b/test/hermes/array-literal.js new file mode 100644 index 00000000000..d20028d66d9 --- /dev/null +++ b/test/hermes/array-literal.js @@ -0,0 +1,23 @@ +/** + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// RUN: %hermes -O -target=HBC %s | %FileCheck --match-full-lines %s +// RUN: %hermes -O -target=HBC -emit-binary -out %t.hbc %s && %hermes %t.hbc | %FileCheck --match-full-lines %s +// Test that literal arrays work. +"use strict"; + +print('Array Literal'); +// CHECK-LABEL: Array Literal +var arr = [3, 5, null, true, false]; +print(arr.__proto__.constructor === Array); +// CHECK-NEXT: true +print(arr.toString()); +// CHECK-NEXT: 3,5,,true,false +print(arr[0]); +// CHECK-NEXT: 3 +print(arr[4]); +// CHECK-NEXT: false diff --git a/unittests/BCGen/HBC.cpp b/unittests/BCGen/HBC.cpp index f0a3e26e1c6..21d6a525763 100644 --- a/unittests/BCGen/HBC.cpp +++ b/unittests/BCGen/HBC.cpp @@ -317,48 +317,101 @@ TEST(HBCBytecodeGen, ExceptionTableTest) { } TEST(HBCBytecodeGen, ArrayBufferTest) { - // Since deserialization of the array buffer now requires a codeblock, - // the only thing that can be checked at BCGen time is that it uses - // the proper number of bytes for the serialization format. - std::string Result; - llvh::raw_string_ostream OS(Result); - - auto Ctx = std::make_shared(); - Module M(Ctx); - IRBuilder Builder(&M); - - BytecodeModuleGenerator BMG; - BMG.initializeStringTable(stringsForTest({"abc"})); - - Function *F = Builder.createTopLevelFunction( - M.getInitialScope()->createInnerScope(), true); - auto BFG = BytecodeFunctionGenerator::create(BMG, 3); - BFG->emitMov(1, 2); - std::vector arr1{ - Builder.getLiteralNumber(1), - Builder.getLiteralBool(true), - Builder.getLiteralBool(false), - Builder.getLiteralNull(), - Builder.getLiteralNull(), - Builder.getLiteralString("abc"), - }; - BMG.addArrayBuffer(llvh::ArrayRef{arr1}); + auto src = R"( +var arr = [1, true, false, null, null, 'abc'] +)"; + auto BM = bytecodeModuleForSource(src); + ASSERT_EQ(BM->getArrayBufferSize(), 10u); +} - BMG.setEntryPointIndex(BMG.addFunction(F)); - BMG.setFunctionGenerator(F, std::move(BFG)); +TEST(HBCBytecodeGen, ObjectBufferTest) { + auto src = R"( +var obj = {a:1, b:2, c:3}; +)"; + auto BM = bytecodeModuleForSource(src); + ASSERT_EQ(BM->getObjectKeyBufferSize(), 4u); + ASSERT_EQ(BM->getObjectValueBufferSize(), 13u); +} - std::shared_ptr BM = BMG.generate(); - // auto &BF = BM->getGlobalCode(); - ASSERT_EQ(BM->getArrayBufferSize(), 10u); +// For the following 'easy' tests, exact duplicate literals are used. These will +// always be de-duplicated, so we don't have to turn optimizations on. +TEST(HBCBytecodeGen, EasyArrayDedupBufferTest) { + auto singleArrCode = R"( +var arr = [1, null, undefined, 9n, 'hi', {}, Symbol('sym')]; +)"; + auto dupArrCode = R"( +var arr1 = [1, null, undefined, 9n, 'hi', {}, Symbol('sym')]; +var arr2 = [1, null, undefined, 9n, 'hi', {}, Symbol('sym')]; +var arr3 = [1, null, undefined, 9n, 'hi', {}, Symbol('sym')]; +var arr4 = [1, null, undefined, 9n, 'hi', {}, Symbol('sym')]; +var s = arr1[0] + arr2[0] + arr3[0] + arr4[0]; +)"; + auto singleArrBM = bytecodeModuleForSource(singleArrCode); + auto manyArrBM = bytecodeModuleForSource(dupArrCode); + // If de-duplication is working correctly, the amount of space that a single + // array literal takes up should be the same amount of space that N identical + // array literals take up. + ASSERT_EQ(singleArrBM->getArrayBufferSize(), manyArrBM->getArrayBufferSize()); +} - BytecodeSerializer BS{OS}; - BS.serialize(*BM, SHA1{}); +TEST(HBCBytecodeGen, EasyObjectDedupBufferTest) { + auto singleObjCode = R"( +var obj = {a:1, b:null, c:undefined, d:9n, e:'hi', f:{}, g:Symbol('sym')}; +)"; + auto dupObjsCode = R"( +var obj1 = {a:1, b:null, c:undefined, d:9n, e:'hi', f:{}, g:Symbol('sym')}; +var obj2 = {a:1, b:null, c:undefined, d:9n, e:'hi', f:{}, g:Symbol('sym')}; +var obj3 = {a:1, b:null, c:undefined, d:9n, e:'hi', f:{}, g:Symbol('sym')}; +var obj4 = {a:1, b:null, c:undefined, d:9n, e:'hi', f:{}, g:Symbol('sym')}; +var s = obj1.a + obj2.a + obj3.a + obj4.a; +)"; + auto singleObjBM = bytecodeModuleForSource(singleObjCode); + auto dedupBM = bytecodeModuleForSource(dupObjsCode); + // If de-duplication is working correctly, the amount of space that a single + // object literal takes up should be the same amount of space that N identical + // object literals take up. + ASSERT_EQ( + singleObjBM->getObjectKeyBufferSize(), dedupBM->getObjectKeyBufferSize()); + ASSERT_EQ( + singleObjBM->getObjectValueBufferSize(), + dedupBM->getObjectValueBufferSize()); +} - auto bytecode = hbc::BCProviderFromBuffer::createBCProviderFromBuffer( - std::make_unique(OS.str())) - .first; +// For the next 'hard' tests, the literals are not exact duplicates. If +// optimizations are off, we will simply give up there and not perform any +// de-duplications. However, if optimizations are on, then we perform a more +// advanced algorithm for de-duplicating, which will result in a smaller buffer +// size. +TEST(HBCBytecodeGen, HardArrayDedupBufferTest) { + auto almostDupCode = R"( +var arr1 = [false, true, null, 'hi', null, true, false, 'bye']; +var arr2 = ['diff', 4, 5, 6, false, true, null, 'hi', null, true, false, 'bye']; +var s = arr1[0] + arr2[1]; +)"; + auto dupBM = bytecodeModuleForSource(almostDupCode); + auto dedupOpts = BytecodeGenerationOptions::defaults(); + dedupOpts.optimizationEnabled = true; + auto dedupBM = bytecodeModuleForSource(almostDupCode, dedupOpts); + // The bytecode module which performed optimizations should have a smaller + // buffer size than the one that didn't. + ASSERT_LT(dedupBM->getArrayBufferSize(), dupBM->getArrayBufferSize()); +} - ASSERT_EQ(bytecode->getArrayBuffer().size(), 10u); +TEST(HBCBytecodeGen, HardObjectDedupBufferTest) { + auto almostDupCode = R"( +var obj1 = {a:10,b:11,c:12,1:null,2:true,3:false}; +var obj2 = {a:10,b:11,c:12}; +var s = obj1.a + obj2.a; +)"; + auto dupBM = bytecodeModuleForSource(almostDupCode); + auto dedupOpts = BytecodeGenerationOptions::defaults(); + dedupOpts.optimizationEnabled = true; + auto dedupBM = bytecodeModuleForSource(almostDupCode, dedupOpts); + // The bytecode module which performed optimizations should have a smaller + // buffer size than the one that didn't. + ASSERT_LT(dedupBM->getObjectKeyBufferSize(), dupBM->getObjectKeyBufferSize()); + ASSERT_LT( + dedupBM->getObjectValueBufferSize(), dupBM->getObjectValueBufferSize()); } TEST(SpillRegisterTest, SpillsParameters) { diff --git a/unittests/VMRuntime/CMakeLists.txt b/unittests/VMRuntime/CMakeLists.txt index 345f113d779..6aa3f353e41 100644 --- a/unittests/VMRuntime/CMakeLists.txt +++ b/unittests/VMRuntime/CMakeLists.txt @@ -52,7 +52,6 @@ set(RTSources NativeFunctionTest.cpp NativeStateTest.cpp MarkBitArrayNCTest.cpp - ObjectBufferTest.cpp ObjectModelTest.cpp OperationsTest.cpp PredefinedStringsTest.cpp diff --git a/unittests/VMRuntime/ObjectBufferTest.cpp b/unittests/VMRuntime/ObjectBufferTest.cpp deleted file mode 100644 index 3cc2d49453a..00000000000 --- a/unittests/VMRuntime/ObjectBufferTest.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#define DEBUG_TYPE "hbc-unittests" - -#include "TestHelpers.h" - -#include "hermes/BCGen/HBC/BytecodeGenerator.h" -#include "hermes/IR/IR.h" -#include "hermes/IR/IRBuilder.h" -#include "hermes/VM/CodeBlock.h" -#include "hermes/VM/Domain.h" -#include "hermes/VM/Operations.h" -#include "hermes/VM/Runtime.h" - -#include "gtest/gtest.h" - -#include "llvh/Support/raw_ostream.h" - -using namespace hermes::vm; -using namespace hermes::hbc; - -namespace { - -using ObjectBufferTest = RuntimeTestFixture; - -TEST_F(ObjectBufferTest, TestNewObjectWithBuffer) { - auto Ctx = std::make_shared(); - hermes::Module M(Ctx); - hermes::IRBuilder Builder(&M); - - /* - ; Store the object outlined below: - ; obj = { - ; 'a': false - ; 'b': true - ; 'c': null - ; 'd': "foo" - ; 'e': {"bar": null} - ; } - ; First five values are added through NewObjectWithBuffer - ; while the sixth value is added through PutNewOwnById. - ; This test makes sure that each value is correctly stored - ; and that two objects can correctly use the same buffers. - */ - - const unsigned FRAME_SIZE = 4; - BytecodeModuleGenerator BMG; - - { // Make BMG aware of the strings it needs. - UniquingStringLiteralAccumulator strings; - - // Name of the top-level function. - strings.addString("global", /* isIdentifier */ false); - - // Strings used in the test. - strings.addString("a", /* isIdentifier */ true); - strings.addString("b", /* isIdentifier */ true); - strings.addString("c", /* isIdentifier */ true); - strings.addString("d", /* isIdentifier */ true); - strings.addString("e", /* isIdentifier */ true); - strings.addString("foo", /* isIdentifier */ false); - strings.addString("bar", /* isIdentifier */ true); - - BMG.initializeStringTable( - UniquingStringLiteralAccumulator::toTable(std::move(strings))); - } - - auto BFG = BytecodeFunctionGenerator::create(BMG, FRAME_SIZE); - - // Need to generate the BytecodeModule twice. - // Runtime needs to agree on string table contents with the bytecode, - // which means the runtime must be initialized with a BytecodeModule. - // First module just inserts the proper string indices into the runtime, - // the second module actually generates instructions. - std::vector objKeys{ - Builder.getLiteralString("a"), - Builder.getLiteralString("b"), - Builder.getLiteralString("c"), - Builder.getLiteralString("d")}; - - std::vector objVals{ - Builder.getLiteralBool(false), - Builder.getLiteralBool(true), - Builder.getLiteralNull(), - Builder.getLiteralString("foo")}; - - std::vector innerObjKeys{Builder.getLiteralString("bar")}; - std::vector innerObjVals{Builder.getLiteralNull()}; - - auto objIdxs = BMG.addObjectBuffer( - llvh::ArrayRef{objKeys}, - llvh::ArrayRef{objVals}); - auto innerObjIdxs = BMG.addObjectBuffer( - llvh::ArrayRef{innerObjKeys}, - llvh::ArrayRef{innerObjVals}); - - auto IDa = BMG.getStringID("a"); - auto IDb = BMG.getStringID("b"); - auto IDc = BMG.getStringID("c"); - auto IDd = BMG.getStringID("d"); - auto IDe = BMG.getStringID("e"); - auto IDfoo = BMG.getStringID("foo"); - auto IDbar = BMG.getStringID("bar"); - - BFG->emitLoadConstInt(3, 1); - BFG->emitNewObjectWithBuffer(0, 5, 4, objIdxs.first, objIdxs.second); - BFG->emitNewObjectWithBuffer( - 1, 1, 1, innerObjIdxs.first, innerObjIdxs.second); - BFG->emitPutNewOwnById(0, 1, IDe); - - BFG->emitGetById(1, 0, 0, IDa); - BFG->emitLoadConstFalse(2); - BFG->emitStrictEq(2, 1, 2); - BFG->emitBitAnd(3, 2, 3); - - BFG->emitGetById(1, 0, 0, IDb); - BFG->emitLoadConstTrue(2); - BFG->emitStrictEq(2, 1, 2); - BFG->emitBitAnd(3, 2, 3); - - BFG->emitGetById(1, 0, 0, IDc); - BFG->emitLoadConstNull(2); - BFG->emitStrictEq(2, 1, 2); - BFG->emitBitAnd(3, 2, 3); - - BFG->emitGetById(1, 0, 0, IDd); - BFG->emitLoadConstString(2, IDfoo); - BFG->emitStrictEq(2, 1, 2); - BFG->emitBitAnd(3, 2, 3); - - BFG->emitGetById(1, 0, 0, IDe); - BFG->emitGetById(1, 1, 0, IDbar); - BFG->emitLoadConstNull(2); - BFG->emitStrictEq(2, 1, 2); - BFG->emitBitAnd(3, 2, 3); - - BFG->emitRet(3); - BFG->setHighestReadCacheIndex(255); - BFG->setHighestWriteCacheIndex(255); - BFG->bytecodeGenerationComplete(); - auto F = Builder.createTopLevelFunction( - M.getInitialScope()->createInnerScope(), true); - BMG.addFunction(F); - BMG.setFunctionGenerator(F, std::move(BFG)); - - auto *runtimeModule = RuntimeModule::createUninitialized(runtime, domain); - - runtimeModule->initializeWithoutCJSModulesMayAllocate( - BCProviderFromSrc::createBCProviderFromSrc(BMG.generate())); - - auto codeBlock = runtimeModule->getCodeBlockMayAllocate(0); - CallResult status{ExecutionStatus::EXCEPTION}; - { - ScopedNativeCallFrame frame{ - runtime, - 0, - HermesValue::encodeUndefinedValue(), - HermesValue::encodeUndefinedValue(), - HermesValue::encodeUndefinedValue()}; - status = runtime.interpretFunction(codeBlock); - } - auto frames = runtime.getStackFrames(); - ASSERT_TRUE(frames.begin() == frames.end()); - ASSERT_EQ( - StackFrameLayout::CalleeExtraRegistersAtStart, runtime.getStackLevel()); - ASSERT_EQ(ExecutionStatus::RETURNED, status.getStatus()); - ASSERT_EQ(1, status.getValue().getDouble()); -} - -} // anonymous namespace -#undef DEBUG_TYPE