Skip to content

Commit

Permalink
[Hermes] Handle unicode characters in 'show source'
Browse files Browse the repository at this point in the history
Summary:
The string table uses an unconventional encoding, so non-ASCII (Unicode)
source must be re-encoded before it is added to the table.

ASCII source can be used unmodified.

Copy `appendUnicodeToStorage` over from the JSLexer to allow
for reencoding the source during HBC generation.

@already-on-github

Test Plan: add a test

Reviewers: tmikov, #hermes-buddies, #hermes

Reviewed By: tmikov

Subscribers: #hermes

Differential Revision: https://phabricator.intern.facebook.com/D46324771

Tasks: T151863959

Tags: hermes
  • Loading branch information
avp authored and Michael Leon committed Jul 14, 2023
1 parent 469fc3b commit 36195b3
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 3 deletions.
1 change: 1 addition & 0 deletions include/hermes/BCGen/HBC/TraverseLiteralStrings.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ void traverseFunctions(
Module *M,
std::function<bool(Function *)> shouldVisitFunction,
std::function<void(llvh::StringRef)> traversal,
std::function<void(llvh::StringRef)> functionSourceTraversal,
bool stripFunctionNames);

/// Calls \p traversal with the name of the CommonJS module of every function
Expand Down
91 changes: 89 additions & 2 deletions lib/BCGen/HBC/HBC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,29 @@ std::unique_ptr<BytecodeModule> hbc::generateBytecodeModule(
std::move(baseBCProvider));
}

/// Append the UTF-8 encoding of the Unicode code point \p cp to \p storage.
/// A code point above 0xFFFF is first split into a UTF-16 surrogate pair, and
/// each half of the pair is then written as its own UTF-8 sequence.
/// \param cp the code point to encode; asserted to be at most
///   UNICODE_MAX_VALUE when it is above 0xFFFF.
/// \param storage the buffer that receives the encoded bytes.
static inline void appendUnicodeToStorage(
    uint32_t cp,
    llvh::SmallVectorImpl<char> &storage) {
  // Scratch space for up to two 16-bit values; each one needs at most three
  // bytes once encoded in UTF-8.
  char buf[8];
  char *end = buf;
  if (!LLVM_LIKELY(cp < 0x10000)) {
    // Code points that need a surrogate pair are normalized into the two
    // surrogates. Note that this produces technically invalid UTF-8.
    assert(cp <= UNICODE_MAX_VALUE && "invalid Unicode value");
    const uint32_t offset = cp - 0x10000;
    hermes::encodeUTF8(end, UTF16_HIGH_SURROGATE + ((offset >> 10) & 0x3FF));
    hermes::encodeUTF8(end, UTF16_LOW_SURROGATE + (offset & 0x3FF));
  } else {
    // Common case: a single UTF-8 sequence is enough.
    hermes::encodeUTF8(end, cp);
  }
  storage.append(buf, end);
}

std::unique_ptr<BytecodeModule> hbc::generateBytecodeModule(
Module *M,
Function *lexicalTopLevel,
Expand Down Expand Up @@ -429,6 +452,18 @@ std::unique_ptr<BytecodeModule> hbc::generateBytecodeModule(
shouldGenerate = [](const Function *) { return true; };
}

/// Maps the original UTF-8 text of each non-ASCII function source to its
/// re-encoded form: the modified, UTF-16-like representation used by string
/// literal encoding (see appendUnicodeToStorage).
/// If a function source isn't in this map, then it's entirely ASCII and can
/// be added to the string table unmodified.
/// The re-encoding is needed because the StringLiteralTable automatically
/// converts real UTF-8 to UTF-16 when it detects it, so the original function
/// source could not be looked up in the table directly.
llvh::DenseMap<llvh::StringRef, llvh::SmallVector<char, 32>>
unicodeFunctionSources{};

{ // Collect all the strings in the bytecode module into a storage.
// If we are in delta optimizing mode, start with the string storage from
// our base bytecode provider.
Expand All @@ -449,7 +484,48 @@ std::unique_ptr<BytecodeModule> hbc::generateBytecodeModule(
if (options.stripFunctionNames) {
addString(kStrippedFunctionName);
}
traverseFunctions(M, shouldGenerate, addString, options.stripFunctionNames);

/// Register the original source text \p str of a function in the \c strings
/// table.
/// Text that is entirely ASCII is added unchanged. Any other text is
/// re-encoded with appendUnicodeToStorage before being added, and the
/// re-encoded bytes are kept in unicodeFunctionSources, keyed by the original
/// text, so they can be reused below.
auto addFunctionSource = [&strings,
                          &unicodeFunctionSources](llvh::StringRef str) {
  if (hermes::isAllASCII(str.begin(), str.end())) {
    // Fast path: ASCII needs no re-encoding.
    strings.addString(str, /* isIdentifier */ false);
    return;
  }

  auto &reencoded = unicodeFunctionSources[str];
  // A non-empty entry means this exact source text was already handled.
  if (!reencoded.empty())
    return;

  const char *pos = str.begin();
  const char *const end = str.end();
  while (pos != end) {
    if (LLVM_UNLIKELY(isUTF8Start(*pos))) {
      // Decode one character and append its re-encoded form. pos is not
      // advanced here; the decoder is expected to move it past the sequence
      // it consumed. Decoding errors are ignored by the no-op callback.
      appendUnicodeToStorage(
          hermes::_decodeUTF8SlowPath<false>(
              pos, [](const llvh::Twine &) {}),
          reencoded);
    } else {
      // Any other byte is copied through unchanged.
      reencoded.push_back(*pos);
      ++pos;
    }
  }

  strings.addString(
      llvh::StringRef{reencoded.begin(), reencoded.size()},
      /* isIdentifier */ false);
};

// Populate strings table and if the source of a function contains unicode,
// add an entry to the unicodeFunctionSources.
traverseFunctions(
M,
shouldGenerate,
addString,
addFunctionSource,
options.stripFunctionNames);

if (!M->getCJSModulesResolved()) {
traverseCJSModuleNames(M, shouldGenerate, addString);
Expand Down Expand Up @@ -489,7 +565,18 @@ std::unique_ptr<BytecodeModule> hbc::generateBytecodeModule(
// Add entries to function source table for non-default source.
if (!F.isGlobalScope()) {
if (auto source = F.getSourceRepresentationStr()) {
BMGen.addFunctionSource(index, BMGen.getStringID(*source));
auto it = unicodeFunctionSources.find(*source);
// If the original source was mapped to a re-encoded one in
// unicodeFunctionSources, then use the re-encoded source to lookup the
// string ID. Otherwise it's ASCII and can be used directly.
if (it != unicodeFunctionSources.end()) {
BMGen.addFunctionSource(
index,
BMGen.getStringID(
llvh::StringRef{it->second.begin(), it->second.size()}));
} else {
BMGen.addFunctionSource(index, BMGen.getStringID(*source));
}
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion lib/BCGen/HBC/TraverseLiteralStrings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ void traverseFunctions(
Module *M,
std::function<bool(Function *)> shouldVisitFunction,
std::function<void(llvh::StringRef)> traversal,
std::function<void(llvh::StringRef)> functionSourceTraversal,
bool stripFunctionNames) {
for (auto &F : *M) {
if (!shouldVisitFunction(&F)) {
Expand All @@ -65,7 +66,7 @@ void traverseFunctions(
if (!F.isGlobalScope()) {
// Only add non-default source representation to the string table.
if (auto source = F.getSourceRepresentationStr()) {
traversal(*source);
functionSourceTraversal(*source);
}
}
}
Expand Down
14 changes: 14 additions & 0 deletions test/hermes/source-visibility/unicode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/**
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

// RUN: %hermes -O %s | %FileCheck --match-full-lines %s
// RUN: %hermes -lazy %s | %FileCheck --match-full-lines %s
// UNSUPPORTED: serializer

// The function name is a non-ASCII identifier. With the 'show source'
// directive, toString() must print the original source text unchanged.
var x = function 𞸆() { 'show source'; }
print(x.toString());
// CHECK: function 𞸆() { 'show source'; }

0 comments on commit 36195b3

Please sign in to comment.