From e3b67160d119967522d1be9fa0680e973cdb6275 Mon Sep 17 00:00:00 2001 From: Joyee Cheung Date: Thu, 13 Jun 2024 17:37:11 +0200 Subject: [PATCH 1/3] src: return non-empty data in context data serializer For pointer values in the context data, we need to return non-empty data in the serializer so that V8 does not serialize them verbatim, making the snapshot unreproducible. --- src/node_snapshotable.cc | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/node_snapshotable.cc b/src/node_snapshotable.cc index ec01c2d0d5ac18..aefd479b83cd05 100644 --- a/src/node_snapshotable.cc +++ b/src/node_snapshotable.cc @@ -1262,18 +1262,30 @@ void DeserializeNodeContextData(Local holder, int index, StartupData payload, void* callback_data) { - // This is unreachable for now. We will reset all the pointers in - // Environment::AssignToContext() via the realm constructor. - UNREACHABLE(); + // We will reset all the pointers in Environment::AssignToContext() + // via the realm constructor. + switch (index) { + case ContextEmbedderIndex::kEnvironment: + case ContextEmbedderIndex::kContextifyContext: + case ContextEmbedderIndex::kRealm: + case ContextEmbedderIndex::kContextTag: { + uint64_t index_64; + int size = sizeof(index_64); + CHECK_EQ(payload.raw_size, size); + memcpy(&index_64, payload.data, payload.raw_size); + CHECK_EQ(index_64, static_cast(index)); + break; + } + default: + UNREACHABLE(); + } } StartupData SerializeNodeContextData(Local holder, int index, void* callback_data) { - // For now we just reset all of them in Environment::AssignToContext(). - // We return empty data here to make sure that the embedder data serialized - // into the snapshot is reproducible and V8 doesn't have to try to serialize - // the pointer values that won't be useful during deserialization. + // For pointer values, we need to return some non-empty data so that V8 + // does not serialize them verbatim, making the snapshot unreproducible. switch (index) { case ContextEmbedderIndex::kEnvironment: case ContextEmbedderIndex::kContextifyContext: @@ -1286,7 +1298,13 @@ StartupData SerializeNodeContextData(Local holder, static_cast(index), *holder, data); - return {nullptr, 0}; + // We use uint64_t to avoid padding. + uint64_t index_64 = static_cast(index); + // It must be allocated with new[] because V8 will call delete[] on it. + size_t size = sizeof(index_64); + char* startup_data = new char[size]; + memcpy(startup_data, &index_64, size); + return {startup_data, static_cast(size)}; } default: UNREACHABLE(); From 31b78b12c30de5581108110aa3cba46cd726d8c6 Mon Sep 17 00:00:00 2001 From: Joyee Cheung Date: Thu, 13 Jun 2024 17:47:59 +0200 Subject: [PATCH 2/3] src: make sure that memcpy-ed structs in snapshot have no padding To make the snapshots reproducible, this patch updates the size of a few types and adds some static assertions to ensure that there are no padding in the memcpy-ed structs. --- src/aliased_buffer-inl.h | 2 +- src/aliased_buffer.h | 2 +- src/base_object_types.h | 7 +++++-- src/encoding_binding.h | 6 ++++++ src/node_file.h | 6 ++++++ src/node_process.h | 6 ++++++ src/node_snapshotable.cc | 12 ++++++------ src/node_snapshotable.h | 37 +++++++++++++++++++++++++++++++++---- src/node_v8.h | 7 +++++++ 9 files changed, 71 insertions(+), 14 deletions(-) diff --git a/src/aliased_buffer-inl.h b/src/aliased_buffer-inl.h index f04fb7c2667016..40e08dfe149667 100644 --- a/src/aliased_buffer-inl.h +++ b/src/aliased_buffer-inl.h @@ -8,7 +8,7 @@ namespace node { -typedef size_t AliasedBufferIndex; +typedef uint64_t AliasedBufferIndex; template AliasedBufferBase::AliasedBufferBase( diff --git a/src/aliased_buffer.h b/src/aliased_buffer.h index b847641f8faa15..c2ddd269fd179c 100644 --- a/src/aliased_buffer.h +++ b/src/aliased_buffer.h @@ -9,7 +9,7 @@ namespace node { -typedef size_t AliasedBufferIndex; +typedef uint64_t AliasedBufferIndex; /** * Do not use this class directly when creating instances of it - use the diff --git a/src/base_object_types.h b/src/base_object_types.h index 9cfe6a77f71708..391a88973171ab 100644 --- a/src/base_object_types.h +++ b/src/base_object_types.h @@ -42,10 +42,13 @@ namespace node { SERIALIZABLE_NON_BINDING_TYPES(V) #define V(TypeId, NativeType) k_##TypeId, -enum class BindingDataType : uint8_t { BINDING_TYPES(V) kBindingDataTypeCount }; +// To avoid padding, the enums are uint64_t. +enum class BindingDataType : uint64_t { + BINDING_TYPES(V) kBindingDataTypeCount +}; // Make sure that we put the bindings first so that we can also use the enums // for the bindings as index to the binding data store. -enum class EmbedderObjectType : uint8_t { +enum class EmbedderObjectType : uint64_t { BINDING_TYPES(V) SERIALIZABLE_NON_BINDING_TYPES(V) // We do not need to know about all the unserializable non-binding types for // now so we do not list them. diff --git a/src/encoding_binding.h b/src/encoding_binding.h index 2690cb74f8a05b..7b18db9c6a1108 100644 --- a/src/encoding_binding.h +++ b/src/encoding_binding.h @@ -18,6 +18,12 @@ class BindingData : public SnapshotableObject { AliasedBufferIndex encode_into_results_buffer; }; + // Make sure that there's no padding in the struct since we will memcpy + // them into the snapshot blob and they need to be reproducible. + static_assert(sizeof(InternalFieldInfo) == + sizeof(InternalFieldInfoBase) + sizeof(AliasedBufferIndex), + "InternalFieldInfo should have no padding"); + BindingData(Realm* realm, v8::Local obj, InternalFieldInfo* info = nullptr); diff --git a/src/node_file.h b/src/node_file.h index bdad1ae25f4892..a0144314b280c1 100644 --- a/src/node_file.h +++ b/src/node_file.h @@ -64,6 +64,12 @@ class BindingData : public SnapshotableObject { AliasedBufferIndex statfs_field_bigint_array; }; + // Make sure that there's no padding in the struct since we will memcpy + // them into the snapshot blob and they need to be reproducible. + static_assert(sizeof(InternalFieldInfo) == sizeof(InternalFieldInfoBase) + + sizeof(AliasedBufferIndex) * 4, + "InternalFieldInfo should have no padding"); + enum class FilePathIsFileReturnType { kIsFile = 0, kIsNotFile, diff --git a/src/node_process.h b/src/node_process.h index 142d0e63e18c46..607826398beeae 100644 --- a/src/node_process.h +++ b/src/node_process.h @@ -54,6 +54,12 @@ class BindingData : public SnapshotableObject { AliasedBufferIndex hrtime_buffer; }; + // Make sure that there's no padding in the struct since we will memcpy + // them into the snapshot blob and they need to be reproducible. + static_assert(sizeof(InternalFieldInfo) == + sizeof(InternalFieldInfoBase) + sizeof(AliasedBufferIndex), + "InternalFieldInfo should have no padding"); + static void AddMethods(v8::Isolate* isolate, v8::Local target); static void RegisterExternalReferences(ExternalReferenceRegistry* registry); diff --git a/src/node_snapshotable.cc b/src/node_snapshotable.cc index aefd479b83cd05..f665d9dad01feb 100644 --- a/src/node_snapshotable.cc +++ b/src/node_snapshotable.cc @@ -282,9 +282,9 @@ size_t SnapshotSerializer::Write(const PropInfo& data) { } // Layout of AsyncHooks::SerializeInfo -// [ 4/8 bytes ] snapshot index of async_ids_stack -// [ 4/8 bytes ] snapshot index of fields -// [ 4/8 bytes ] snapshot index of async_id_fields +// [ 8 bytes ] snapshot index of async_ids_stack +// [ 8 bytes ] snapshot index of fields +// [ 8 bytes ] snapshot index of async_id_fields // [ 4/8 bytes ] snapshot index of js_execution_async_resources // [ 4/8 bytes ] length of native_execution_async_resources // [ ... ] snapshot indices of each element in @@ -387,9 +387,9 @@ size_t SnapshotSerializer::Write(const ImmediateInfo::SerializeInfo& data) { } // Layout of PerformanceState::SerializeInfo -// [ 4/8 bytes ] snapshot index of root -// [ 4/8 bytes ] snapshot index of milestones -// [ 4/8 bytes ] snapshot index of observers +// [ 8 bytes ] snapshot index of root +// [ 8 bytes ] snapshot index of milestones +// [ 8 bytes ] snapshot index of observers template <> performance::PerformanceState::SerializeInfo SnapshotDeserializer::Read() { Debug("Read()\n"); diff --git a/src/node_snapshotable.h b/src/node_snapshotable.h index 600e56c481be90..51a9499da5f2d7 100644 --- a/src/node_snapshotable.h +++ b/src/node_snapshotable.h @@ -4,6 +4,8 @@ #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS +#include // For static_assert +#include // For offsetof #include "aliased_buffer.h" #include "base_object.h" #include "util.h" @@ -33,13 +35,13 @@ bool WithoutCodeCache(const SnapshotConfig& config); // and pass it into the V8 callback as the payload of StartupData. // The memory chunk looks like this: // -// [ type ] - EmbedderObjectType (a uint8_t) -// [ length ] - a size_t +// [ type ] - EmbedderObjectType (a uint64_t) +// [ length ] - a uint64_t // [ ... ] - custom bytes of size |length - header size| struct InternalFieldInfoBase { public: EmbedderObjectType type; - size_t length; + uint64_t length; template static T* New(EmbedderObjectType type) { @@ -71,14 +73,35 @@ struct InternalFieldInfoBase { InternalFieldInfoBase() = default; }; +// Make sure that there's no padding in the struct since we will memcpy +// them into the snapshot blob and they need to be reproducible. +static_assert(offsetof(InternalFieldInfoBase, type) == 0, + "InternalFieldInfoBase::type should start from offset 0"); +static_assert(offsetof(InternalFieldInfoBase, length) == + sizeof(EmbedderObjectType), + "InternalFieldInfoBase::type should have no padding"); + struct EmbedderTypeInfo { - enum class MemoryMode : uint8_t { kBaseObject, kCppGC }; + // To avoid padding, the enum is uint64_t. + enum class MemoryMode : uint64_t { kBaseObject = 0, kCppGC }; EmbedderTypeInfo(EmbedderObjectType t, MemoryMode m) : type(t), mode(m) {} EmbedderTypeInfo() = default; + EmbedderObjectType type; MemoryMode mode; }; +// Make sure that there's no padding in the struct since we will memcpy +// them into the snapshot blob and they need to be reproducible. +static_assert(offsetof(EmbedderTypeInfo, type) == 0, + "EmbedderTypeInfo::type should start from offset 0"); +static_assert(offsetof(EmbedderTypeInfo, mode) == sizeof(EmbedderObjectType), + "EmbedderTypeInfo::type should have no padding"); +static_assert(sizeof(EmbedderTypeInfo) == + sizeof(EmbedderObjectType) + + sizeof(EmbedderTypeInfo::MemoryMode), + "EmbedderTypeInfo::mode should have no padding"); + // An interface for snapshotable native objects to inherit from. // Use the SERIALIZABLE_OBJECT_METHODS() macro in the class to define // the following methods to implement: @@ -150,6 +173,12 @@ class BindingData : public SnapshotableObject { AliasedBufferIndex is_building_snapshot_buffer; }; + // Make sure that there's no padding in the struct since we will memcpy + // them into the snapshot blob and they need to be reproducible. + static_assert(sizeof(InternalFieldInfo) == + sizeof(InternalFieldInfoBase) + sizeof(AliasedBufferIndex), + "InternalFieldInfo should have no padding"); + BindingData(Realm* realm, v8::Local obj, InternalFieldInfo* info = nullptr); diff --git a/src/node_v8.h b/src/node_v8.h index d3797432a24db7..844addd6a93b1b 100644 --- a/src/node_v8.h +++ b/src/node_v8.h @@ -23,6 +23,13 @@ class BindingData : public SnapshotableObject { AliasedBufferIndex heap_space_statistics_buffer; AliasedBufferIndex heap_code_statistics_buffer; }; + + // Make sure that there's no padding in the struct since we will memcpy + // them into the snapshot blob and they need to be reproducible. + static_assert(sizeof(InternalFieldInfo) == sizeof(InternalFieldInfoBase) + + sizeof(AliasedBufferIndex) * 3, + "InternalFieldInfo should have no padding"); + BindingData(Realm* realm, v8::Local obj, InternalFieldInfo* info = nullptr); From b71c4f8bc06f5af00b174894b4c54549c447ee33 Mon Sep 17 00:00:00 2001 From: Joyee Cheung Date: Fri, 22 Mar 2024 19:51:49 +0100 Subject: [PATCH 3/3] src: add utilities to help debugging reproducibility of snapshots - Print offsets in blob serializer - Add a special node:generate_default_snapshot ID to generate the built-in snapshot. - Improve logging - Add a test to check the reproducibilty of the snapshot --- src/blob_serializer_deserializer-inl.h | 24 +++++-- src/node.cc | 24 ++++--- src/node_snapshotable.cc | 11 ++-- test/parallel/test-snapshot-reproducible.js | 70 +++++++++++++++++++++ 4 files changed, 110 insertions(+), 19 deletions(-) create mode 100644 test/parallel/test-snapshot-reproducible.js diff --git a/src/blob_serializer_deserializer-inl.h b/src/blob_serializer_deserializer-inl.h index f47a1e0cdf8a44..ecf664fc807b68 100644 --- a/src/blob_serializer_deserializer-inl.h +++ b/src/blob_serializer_deserializer-inl.h @@ -238,7 +238,8 @@ size_t BlobSerializer::WriteVector(const std::vector& data) { if (is_debug) { std::string str = std::is_arithmetic_v ? "" : ToStr(data); std::string name = GetName(); - Debug("\nWriteVector<%s>() (%d-byte), count=%d: %s\n", + Debug("\nAt 0x%x: WriteVector<%s>() (%d-byte), count=%d: %s\n", + sink.size(), name.c_str(), sizeof(T), data.size(), @@ -270,7 +271,10 @@ size_t BlobSerializer::WriteVector(const std::vector& data) { template size_t BlobSerializer::WriteStringView(std::string_view data, StringLogMode mode) { - Debug("WriteStringView(), length=%zu: %p\n", data.size(), data.data()); + Debug("At 0x%x: WriteStringView(), length=%zu: %p\n", + sink.size(), + data.size(), + data.data()); size_t written_total = WriteArithmetic(data.size()); size_t length = data.size(); @@ -294,6 +298,8 @@ size_t BlobSerializer::WriteString(const std::string& data) { return WriteStringView(data, StringLogMode::kAddressAndContent); } +static size_t kPreviewCount = 16; + // Helper for writing an array of numeric types. template template @@ -301,10 +307,18 @@ size_t BlobSerializer::WriteArithmetic(const T* data, size_t count) { static_assert(std::is_arithmetic_v, "Arithmetic type"); DCHECK_GT(count, 0); // Should not write contents for vectors of size 0. if (is_debug) { - std::string str = - "{ " + std::to_string(data[0]) + (count > 1 ? ", ... }" : " }"); + size_t preview_count = count < kPreviewCount ? count : kPreviewCount; + std::string str = "{ "; + for (size_t i = 0; i < preview_count; ++i) { + str += (std::to_string(data[i]) + ","); + } + if (count > preview_count) { + str += "..."; + } + str += "}"; std::string name = GetName(); - Debug("Write<%s>() (%zu-byte), count=%zu: %s", + Debug("At 0x%x: Write<%s>() (%zu-byte), count=%zu: %s", + sink.size(), name.c_str(), sizeof(T), count, diff --git a/src/node.cc b/src/node.cc index 87488cf0e015fd..a13c5a45496fd0 100644 --- a/src/node.cc +++ b/src/node.cc @@ -1288,18 +1288,24 @@ ExitCode GenerateAndWriteSnapshotData(const SnapshotData** snapshot_data_ptr, return exit_code; } } else { + std::optional builder_script_content; // Otherwise, load and run the specified builder script. std::unique_ptr generated_data = std::make_unique(); - std::string builder_script_content; - int r = ReadFileSync(&builder_script_content, builder_script.c_str()); - if (r != 0) { - FPrintF(stderr, - "Cannot read builder script %s for building snapshot. %s: %s", - builder_script, - uv_err_name(r), - uv_strerror(r)); - return ExitCode::kGenericUserError; + if (builder_script != "node:generate_default_snapshot") { + builder_script_content = std::string(); + int r = ReadFileSync(&(builder_script_content.value()), + builder_script.c_str()); + if (r != 0) { + FPrintF(stderr, + "Cannot read builder script %s for building snapshot. %s: %s\n", + builder_script, + uv_err_name(r), + uv_strerror(r)); + return ExitCode::kGenericUserError; + } + } else { + snapshot_config.builder_script_path = std::nullopt; } exit_code = node::SnapshotBuilder::Generate(generated_data.get(), diff --git a/src/node_snapshotable.cc b/src/node_snapshotable.cc index f665d9dad01feb..eaeafd8e16a91c 100644 --- a/src/node_snapshotable.cc +++ b/src/node_snapshotable.cc @@ -599,16 +599,17 @@ std::vector SnapshotData::ToBlob() const { size_t written_total = 0; // Metadata - w.Debug("Write magic %" PRIx32 "\n", kMagic); + w.Debug("0x%x: Write magic %" PRIx32 "\n", w.sink.size(), kMagic); written_total += w.WriteArithmetic(kMagic); - w.Debug("Write metadata\n"); + w.Debug("0x%x: Write metadata\n", w.sink.size()); written_total += w.Write(metadata); - + w.Debug("0x%x: Write snapshot blob\n", w.sink.size()); written_total += w.Write(v8_snapshot_blob_data); - w.Debug("Write isolate_data_indices\n"); + w.Debug("0x%x: Write IsolateDataSerializeInfo\n", w.sink.size()); written_total += w.Write(isolate_data_info); + w.Debug("0x%x: Write EnvSerializeInfo\n", w.sink.size()); written_total += w.Write(env_info); - w.Debug("Write code_cache\n"); + w.Debug("0x%x: Write CodeCacheInfo\n", w.sink.size()); written_total += w.WriteVector(code_cache); w.Debug("SnapshotData::ToBlob() Wrote %d bytes\n", written_total); diff --git a/test/parallel/test-snapshot-reproducible.js b/test/parallel/test-snapshot-reproducible.js new file mode 100644 index 00000000000000..f9392a7fb4adfc --- /dev/null +++ b/test/parallel/test-snapshot-reproducible.js @@ -0,0 +1,70 @@ +'use strict'; + +require('../common'); +const { spawnSyncAndAssert } = require('../common/child_process'); +const tmpdir = require('../common/tmpdir'); +const fs = require('fs'); +const assert = require('assert'); + +// When the test fails this helper can be modified to write outputs +// differently and aid debugging. +function log(line) { + console.log(line); +} + +function generateSnapshot() { + tmpdir.refresh(); + + spawnSyncAndAssert( + process.execPath, + [ + '--random_seed=42', + '--predictable', + '--build-snapshot', + 'node:generate_default_snapshot', + ], + { + env: { ...process.env, NODE_DEBUG_NATIVE: 'SNAPSHOT_SERDES' }, + cwd: tmpdir.path + }, + { + stderr(output) { + const lines = output.split('\n'); + for (const line of lines) { + if (line.startsWith('0x')) { + log(line); + } + } + }, + } + ); + const blobPath = tmpdir.resolve('snapshot.blob'); + return fs.readFileSync(blobPath); +} + +const buf1 = generateSnapshot(); +const buf2 = generateSnapshot(); + +const diff = []; +let offset = 0; +const step = 16; +do { + const length = Math.min(buf1.length - offset, step); + const slice1 = buf1.slice(offset, offset + length).toString('hex'); + const slice2 = buf2.slice(offset, offset + length).toString('hex'); + if (slice1 !== slice2) { + diff.push({ offset: '0x' + (offset).toString(16), slice1, slice2 }); + } + offset += length; +} while (offset < buf1.length); + +assert.strictEqual(offset, buf1.length); +if (offset < buf2.length) { + const length = Math.min(buf2.length - offset, step); + const slice2 = buf2.slice(offset, offset + length).toString('hex'); + diff.push({ offset, slice1: '', slice2 }); + offset += length; +} while (offset < buf2.length); + +assert.deepStrictEqual(diff, []); +assert.strictEqual(buf1.length, buf2.length);