diff --git a/common/common.h b/common/common.h
index a43a156921..34f32d4f4b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -535,12 +535,22 @@ inline bool remove_prefix(std::string_view &s, std::string_view prefix) {
 template <typename T>
 class ConcurrentMap {
 public:
-  ConcurrentMap() {}
+  ConcurrentMap() = default;
 
   ConcurrentMap(i64 nbuckets) {
     resize(nbuckets);
   }
 
+  ~ConcurrentMap() {
+    if (entries) {
+#ifdef _WIN32
+      _aligned_free(entries);
+#else
+      munmap(entries, sizeof(Entry) * nbuckets);
+#endif
+    }
+  }
+
   // In order to avoid unnecessary cache-line false sharing, we want
   // to make this object to be aligned to a reasonably large
   // power-of-two address.
@@ -551,14 +561,19 @@ class ConcurrentMap {
   };
 
   void resize(i64 nbuckets) {
+    assert(!entries);
     this->nbuckets = std::max(MIN_NBUCKETS, bit_ceil(nbuckets));
+    i64 bufsize = sizeof(Entry) * this->nbuckets;
 
-    // Even though std::aligned_alloc is defined in C++17, MSVC doesn't
-    // seem to provide that function. C11's aligned_alloc may not always be
-    // available. Therefore, we'll align the buffer ourselves.
-    entries_buf.clear();
-    entries_buf.resize(sizeof(Entry) * this->nbuckets + alignof(Entry) - 1);
-    entries = (Entry *)align_to((uintptr_t)&entries_buf[0], alignof(Entry));
+    // Allocate a zero-initialized buffer. We use mmap() if available
+    // because it's faster than malloc() and memset().
+#ifdef _WIN32
+    entries = (Entry *)_aligned_malloc(bufsize, alignof(Entry));
+    memset((void *)entries, 0, bufsize);
+#else
+    entries = (Entry *)mmap(nullptr, bufsize, PROT_READ | PROT_WRITE,
+                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+#endif
   }
 
   std::pair<T *, bool> insert(std::string_view key, u64 hash, const T &val) {
@@ -669,7 +684,6 @@ class ConcurrentMap {
   static constexpr i64 NUM_SHARDS = 16;
   static constexpr i64 MAX_RETRY = 128;
 
-  std::vector<char> entries_buf;
   Entry *entries = nullptr;
   i64 nbuckets = 0;
 
diff --git a/elf/mold.h b/elf/mold.h
index 3a1906abbe..b9e127064a 100644
--- a/elf/mold.h
+++ b/elf/mold.h
@@ -798,14 +798,13 @@ class MergedSection : public Chunk<E> {
   void write_to(Context<E> &ctx, u8 *buf) override;
   void print_stats(Context<E> &ctx);
 
+  ConcurrentMap<SectionFragment<E>> map;
   HyperLogLog estimator;
 
 private:
   MergedSection(std::string_view name, i64 flags, i64 type, i64 entsize);
 
-  ConcurrentMap<SectionFragment<E>> map;
   std::vector<i64> shard_offsets;
-  std::once_flag once_flag;
 };
 
 template <typename E>
diff --git a/elf/output-chunks.cc b/elf/output-chunks.cc
index 921f0c1bd5..d0ed7aa605 100644
--- a/elf/output-chunks.cc
+++ b/elf/output-chunks.cc
@@ -1933,11 +1933,6 @@ template <typename E>
 SectionFragment<E> *
 MergedSection<E>::insert(Context<E> &ctx, std::string_view data, u64 hash,
                          i64 p2align) {
-  std::call_once(once_flag, [&] {
-    // We aim 2/3 occupation ratio
-    map.resize(estimator.get_cardinality() * 3 / 2);
-  });
-
   // Even if GC is enabled, we garbage-collect only memory-mapped strings.
   // Non-memory-allocated strings are typically identifiers used by debug info.
   // To remove such strings, use the `strip` command.
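Note (not part of the patch): the hunks above replace the std::vector-plus-manual-alignment trick with an OS allocation that is already suitably aligned and zero-initialized, and add a matching destructor. Below is a minimal standalone sketch of that allocation pattern, assuming a POSIX system with MAP_ANONYMOUS (or Windows with _aligned_malloc); the helper names alloc_zeroed/free_zeroed are illustrative only and are not mold APIs.

```cpp
// Standalone sketch of the zeroed, aligned allocation pattern used above.
// Error handling (e.g. MAP_FAILED, nullptr) is omitted for brevity.
#include <cstddef>
#include <cstring>

#ifdef _WIN32
# include <malloc.h>
#else
# include <sys/mman.h>
#endif

// Returns a zero-initialized buffer of `size` bytes aligned to `align`.
inline void *alloc_zeroed(size_t size, size_t align) {
#ifdef _WIN32
  void *p = _aligned_malloc(size, align);
  if (p)
    memset(p, 0, size);        // _aligned_malloc memory is not zeroed
  return p;
#else
  (void)align;                 // mmap() memory is page-aligned, which suffices
  return mmap(nullptr, size, PROT_READ | PROT_WRITE,
              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#endif
}

inline void free_zeroed(void *p, size_t size) {
#ifdef _WIN32
  (void)size;
  _aligned_free(p);
#else
  munmap(p, size);
#endif
}
```

Anonymous mmap() pages are zeroed lazily by the kernel on first touch, which is why the POSIX path needs no memset() and why this is cheaper than malloc() followed by memset().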
diff --git a/elf/passes.cc b/elf/passes.cc
index bfbde17dda..dfffbdf209 100644
--- a/elf/passes.cc
+++ b/elf/passes.cc
@@ -412,6 +412,10 @@ template <typename E>
 void resolve_section_pieces(Context<E> &ctx) {
   Timer t(ctx, "resolve_section_pieces");
 
+  // We aim 2/3 occupation ratio
+  for (std::unique_ptr<MergedSection<E>> &sec : ctx.merged_sections)
+    sec->map.resize(sec->estimator.get_cardinality() * 3 / 2);
+
   tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
     file->resolve_section_pieces(ctx);
   });
@@ -441,6 +445,9 @@ void add_comment_string(Context<E> &ctx, std::string str) {
     MergedSection<E>::get_instance(ctx, ".comment", SHT_PROGBITS,
                                    SHF_MERGE | SHF_STRINGS, 1, 1);
 
+  if (sec->map.nbuckets == 0)
+    sec->map.resize(4096);
+
   std::string_view buf = save_string(ctx, str);
   std::string_view data(buf.data(), buf.size() + 1);
   sec->insert(ctx, data, hash_string(data), 0);
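Note (not part of the patch): requesting estimator.get_cardinality() * 3 / 2 buckets up front targets at most roughly 2/3 occupancy, and since ConcurrentMap::resize() rounds the bucket count up to a power of two (see the common.h hunk above), the effective load factor is usually lower. The throwaway sketch below only illustrates that arithmetic; the MIN_NBUCKETS value is an assumption for the example, not taken from the patch.

```cpp
// Illustrative sizing math only; not mold code.
#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int64_t MIN_NBUCKETS = 2048;   // assumed minimum bucket count
  int64_t estimated_cardinality = 100'000; // e.g. a HyperLogLog estimate

  // Ask for 3/2 of the estimate, then round up to a power of two.
  int64_t requested = estimated_cardinality * 3 / 2;
  int64_t nbuckets =
      std::max(MIN_NBUCKETS, (int64_t)std::bit_ceil((uint64_t)requested));

  printf("request %lld buckets -> %lld after rounding (load factor %.2f)\n",
         (long long)requested, (long long)nbuckets,
         (double)estimated_cardinality / (double)nbuckets);
}
```

For example, an estimate of 100,000 distinct pieces requests 150,000 buckets, which rounds up to 262,144, i.e. roughly 38% occupancy.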