Skip to content

Commit

Permalink
[clang][AST] Support AST files larger than 512M
Browse files Browse the repository at this point in the history
Summary:
Clang uses 32-bit integers to store bit offsets from the beginning of
the file, which results in a 512M limit on the AST file size (2^32 bits).
This diff replaces absolute offsets with offsets relative to the beginning
of the corresponding data structure where possible, and uses 64-bit
offsets for DeclOffsets and TypeOffsets because these AST sections may
easily exceed 512M on their own.

This diff breaks AST file format compatibility, so VERSION_MAJOR is bumped.

Test Plan:
Existing clang AST serialization tests
Tested on clangd with ~700M and ~900M preamble files

Reviewers: rsmith, dexonsmith

Subscribers: ilya-biryukov, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D76594
  • Loading branch information
dmpolukhin committed Apr 16, 2020
1 parent f701d8f commit 30d5946
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 40 deletions.
27 changes: 21 additions & 6 deletions clang/include/clang/Serialization/ASTBitCodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace serialization {
/// Version 4 of AST files also requires that the version control branch and
/// revision match exactly, since there is no backward compatibility of
/// AST files at this time.
const unsigned VERSION_MAJOR = 9;
const unsigned VERSION_MAJOR = 10;

/// AST file minor version number supported by this version of
/// Clang.
Expand Down Expand Up @@ -181,7 +181,7 @@ namespace serialization {
/// Raw source location of end of range.
unsigned End;

/// Offset in the AST file.
/// Offset in the AST file relative to ModuleFile::MacroOffsetsBase.
uint32_t BitOffset;

PPEntityOffset(SourceRange R, uint32_t BitOffset)
Expand Down Expand Up @@ -221,12 +221,18 @@ namespace serialization {
/// Raw source location.
unsigned Loc = 0;

/// Offset in the AST file.
uint32_t BitOffset = 0;
/// Offset in the AST file. The 64-bit value is split into low/high 32-bit
/// parts to keep the structure 32-bit aligned and to avoid a padding gap.
/// This structure is serialized "as is" to the AST file, and an undefined
/// value in the padding would affect the AST hash.
uint32_t BitOffsetLow = 0;
uint32_t BitOffsetHigh = 0;

DeclOffset() = default;
DeclOffset(SourceLocation Loc, uint32_t BitOffset)
: Loc(Loc.getRawEncoding()), BitOffset(BitOffset) {}
/// Build an entry from a declaration's source location and its 64-bit
/// bit offset; the offset is stored split across the two 32-bit fields.
DeclOffset(SourceLocation Loc, uint64_t BitOffset)
    : Loc(Loc.getRawEncoding()) {
  setBitOffset(BitOffset);
}

void setLocation(SourceLocation L) {
Loc = L.getRawEncoding();
Expand All @@ -235,6 +241,15 @@ namespace serialization {
SourceLocation getLocation() const {
return SourceLocation::getFromRawEncoding(Loc);
}

/// Store a 64-bit bit offset as two 32-bit halves.
///
/// \param Offset the offset to record. Bits 0-31 are written to
/// BitOffsetLow and bits 32-63 to BitOffsetHigh.
void setBitOffset(uint64_t Offset) {
  // The truncation to the low word is intentional; spell the narrowing out
  // so it is not mistaken for an accidental 64-to-32-bit conversion.
  BitOffsetLow = static_cast<uint32_t>(Offset);
  BitOffsetHigh = static_cast<uint32_t>(Offset >> 32);
}

/// Reassemble the 64-bit bit offset from its two stored 32-bit halves.
///
/// \returns BitOffsetHigh placed in bits 32-63 combined with BitOffsetLow
/// in bits 0-31.
uint64_t getBitOffset() const {
  // Widen before shifting so the high word is not shifted out of a
  // 32-bit value.
  const uint64_t HighPart = uint64_t(BitOffsetHigh) << 32;
  return HighPart | BitOffsetLow;
}
};

/// The number of predefined preprocessed entity IDs.
Expand Down
7 changes: 4 additions & 3 deletions clang/include/clang/Serialization/ASTReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -723,9 +723,10 @@ class ASTReader

struct PendingMacroInfo {
ModuleFile *M;
uint64_t MacroDirectivesOffset;
/// Offset relative to ModuleFile::MacroOffsetsBase.
uint32_t MacroDirectivesOffset;

PendingMacroInfo(ModuleFile *M, uint64_t MacroDirectivesOffset)
PendingMacroInfo(ModuleFile *M, uint32_t MacroDirectivesOffset)
: M(M), MacroDirectivesOffset(MacroDirectivesOffset) {}
};

Expand Down Expand Up @@ -2205,7 +2206,7 @@ class ASTReader
/// \param MacroDirectivesOffset Offset of the serialized macro directive
/// history.
void addPendingMacro(IdentifierInfo *II, ModuleFile *M,
uint64_t MacroDirectivesOffset);
uint32_t MacroDirectivesOffset);

/// Read the set of macros defined by this external macro source.
void ReadDefinedMacros() override;
Expand Down
10 changes: 6 additions & 4 deletions clang/include/clang/Serialization/ASTWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ class ASTWriter : public ASTDeserializationListener,

/// Offset of each type in the bitstream, indexed by
/// the type's ID.
std::vector<uint32_t> TypeOffsets;
std::vector<uint64_t> TypeOffsets;

/// The first ID number we can use for our own identifiers.
serialization::IdentID FirstIdentID = serialization::NUM_PREDEF_IDENT_IDS;
Expand Down Expand Up @@ -277,7 +277,8 @@ class ASTWriter : public ASTDeserializationListener,
/// The macro infos to emit.
std::vector<MacroInfoToEmitData> MacroInfosToEmit;

llvm::DenseMap<const IdentifierInfo *, uint64_t> IdentMacroDirectivesOffsetMap;
llvm::DenseMap<const IdentifierInfo *, uint32_t>
IdentMacroDirectivesOffsetMap;

/// @name FlushStmt Caches
/// @{
Expand Down Expand Up @@ -464,7 +465,8 @@ class ASTWriter : public ASTDeserializationListener,
const Preprocessor &PP);
void WritePreprocessor(const Preprocessor &PP, bool IsModule);
void WriteHeaderSearch(const HeaderSearch &HS);
void WritePreprocessorDetail(PreprocessingRecord &PPRec);
void WritePreprocessorDetail(PreprocessingRecord &PPRec,
uint64_t MacroOffsetsBase);
void WriteSubmodules(Module *WritingModule);

void WritePragmaDiagnosticMappings(const DiagnosticsEngine &Diag,
Expand Down Expand Up @@ -588,7 +590,7 @@ class ASTWriter : public ASTDeserializationListener,
/// Determine the ID of an already-emitted macro.
serialization::MacroID getMacroID(MacroInfo *MI);

uint64_t getMacroDirectivesOffset(const IdentifierInfo *Name);
uint32_t getMacroDirectivesOffset(const IdentifierInfo *Name);

/// Emit a reference to a type.
void AddTypeRef(QualType T, RecordDataImpl &Record);
Expand Down
10 changes: 9 additions & 1 deletion clang/include/clang/Serialization/ModuleFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,10 @@ class ModuleFile {
/// The base offset in the source manager's view of this module.
unsigned SLocEntryBaseOffset = 0;

/// Base file offset for the offsets in SLocEntryOffsets. Real file offset
/// for the entry is SLocEntryOffsetsBase + SLocEntryOffsets[i].
uint64_t SLocEntryOffsetsBase = 0;

/// Offsets for all of the source location entries in the
/// AST file.
const uint32_t *SLocEntryOffsets = nullptr;
Expand Down Expand Up @@ -302,6 +306,10 @@ class ModuleFile {
/// The number of macros in this AST file.
unsigned LocalNumMacros = 0;

/// Base file offset for the offsets in MacroOffsets. Real file offset for
/// the entry is MacroOffsetsBase + MacroOffsets[i].
uint64_t MacroOffsetsBase = 0;

/// Offsets of macros in the preprocessor block.
///
/// This array is indexed by the macro ID (-1), and provides
Expand Down Expand Up @@ -450,7 +458,7 @@ class ModuleFile {

/// Offset of each type within the bitstream, indexed by the
/// type ID, or the representation of a Type*.
const uint32_t *TypeOffsets = nullptr;
const uint64_t *TypeOffsets = nullptr;

/// Base type ID for types local to this module as represented in
/// the global type ID space.
Expand Down
20 changes: 12 additions & 8 deletions clang/lib/Serialization/ASTReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1470,6 +1470,7 @@ bool ASTReader::ReadSLocEntry(int ID) {

ModuleFile *F = GlobalSLocEntryMap.find(-ID)->second;
if (llvm::Error Err = F->SLocEntryCursor.JumpToBit(
F->SLocEntryOffsetsBase +
F->SLocEntryOffsets[ID - F->SLocEntryBaseID])) {
Error(std::move(Err));
return true;
Expand Down Expand Up @@ -1932,9 +1933,8 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d,
return HFI;
}

void ASTReader::addPendingMacro(IdentifierInfo *II,
ModuleFile *M,
uint64_t MacroDirectivesOffset) {
void ASTReader::addPendingMacro(IdentifierInfo *II, ModuleFile *M,
uint32_t MacroDirectivesOffset) {
assert(NumCurrentElementsDeserializing > 0 &&"Missing deserialization guard");
PendingMacroIDs[II].push_back(PendingMacroInfo(M, MacroDirectivesOffset));
}
Expand Down Expand Up @@ -2099,7 +2099,8 @@ void ASTReader::resolvePendingMacro(IdentifierInfo *II,

BitstreamCursor &Cursor = M.MacroCursor;
SavedStreamPosition SavedPosition(Cursor);
if (llvm::Error Err = Cursor.JumpToBit(PMInfo.MacroDirectivesOffset)) {
if (llvm::Error Err =
Cursor.JumpToBit(M.MacroOffsetsBase + PMInfo.MacroDirectivesOffset)) {
Error(std::move(Err));
return;
}
Expand Down Expand Up @@ -3098,7 +3099,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
Error("duplicate TYPE_OFFSET record in AST file");
return Failure;
}
F.TypeOffsets = (const uint32_t *)Blob.data();
F.TypeOffsets = reinterpret_cast<const uint64_t *>(Blob.data());
F.LocalNumTypes = Record[0];
unsigned LocalBaseTypeIndex = Record[1];
F.BaseTypeIndex = getTotalNumTypes();
Expand Down Expand Up @@ -3376,6 +3377,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
F.SLocEntryOffsets = (const uint32_t *)Blob.data();
F.LocalNumSLocEntries = Record[0];
unsigned SLocSpaceSize = Record[1];
F.SLocEntryOffsetsBase = Record[2];
std::tie(F.SLocEntryBaseID, F.SLocEntryBaseOffset) =
SourceMgr.AllocateLoadedSLocEntries(F.LocalNumSLocEntries,
SLocSpaceSize);
Expand Down Expand Up @@ -3694,6 +3696,7 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
F.MacroOffsets = (const uint32_t *)Blob.data();
F.LocalNumMacros = Record[0];
unsigned LocalBaseMacroID = Record[1];
F.MacroOffsetsBase = Record[2];
F.BaseMacroID = getTotalNumMacros();

if (F.LocalNumMacros > 0) {
Expand Down Expand Up @@ -5907,8 +5910,8 @@ PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) {
}

SavedStreamPosition SavedPosition(M.PreprocessorDetailCursor);
if (llvm::Error Err =
M.PreprocessorDetailCursor.JumpToBit(PPOffs.BitOffset)) {
if (llvm::Error Err = M.PreprocessorDetailCursor.JumpToBit(
M.MacroOffsetsBase + PPOffs.BitOffset)) {
Error(std::move(Err));
return nullptr;
}
Expand Down Expand Up @@ -8427,7 +8430,8 @@ MacroInfo *ASTReader::getMacro(MacroID ID) {
assert(I != GlobalMacroMap.end() && "Corrupted global macro map");
ModuleFile *M = I->second;
unsigned Index = ID - M->BaseMacroID;
MacrosLoaded[ID] = ReadMacroRecord(*M, M->MacroOffsets[Index]);
MacrosLoaded[ID] =
ReadMacroRecord(*M, M->MacroOffsetsBase + M->MacroOffsets[Index]);

if (DeserializationListener)
DeserializationListener->MacroRead(ID + NUM_PREDEF_MACRO_IDS,
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/Serialization/ASTReaderDecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2870,7 +2870,7 @@ ASTReader::DeclCursorForID(DeclID ID, SourceLocation &Loc) {
const DeclOffset &DOffs =
M->DeclOffsets[ID - M->BaseDeclID - NUM_PREDEF_DECL_IDS];
Loc = TranslateSourceLocation(*M, DOffs.getLocation());
return RecordLocation(M, DOffs.BitOffset);
return RecordLocation(M, DOffs.getBitOffset());
}

ASTReader::RecordLocation ASTReader::getLocalBitOffset(uint64_t GlobalOffset) {
Expand Down
41 changes: 26 additions & 15 deletions clang/lib/Serialization/ASTWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1893,6 +1893,7 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr,
// Write out the source location entry table. We skip the first
// entry, which is always the same dummy entry.
std::vector<uint32_t> SLocEntryOffsets;
uint64_t SLocEntryOffsetsBase = Stream.GetCurrentBitNo();
RecordData PreloadSLocs;
SLocEntryOffsets.reserve(SourceMgr.local_sloc_entry_size() - 1);
for (unsigned I = 1, N = SourceMgr.local_sloc_entry_size();
Expand All @@ -1903,7 +1904,9 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr,
assert(&SourceMgr.getSLocEntry(FID) == SLoc);

// Record the offset of this source-location entry.
SLocEntryOffsets.push_back(Stream.GetCurrentBitNo());
uint64_t Offset = Stream.GetCurrentBitNo() - SLocEntryOffsetsBase;
assert((Offset >> 32) == 0 && "SLocEntry offset too large");
SLocEntryOffsets.push_back(Offset);

// Figure out which record code to use.
unsigned Code;
Expand Down Expand Up @@ -2011,12 +2014,14 @@ void ASTWriter::WriteSourceManagerBlock(SourceManager &SourceMgr,
Abbrev->Add(BitCodeAbbrevOp(SOURCE_LOCATION_OFFSETS));
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // # of slocs
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 16)); // total size
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 32)); // base offset
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // offsets
unsigned SLocOffsetsAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
{
RecordData::value_type Record[] = {
SOURCE_LOCATION_OFFSETS, SLocEntryOffsets.size(),
SourceMgr.getNextLocalOffset() - 1 /* skip dummy */};
SourceMgr.getNextLocalOffset() - 1 /* skip dummy */,
SLocEntryOffsetsBase};
Stream.EmitRecordWithBlob(SLocOffsetsAbbrev, Record,
bytes(SLocEntryOffsets));
}
Expand Down Expand Up @@ -2093,9 +2098,11 @@ static bool shouldIgnoreMacro(MacroDirective *MD, bool IsModule,
/// Writes the block containing the serialized form of the
/// preprocessor.
void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
uint64_t MacroOffsetsBase = Stream.GetCurrentBitNo();

PreprocessingRecord *PPRec = PP.getPreprocessingRecord();
if (PPRec)
WritePreprocessorDetail(*PPRec);
WritePreprocessorDetail(*PPRec, MacroOffsetsBase);

RecordData Record;
RecordData ModuleMacroRecord;
Expand Down Expand Up @@ -2156,7 +2163,8 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
// identifier they belong to.
for (const IdentifierInfo *Name : MacroIdentifiers) {
MacroDirective *MD = PP.getLocalMacroDirectiveHistory(Name);
auto StartOffset = Stream.GetCurrentBitNo();
uint64_t StartOffset = Stream.GetCurrentBitNo() - MacroOffsetsBase;
assert((StartOffset >> 32) == 0 && "Macro identifiers offset too large");

// Emit the macro directives in reverse source order.
for (; MD; MD = MD->getPrevious()) {
Expand Down Expand Up @@ -2229,14 +2237,12 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {

// Record the local offset of this macro.
unsigned Index = ID - FirstMacroID;
if (Index == MacroOffsets.size())
MacroOffsets.push_back(Stream.GetCurrentBitNo());
else {
if (Index > MacroOffsets.size())
MacroOffsets.resize(Index + 1);
if (Index >= MacroOffsets.size())
MacroOffsets.resize(Index + 1);

MacroOffsets[Index] = Stream.GetCurrentBitNo();
}
uint64_t Offset = Stream.GetCurrentBitNo() - MacroOffsetsBase;
assert((Offset >> 32) == 0 && "Macro offset too large");
MacroOffsets[Index] = Offset;

AddIdentifierRef(Name, Record);
AddSourceLocation(MI->getDefinitionLoc(), Record);
Expand Down Expand Up @@ -2287,17 +2293,20 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
Abbrev->Add(BitCodeAbbrevOp(MACRO_OFFSET));
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // # of macros
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first ID
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 32)); // base offset
Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));

unsigned MacroOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
{
RecordData::value_type Record[] = {MACRO_OFFSET, MacroOffsets.size(),
FirstMacroID - NUM_PREDEF_MACRO_IDS};
FirstMacroID - NUM_PREDEF_MACRO_IDS,
MacroOffsetsBase};
Stream.EmitRecordWithBlob(MacroOffsetAbbrev, Record, bytes(MacroOffsets));
}
}

void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) {
void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec,
uint64_t MacroOffsetsBase) {
if (PPRec.local_begin() == PPRec.local_end())
return;

Expand Down Expand Up @@ -2334,8 +2343,10 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec) {
(void)++E, ++NumPreprocessingRecords, ++NextPreprocessorEntityID) {
Record.clear();

uint64_t Offset = Stream.GetCurrentBitNo() - MacroOffsetsBase;
assert((Offset >> 32) == 0 && "Preprocessed entity offset too large");
PreprocessedEntityOffsets.push_back(
PPEntityOffset((*E)->getSourceRange(), Stream.GetCurrentBitNo()));
PPEntityOffset((*E)->getSourceRange(), Offset));

if (auto *MD = dyn_cast<MacroDefinitionRecord>(*E)) {
// Record this macro definition's ID.
Expand Down Expand Up @@ -5144,7 +5155,7 @@ MacroID ASTWriter::getMacroID(MacroInfo *MI) {
return MacroIDs[MI];
}

uint64_t ASTWriter::getMacroDirectivesOffset(const IdentifierInfo *Name) {
uint32_t ASTWriter::getMacroDirectivesOffset(const IdentifierInfo *Name) {
return IdentMacroDirectivesOffsetMap.lookup(Name);
}

Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Serialization/ASTWriterDecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2434,12 +2434,12 @@ void ASTWriter::WriteDecl(ASTContext &Context, Decl *D) {
SourceLocation Loc = D->getLocation();
unsigned Index = ID - FirstDeclID;
if (DeclOffsets.size() == Index)
DeclOffsets.push_back(DeclOffset(Loc, Offset));
DeclOffsets.emplace_back(Loc, Offset);
else if (DeclOffsets.size() < Index) {
// FIXME: Can/should this happen?
DeclOffsets.resize(Index+1);
DeclOffsets[Index].setLocation(Loc);
DeclOffsets[Index].BitOffset = Offset;
DeclOffsets[Index].setBitOffset(Offset);
} else {
llvm_unreachable("declarations should be emitted in ID order");
}
Expand Down

0 comments on commit 30d5946

Please sign in to comment.