-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[clang][Diagnostics] Highlight code snippets #66514
Conversation
@llvm/pr-subscribers-clang ChangesAdd some primitive syntax highlighting to our code snippet output.Obviously this is kinda WIP and more of a hack in general, but IMO it increases readability of the source snippets (which people look at highlighted all the time anyway...) and LLDB does something similar, so let's see. -- 5 Files Affected:
diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h new file mode 100644 index 000000000000000..776954b59e2e1a8 --- /dev/null +++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h @@ -0,0 +1,46 @@ +//===--- CodeSnippetHighlighter.h - Code snippet highlighting ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_FRONTEND_CODESNIPPETHIGHLIGHTER_H +#define LLVM_CLANG_FRONTEND_CODESNIPPETHIGHLIGHTER_H + +#include "clang/Basic/LangOptions.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/raw_ostream.h" +#include <vector> + +namespace clang { + +struct StyleRange { + unsigned Start; + unsigned End; + const enum llvm::raw_ostream::Colors c; +}; + +class CodeSnippetHighlighter final { +public: + CodeSnippetHighlighter() = default; + + /// Produce StyleRanges for the given line. + /// The returned vector contains non-overlapping style ranges. They are sorted + /// from beginning of the line to the end. + std::vector<StyleRange> highlightLine(llvm::StringRef SourceLine, + const LangOptions &LangOpts); + +private: + bool Initialized = false; + /// Fills Keywords and Literals. + void ensureTokenData(); + + llvm::SmallSet<StringRef, 12> Keywords; + llvm::SmallSet<StringRef, 12> Literals; +}; + +} // namespace clang + +#endif diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h index 7eb0ab0cdc9bca8..39e09fe553dd4b9 100644 --- a/clang/include/clang/Frontend/TextDiagnostic.h +++ b/clang/include/clang/Frontend/TextDiagnostic.h @@ -15,6 +15,7 @@ #ifndef LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H #define LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H +#include "clang/Frontend/CodeSnippetHighlighter.h" #include "clang/Frontend/DiagnosticRenderer.h" namespace clang { @@ -33,6 +34,7 @@ namespace clang { /// printing coming out of libclang. class TextDiagnostic : public DiagnosticRenderer { raw_ostream &OS; + CodeSnippetHighlighter SnippetHighlighter; public: TextDiagnostic(raw_ostream &OS, @@ -104,7 +106,7 @@ class TextDiagnostic : public DiagnosticRenderer { ArrayRef<FixItHint> Hints); void emitSnippet(StringRef SourceLine, unsigned MaxLineNoDisplayWidth, - unsigned LineNo); + unsigned LineNo, bool A, const SourceManager &SM); void emitParseableFixits(ArrayRef<FixItHint> Hints, const SourceManager &SM); }; diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt index 1e5f0a859dfd568..f3547f771593093 100644 --- a/clang/lib/Frontend/CMakeLists.txt +++ b/clang/lib/Frontend/CMakeLists.txt @@ -42,6 +42,7 @@ add_clang_library(clangFrontend TextDiagnosticPrinter.cpp VerifyDiagnosticConsumer.cpp InterfaceStubFunctionsConsumer.cpp + CodeSnippetHighlighter.cpp DEPENDS ClangDriverOptions diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp new file mode 100644 index 000000000000000..829a533ad2692e5 --- /dev/null +++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp @@ -0,0 +1,120 @@ + +#include "clang/Frontend/CodeSnippetHighlighter.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Lex/Lexer.h" +#include "llvm/Support/raw_ostream.h" + +using namespace clang; + +void CodeSnippetHighlighter::ensureTokenData() { + if (Initialized) + return; + + // List of keywords, literals and types we want to highlight. + // These are best-effort, as is everything we do wrt. highlighting. + Keywords.insert("_Static_assert"); + Keywords.insert("auto"); + Keywords.insert("concept"); + Keywords.insert("const"); + Keywords.insert("consteval"); + Keywords.insert("constexpr"); + Keywords.insert("delete"); + Keywords.insert("do"); + Keywords.insert("else"); + Keywords.insert("final"); + Keywords.insert("for"); + Keywords.insert("if"); + Keywords.insert("mutable"); + Keywords.insert("namespace"); + Keywords.insert("new"); + Keywords.insert("private"); + Keywords.insert("public"); + Keywords.insert("requires"); + Keywords.insert("return"); + Keywords.insert("static"); + Keywords.insert("static_assert"); + Keywords.insert("using"); + Keywords.insert("void"); + Keywords.insert("volatile"); + Keywords.insert("while"); + + // Builtin types we highlight + Keywords.insert("void"); + Keywords.insert("char"); + Keywords.insert("short"); + Keywords.insert("int"); + Keywords.insert("unsigned"); + Keywords.insert("long"); + Keywords.insert("float"); + Keywords.insert("double"); + + Literals.insert("true"); + Literals.insert("false"); + Literals.insert("nullptr"); + + Initialized = true; +} + +static SourceManager createTempSourceManager() { + FileSystemOptions FileOpts; + FileManager FileMgr(FileOpts); + llvm::IntrusiveRefCntPtr<DiagnosticIDs> DiagIDs(new DiagnosticIDs()); + llvm::IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts(new DiagnosticOptions()); + DiagnosticsEngine diags(DiagIDs, DiagOpts); + return SourceManager(diags, FileMgr); +} + +static Lexer createTempLexer(llvm::MemoryBufferRef B, SourceManager &FakeSM, + const LangOptions &LangOpts) { + return Lexer(FakeSM.createFileID(B), B, FakeSM, LangOpts); +} + +std::vector<StyleRange> +CodeSnippetHighlighter::highlightLine(StringRef SourceLine, + const LangOptions &LangOpts) { + ensureTokenData(); + + constexpr raw_ostream::Colors CommentColor = raw_ostream::BLACK; + constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN; + constexpr raw_ostream::Colors KeywordColor = raw_ostream::YELLOW; + + const auto MemBuf = llvm::MemoryBuffer::getMemBuffer(SourceLine); + SourceManager FakeSM = createTempSourceManager(); + Lexer L = createTempLexer(MemBuf->getMemBufferRef(), FakeSM, LangOpts); + L.SetKeepWhitespaceMode(true); + + std::vector<StyleRange> Styles; + bool Stop = false; + while (!Stop) { + Token tok; + Stop = L.LexFromRawLexer(tok); + if (tok.is(tok::unknown)) + continue; + + bool Invalid; + unsigned Start = + FakeSM.getSpellingColumnNumber(tok.getLocation(), &Invalid) - 1; + if (Invalid) + continue; + + if (tok.is(tok::raw_identifier)) { + // Almost everything we lex is an identifier, since we use a raw lexer. + // Some should be highlightes as literals, others as keywords. + if (Keywords.contains(tok.getRawIdentifier())) + Styles.push_back( + StyleRange{Start, Start + tok.getLength(), KeywordColor}); + else if (Literals.contains(tok.getRawIdentifier())) + Styles.push_back( + StyleRange{Start, Start + tok.getLength(), LiteralColor}); + } else if (tok::isLiteral(tok.getKind())) { + Styles.push_back( + StyleRange{Start, Start + tok.getLength(), LiteralColor}); + } else if (tok.is(tok::comment)) { + Styles.push_back( + StyleRange{Start, Start + tok.getLength(), CommentColor}); + } + } + + return Styles; +} diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index eaa6e8d29a1dece..a7a2405d9bbae6d 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -11,6 +11,7 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileManager.h" #include "clang/Basic/SourceManager.h" +#include "clang/Frontend/CodeSnippetHighlighter.h" #include "clang/Lex/Lexer.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" @@ -1248,7 +1249,7 @@ void TextDiagnostic::emitSnippetAndCaret( } // Emit what we have computed. - emitSnippet(SourceLine, MaxLineNoDisplayWidth, DisplayLineNo); + emitSnippet(SourceLine, MaxLineNoDisplayWidth, DisplayLineNo, false, SM); if (!CaretLine.empty()) { indentForLineNumbers(); @@ -1278,7 +1279,11 @@ void TextDiagnostic::emitSnippetAndCaret( void TextDiagnostic::emitSnippet(StringRef SourceLine, unsigned MaxLineNoDisplayWidth, - unsigned LineNo) { + unsigned LineNo, bool IsSlash, + const SourceManager &SM) { + std::vector<StyleRange> Styles = + SnippetHighlighter.highlightLine(SourceLine, LangOpts); + // Emit line number. if (MaxLineNoDisplayWidth > 0) { unsigned LineNoDisplayWidth = getNumDisplayWidth(LineNo); @@ -1288,11 +1293,33 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine, // Print the source line one character at a time. bool PrintReversed = false; + bool HighlightingEnabled = DiagOpts->ShowColors; size_t I = 0; while (I < SourceLine.size()) { auto [Str, WasPrintable] = printableTextForNextCharacter(SourceLine, &I, DiagOpts->TabStop); + // Just stop highlighting anything for this line if we found a non-printable + // character. + if (!WasPrintable) + HighlightingEnabled = false; + + // FIXME: I hope we can do this in some nicer way. + if (HighlightingEnabled) { + std::optional<enum raw_ostream::Colors> H; + for (auto &P : Styles) { + if (P.Start < I && P.End >= I) { + H = P.c; + break; + } + } + + if (H) { + OS.changeColor(*H, false); + } else + OS.resetColor(); + } + // Toggle inverted colors on or off for this character. if (DiagOpts->ShowColors) { if (WasPrintable == PrintReversed) { |
85e8687
to
7ff6c1b
Compare
Ping |
This is neat! I think we could get This would let us call |
Totally agree.
Ah you meant the preprocessor object gets passed into the
That is essentially what I'd like to do but I was so far unable to do it :) |
Okay, that seems quite complex since a lexer with a preprocessor doesn't seem to return comments at all and the tokens I get are all identifiers and don't have their I'm open to suggestions of course, I don't use the lexer and preprocessor API very often. |
@tbaederr I think you need to use the preprocessor that exists ( which you get in Note that we don't technically need the preprocessor, just the underlying At least i think this would work? |
The |
Neat. I'm perfectly happy special casing |
It's also about readability - notice how two of the three terminals render the comments in literal black (which I chose actually), so they disappear entirely. |
This is a really great idea! I like the direction this is going! I think we need to find some way to get the source colors/have them be configurable. Is there some sort of 'source' of terminal coloring that we could use? Or a theme of some sort? At minimum, we need to let these colors be configurable in some way. Is there some sort of prior art in another tool we can copy from? Additionally, this needs to be disabled/disable-able via a command line switch eventually (as we do for our other coloring). It should perhaps be a separate disable switch from how we highlight 'error' or 'warning', but enabled at the same time? WDYT? |
Most terminal will let you remap the color escape sequences to something else, IE "white" might become black in a dark theme. Nothing per-application. Trying to come up with a way to do that in clang does seems out of scope to me. After all we don't do it for warning.
Agreed
|
I'm thinking something like
Yeah, perhaps. If we didn't support customization, stealing colors from an existing 'dark' theme'ed IDE is not a terrible idea.
|
Interestingly, I found |
If black is such a bad choice, then why is llvm-project/clang/lib/Frontend/TextDiagnostic.cpp Lines 27 to 38 in 2ed813f
Check mate, atheists! As a side note, I've had problems with the "note:" in clang diagnostics being invisible for... very long. You can see it being invisible in the second screenshot I posted, too. |
I guess that answers my question about |
All I know about colors in terminals is that it's a huge clusterfuck and we can only rely on a few colors without being incompatible with some terminal types, etc. If you e.g. check https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit, |
I will also merge #80023 once the CI is green. That will fix one potential source of problems. |
Sure, we are working on this and will share if we are successful. But please take into account it is often very non-trivial and time consuming to reduce a huge piece of internal code to a shareable state (might literally take days of work). Thus, having a way to proceed with further testing (== command-line flag to disable) in parallel with investigation is IMHO a must. |
#80023 does not fix the problem we're seeing here. The problem is extremely hard to reduce as it happens during preprocessing and even removing unrelated headers causes the crash to disappear. I've built
The failed assertion is in this block of code: const char *FirstLineStart =
FileData.data() +
SM.getDecomposedLoc(SM.translateLineCol(FID, StartLineNumber, 1)).second;
if (const char *CheckPoint = PP->getCheckPoint(FID, FirstLineStart)) {
// THIS IS THE FAILED ASSERTION:
assert(CheckPoint >= Buff->getBufferStart() &&
CheckPoint <= Buff->getBufferEnd());
assert(CheckPoint <= FirstLineStart);
size_t Offset = CheckPoint - Buff->getBufferStart();
L.seek(Offset, /*IsAtStartOfLine=*/false);
} The crash happens when the preprocessor outputs a warning related to this code: #ifndef __GTS_H__
#define __GTS_H__
#include <math.h>
#include <stdio.h>
// #include "glib.h"
#define G_STRINGIFY(macro_or_string) G_STRINGIFY_ARG (macro_or_string)
#define G_STRINGIFY_ARG(contents) #contents
#define _GLIB_GNUC_DO_PRAGMA(x) _Pragma(G_STRINGIFY (x))
#define GLIB_DEPRECATED_MACRO_FOR(f) \
_GLIB_GNUC_DO_PRAGMA(GCC warning G_STRINGIFY (Deprecated pre-processor symbol: replace with #f))
#define GLIB_DEPRECATED_MACRO_IN_2_48_FOR(f) GLIB_DEPRECATED_MACRO_FOR (f)
#define G_INLINE_FUNC static inline GLIB_DEPRECATED_MACRO_IN_2_48_FOR(static inline)
/* Class declarations for base types */
G_INLINE_FUNC
gpointer gts_object_is_from_class (gpointer object,
gpointer klass);
#endif /* __GTS_H__ */ In the previous code snippet the macro definitions are extracted from the
This crash is blocking our internal work at Google. I second @eaeltsin's proposal to put the |
@bgra8 Thank you! This is helpful.
Yeah, crashes happening at this stage can easily be insane to reduce manually.
As mentioned above, |
Disabling color diagnostics would make the job of other software engineers harder, right? This hammer is a bit too big for the problem at hand, right? |
I thought we are still talking about internal work being blocked on this, which is much more severe issue. |
We can't push internally a compiler that crashes unless we disable color diagnostics. |
I hope you realize this is self-imposed limitation that Clang community doesn't necessary have to respect. |
Well, the actual LLVM policy is to revert to green: https://llvm.org/docs/DeveloperPolicy.html#patch-reversion-policy
This I think qualifies as a serious problem as it introduces a crash which may occur in any existent codebase. Anyone using the compiler at trunk and encountering the crash has no way to know how to deal with it. |
Since we don't have a reproducer, the danger is that we revert the commit but never get such a reproducer, leaving us in a state where we can never re-commit safely. If you have some sort of internal deployment, why not revert the commit locally? Or just apply something like --- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1350,7 +1350,7 @@ void TextDiagnostic::emitSnippetAndCaret(
// emit, starting from the first line.
std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles =
highlightLines(BufStart, Lines.first, Lines.second, PP, LangOpts,
- DiagOpts->ShowColors, FID, SM);
+ false, FID, SM);
SmallVector<LineRange> LineRanges =
prepareAndFilterRanges(Ranges, SM, Lines, FID, LangOpts); That is, unless 6d1d2c6 actually fixed the problem already, which would be good to check. |
I think it's worth remembering that downstreams carry additional changes that the community has no visibility into, so the mere presence of a crash does not actually identify this change as the culprit without more investigation. The community cannot perform that investigation ourselves, we need Google to do that from their downstream or we need a reproducer that happens with vanilla Clang. Do you see this crash with community clang and no downstream changes? In the short term (next 24hrs), I think the existing flag to disable color diagnostics should get you a path forward; but if you disagree, your downstream can certainly revert this patch locally until you're able to provide us with a way to reproduce the issue or have at least verified that the issue is not caused by changes in your downstream. But if we get any evidence that other downstreams or users are hitting this or the issue is happening for Google's test case with a vanilla Clang, then I agree that a revert is appropriate until we have a solution. |
Yes I can reproduce it with a vanila
This was already reported by another user in #80127.
I already mentioned this does not fix the crash. |
Thank you, that's good to know you hit it with vanilla Clang! I think that between this information and the backtrace in #80127 we have enough of a reason to revert without a reproducer at hand. However, I think that if we don't have a reproducer by mid-next week (say 22:00 UTC on 2/7, but the date/time are negotiable) we should re-land the changes as-is to try to get more instances of the crash happening in the wild to help us solve the issue. WDYT? |
Ok, the problem is in clang/lib/Frontend/TextDiagnostic.cpp:1352:
Looks like passing Replacing this to
makes the crash go away. I don't know this code so cannot comment more on what actually happens. The only thing to add is that the crash occurred because offset returned in line 1154 was way out of bounds of buffer created at line 1148. Please fix. |
Great find @eaeltsin ! |
…lvm#66514) Implements the fix proposed by Evgeny Eltsin on llvm#66514 (comment). No test case provided, since the bug is extremely sensitive to the preprocessor state (headers, macros, including the ones defined on command line), and it turned out to be non-trivial to create an isolated test.
…66514) (#80442) Implements the fix proposed by Evgeny Eltsin on #66514 (comment). No test case provided, since the bug is extremely sensitive to the preprocessor state (headers, macros, including the ones defined on command line), and it turned out to be non-trivial to create an isolated test.
I've added my findings with regard to this issue in #80442 (comment) Hope that helps understanding the problem. |
…lvm#66514) (llvm#80442) Implements the fix proposed by Evgeny Eltsin on llvm#66514 (comment). No test case provided, since the bug is extremely sensitive to the preprocessor state (headers, macros, including the ones defined on command line), and it turned out to be non-trivial to create an isolated test.
Since llvm#66514 Clang relies on the FileIDs and buffer pointers stored within Preprocessor to perform syntax highlighting in diagnostics. With compilation caching, we round-trip diagnostics through serialization that uses a separate copy of SourceManager. This assigns different FileIDs and allocates buffers anew, breaking invariants of the consumer. This is not easy to test, since we'd rely on the order FileIDs get created, where in memory the buffers get allocated and the checkpoint interval. rdar://134198448
…9163) Since llvm#66514 Clang relies on the FileIDs and buffer pointers stored within Preprocessor to perform syntax highlighting in diagnostics. With compilation caching, we round-trip diagnostics through serialization that uses a separate copy of SourceManager. This assigns different FileIDs and allocates buffers anew, breaking invariants of the consumer. This fix is not easy to test, since we'd rely on the order FileIDs get created, where in memory the buffers get allocated and the checkpoint interval. rdar://134198448
…9163) Since llvm#66514 Clang relies on the FileIDs and buffer pointers stored within Preprocessor to perform syntax highlighting in diagnostics. With compilation caching, we round-trip diagnostics through serialization that uses a separate copy of SourceManager. This assigns different FileIDs and allocates buffers anew, breaking invariants of the consumer. This fix is not easy to test, since we'd rely on the order FileIDs get created, where in memory the buffers get allocated and the checkpoint interval. rdar://134198448 (cherry picked from commit 3613d18)
…9173) Since llvm#66514 Clang relies on the FileIDs and buffer pointers stored within Preprocessor to perform syntax highlighting in diagnostics. With compilation caching, we round-trip diagnostics through serialization that uses a separate copy of SourceManager. This assigns different FileIDs and allocates buffers anew, breaking invariants of the consumer. This fix is not easy to test, since we'd rely on the order FileIDs get created, where in memory the buffers get allocated and the checkpoint interval. rdar://134198448 (cherry picked from commit 3613d18)
Add some primitive syntax highlighting to our code snippet output.
Before:
After:
Obviously this is kinda WIP and more of a hack in general, but IMO it increases readability of the source snippets (which people look at highlighted all the time anyway...) and LLDB does something similar, so let's see.