From 0222243bba0097a3263446c2a49f19bea0bbe9e8 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 23 Jul 2022 19:28:02 +0800 Subject: [PATCH] Improve code diff highlight, fix incorrect rendered diff result (#19958) Use Unicode placeholders to replace HTML tags and HTML entities first, then do diff, then recover the HTML tags and HTML entities. Now the code diff with highlight has stable behavior, and won't emit broken tags. --- modules/highlight/highlight.go | 8 +- services/gitdiff/gitdiff.go | 312 ++----------------------- services/gitdiff/gitdiff_test.go | 88 +------ services/gitdiff/highlightdiff.go | 223 ++++++++++++++++++ services/gitdiff/highlightdiff_test.go | 126 ++++++++++ 5 files changed, 379 insertions(+), 378 deletions(-) create mode 100644 services/gitdiff/highlightdiff.go create mode 100644 services/gitdiff/highlightdiff_test.go diff --git a/modules/highlight/highlight.go b/modules/highlight/highlight.go index acd3bebb9f408..6832207c0fc75 100644 --- a/modules/highlight/highlight.go +++ b/modules/highlight/highlight.go @@ -40,9 +40,11 @@ var ( // NewContext loads custom highlight map from local config func NewContext() { once.Do(func() { - keys := setting.Cfg.Section("highlight.mapping").Keys() - for i := range keys { - highlightMapping[keys[i].Name()] = keys[i].Value() + if setting.Cfg != nil { + keys := setting.Cfg.Section("highlight.mapping").Keys() + for i := range keys { + highlightMapping[keys[i].Name()] = keys[i].Value() + } } // The size 512 is simply a conservative rule of thumb diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 6e8c149dabd94..6dd237bbc8d47 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -15,7 +15,6 @@ import ( "io" "net/url" "os" - "regexp" "sort" "strings" "time" @@ -40,7 +39,7 @@ import ( "golang.org/x/text/transform" ) -// DiffLineType represents the type of a DiffLine. +// DiffLineType represents the type of DiffLine. type DiffLineType uint8 // DiffLineType possible values. @@ -51,7 +50,7 @@ const ( DiffLineSection ) -// DiffFileType represents the type of a DiffFile. +// DiffFileType represents the type of DiffFile. type DiffFileType uint8 // DiffFileType possible values. @@ -100,12 +99,12 @@ type DiffLineSectionInfo struct { // BlobExcerptChunkSize represent max lines of excerpt const BlobExcerptChunkSize = 20 -// GetType returns the type of a DiffLine. +// GetType returns the type of DiffLine. func (d *DiffLine) GetType() int { return int(d.Type) } -// CanComment returns whether or not a line can get commented +// CanComment returns whether a line can get commented func (d *DiffLine) CanComment() bool { return len(d.Comments) == 0 && d.Type != DiffLineSection } @@ -191,287 +190,13 @@ var ( codeTagSuffix = []byte(``) ) -var ( - unfinishedtagRegex = regexp.MustCompile(`<[^>]*$`) - trailingSpanRegex = regexp.MustCompile(`]?$`) - entityRegex = regexp.MustCompile(`&[#]*?[0-9[:alpha:]]*$`) -) - -// shouldWriteInline represents combinations where we manually write inline changes -func shouldWriteInline(diff diffmatchpatch.Diff, lineType DiffLineType) bool { - if true && - diff.Type == diffmatchpatch.DiffEqual || - diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd || - diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel { - return true - } - return false -} - -func fixupBrokenSpans(diffs []diffmatchpatch.Diff) []diffmatchpatch.Diff { - // Create a new array to store our fixed up blocks - fixedup := make([]diffmatchpatch.Diff, 0, len(diffs)) - - // semantically label some numbers - const insert, delete, equal = 0, 1, 2 - - // record the positions of the last type of each block in the fixedup blocks - last := []int{-1, -1, -1} - operation := []diffmatchpatch.Operation{diffmatchpatch.DiffInsert, diffmatchpatch.DiffDelete, diffmatchpatch.DiffEqual} - - // create a writer for insert and deletes - toWrite := []strings.Builder{ - {}, - {}, - } - - // make some flags for insert and delete - unfinishedTag := []bool{false, false} - unfinishedEnt := []bool{false, false} - - // store stores the provided text in the writer for the typ - store := func(text string, typ int) { - (&(toWrite[typ])).WriteString(text) - } - - // hasStored returns true if there is stored content - hasStored := func(typ int) bool { - return (&toWrite[typ]).Len() > 0 - } - - // stored will return that content - stored := func(typ int) string { - return (&toWrite[typ]).String() - } - - // empty will empty the stored content - empty := func(typ int) { - (&toWrite[typ]).Reset() - } - - // pop will remove the stored content appending to a diff block for that typ - pop := func(typ int, fixedup []diffmatchpatch.Diff) []diffmatchpatch.Diff { - if hasStored(typ) { - if last[typ] > last[equal] { - fixedup[last[typ]].Text += stored(typ) - } else { - fixedup = append(fixedup, diffmatchpatch.Diff{ - Type: operation[typ], - Text: stored(typ), - }) - } - empty(typ) - } - return fixedup - } - - // Now we walk the provided diffs and check the type of each block in turn - for _, diff := range diffs { - - typ := delete // flag for handling insert or delete typs - switch diff.Type { - case diffmatchpatch.DiffEqual: - // First check if there is anything stored - if hasStored(insert) || hasStored(delete) { - // There are two reasons for storing content: - // 1. Unfinished Entity <- Could be more efficient here by not doing this if we're looking for a tag - if unfinishedEnt[insert] || unfinishedEnt[delete] { - // we look for a ';' to finish an entity - idx := strings.IndexRune(diff.Text, ';') - if idx >= 0 { - // if we find a ';' store the preceding content to both insert and delete - store(diff.Text[:idx+1], insert) - store(diff.Text[:idx+1], delete) - - // and remove it from this block - diff.Text = diff.Text[idx+1:] - - // reset the ent flags - unfinishedEnt[insert] = false - unfinishedEnt[delete] = false - } else { - // otherwise store it all on insert and delete - store(diff.Text, insert) - store(diff.Text, delete) - // and empty this block - diff.Text = "" - } - } - // 2. Unfinished Tag - if unfinishedTag[insert] || unfinishedTag[delete] { - // we look for a '>' to finish a tag - idx := strings.IndexRune(diff.Text, '>') - if idx >= 0 { - store(diff.Text[:idx+1], insert) - store(diff.Text[:idx+1], delete) - diff.Text = diff.Text[idx+1:] - unfinishedTag[insert] = false - unfinishedTag[delete] = false - } else { - store(diff.Text, insert) - store(diff.Text, delete) - diff.Text = "" - } - } - - // If we've completed the required tag/entities - if !(unfinishedTag[insert] || unfinishedTag[delete] || unfinishedEnt[insert] || unfinishedEnt[delete]) { - // pop off the stack - fixedup = pop(insert, fixedup) - fixedup = pop(delete, fixedup) - } - - // If that has left this diff block empty then shortcut - if len(diff.Text) == 0 { - continue - } - } - - // check if this block ends in an unfinished tag? - idx := unfinishedtagRegex.FindStringIndex(diff.Text) - if idx != nil { - unfinishedTag[insert] = true - unfinishedTag[delete] = true - } else { - // otherwise does it end in an unfinished entity? - idx = entityRegex.FindStringIndex(diff.Text) - if idx != nil { - unfinishedEnt[insert] = true - unfinishedEnt[delete] = true - } - } - - // If there is an unfinished component - if idx != nil { - // Store the fragment - store(diff.Text[idx[0]:], insert) - store(diff.Text[idx[0]:], delete) - // and remove it from this block - diff.Text = diff.Text[:idx[0]] - } - - // If that hasn't left the block empty - if len(diff.Text) > 0 { - // store the position of the last equal block and store it in our diffs - last[equal] = len(fixedup) - fixedup = append(fixedup, diff) - } - continue - case diffmatchpatch.DiffInsert: - typ = insert - fallthrough - case diffmatchpatch.DiffDelete: - // First check if there is anything stored for this type - if hasStored(typ) { - // if there is prepend it to this block, empty the storage and reset our flags - diff.Text = stored(typ) + diff.Text - empty(typ) - unfinishedEnt[typ] = false - unfinishedTag[typ] = false - } - - // check if this block ends in an unfinished tag - idx := unfinishedtagRegex.FindStringIndex(diff.Text) - if idx != nil { - unfinishedTag[typ] = true - } else { - // otherwise does it end in an unfinished entity - idx = entityRegex.FindStringIndex(diff.Text) - if idx != nil { - unfinishedEnt[typ] = true - } - } - - // If there is an unfinished component - if idx != nil { - // Store the fragment - store(diff.Text[idx[0]:], typ) - // and remove it from this block - diff.Text = diff.Text[:idx[0]] - } - - // If that hasn't left the block empty - if len(diff.Text) > 0 { - // if the last block of this type was after the last equal block - if last[typ] > last[equal] { - // store this blocks content on that block - fixedup[last[typ]].Text += diff.Text - } else { - // otherwise store the position of the last block of this type and store the block - last[typ] = len(fixedup) - fixedup = append(fixedup, diff) - } - } - continue - } - } - - // pop off any remaining stored content - fixedup = pop(insert, fixedup) - fixedup = pop(delete, fixedup) - - return fixedup -} - -func diffToHTML(fileName string, diffs []diffmatchpatch.Diff, lineType DiffLineType) DiffInline { +func diffToHTML(lineWrapperTags []string, diffs []diffmatchpatch.Diff, lineType DiffLineType) string { buf := bytes.NewBuffer(nil) - match := "" - - diffs = fixupBrokenSpans(diffs) - + // restore the line wrapper tags and , if necessary + for _, tag := range lineWrapperTags { + buf.WriteString(tag) + } for _, diff := range diffs { - if shouldWriteInline(diff, lineType) { - if len(match) > 0 { - diff.Text = match + diff.Text - match = "" - } - // Chroma HTML syntax highlighting is done before diffing individual lines in order to maintain consistency. - // Since inline changes might split in the middle of a chroma span tag or HTML entity, make we manually put it back together - // before writing so we don't try insert added/removed code spans in the middle of one of those - // and create broken HTML. This is done by moving incomplete HTML forward until it no longer matches our pattern of - // a line ending with an incomplete HTML entity or partial/opening . - - // EX: - // diffs[{Type: dmp.DiffDelete, Text: "language}] - - // After first iteration - // diffs[{Type: dmp.DiffDelete, Text: "language"}, //write out - // {Type: dmp.DiffEqual, Text: ",}] - - // After second iteration - // {Type: dmp.DiffEqual, Text: ""}, // write out - // {Type: dmp.DiffDelete, Text: ",}] - - // Final - // {Type: dmp.DiffDelete, Text: ",}] - // end up writing , - // Instead of lass="p", - - m := trailingSpanRegex.FindStringSubmatchIndex(diff.Text) - if m != nil { - match = diff.Text[m[0]:m[1]] - diff.Text = strings.TrimSuffix(diff.Text, match) - } - m = entityRegex.FindStringSubmatchIndex(diff.Text) - if m != nil { - match = diff.Text[m[0]:m[1]] - diff.Text = strings.TrimSuffix(diff.Text, match) - } - // Print an existing closing span first before opening added/remove-code span so it doesn't unintentionally close it - if strings.HasPrefix(diff.Text, "") { - buf.WriteString("") - diff.Text = strings.TrimPrefix(diff.Text, "") - } - // If we weren't able to fix it then this should avoid broken HTML by not inserting more spans below - // The previous/next diff section will contain the rest of the tag that is missing here - if strings.Count(diff.Text, "<") != strings.Count(diff.Text, ">") { - buf.WriteString(diff.Text) - continue - } - } switch { case diff.Type == diffmatchpatch.DiffEqual: buf.WriteString(diff.Text) @@ -485,7 +210,10 @@ func diffToHTML(fileName string, diffs []diffmatchpatch.Diff, lineType DiffLineT buf.Write(codeTagSuffix) } } - return DiffInlineWithUnicodeEscape(template.HTML(buf.String())) + for range lineWrapperTags { + buf.WriteString("") + } + return buf.String() } // GetLine gets a specific line by type (add or del) and file line number @@ -597,10 +325,12 @@ func (diffSection *DiffSection) GetComputedInlineDiffFor(diffLine *DiffLine) Dif return DiffInlineWithHighlightCode(diffSection.FileName, language, diffLine.Content) } - diffRecord := diffMatchPatch.DiffMain(highlight.Code(diffSection.FileName, language, diff1[1:]), highlight.Code(diffSection.FileName, language, diff2[1:]), true) - diffRecord = diffMatchPatch.DiffCleanupEfficiency(diffRecord) - - return diffToHTML(diffSection.FileName, diffRecord, diffLine.Type) + hcd := newHighlightCodeDiff() + diffRecord := hcd.diffWithHighlight(diffSection.FileName, language, diff1[1:], diff2[1:]) + // it seems that Gitea doesn't need the line wrapper of Chroma, so do not add them back + // if the line wrappers are still needed in the future, it can be added back by "diffToHTML(hcd.lineWrapperTags. ...)" + diffHTML := diffToHTML(nil, diffRecord, diffLine.Type) + return DiffInlineWithUnicodeEscape(template.HTML(diffHTML)) } // DiffFile represents a file diff. @@ -1289,7 +1019,7 @@ func readFileName(rd *strings.Reader) (string, bool) { if char == '"' { fmt.Fscanf(rd, "%q ", &name) if len(name) == 0 { - log.Error("Reader has no file name: %v", rd) + log.Error("Reader has no file name: reader=%+v", rd) return "", true } @@ -1311,7 +1041,7 @@ func readFileName(rd *strings.Reader) (string, bool) { } } if len(name) < 2 { - log.Error("Unable to determine name from reader: %v", rd) + log.Error("Unable to determine name from reader: reader=%+v", rd) return "", true } return name[2:], ambiguity diff --git a/services/gitdiff/gitdiff_test.go b/services/gitdiff/gitdiff_test.go index caca0e91d8f13..e88d831759b7b 100644 --- a/services/gitdiff/gitdiff_test.go +++ b/services/gitdiff/gitdiff_test.go @@ -7,7 +7,6 @@ package gitdiff import ( "fmt" - "html/template" "strconv" "strings" "testing" @@ -17,93 +16,27 @@ import ( "code.gitea.io/gitea/models/unittest" user_model "code.gitea.io/gitea/models/user" "code.gitea.io/gitea/modules/git" - "code.gitea.io/gitea/modules/highlight" "code.gitea.io/gitea/modules/json" "code.gitea.io/gitea/modules/setting" dmp "github.com/sergi/go-diff/diffmatchpatch" "github.com/stretchr/testify/assert" - "gopkg.in/ini.v1" ) -func assertEqual(t *testing.T, s1 string, s2 template.HTML) { - if s1 != string(s2) { - t.Errorf("Did not receive expected results:\nExpected: %s\nActual: %s", s1, s2) - } -} - func TestDiffToHTML(t *testing.T) { - setting.Cfg = ini.Empty() - assertEqual(t, "foo bar biz", diffToHTML("", []dmp.Diff{ + assert.Equal(t, "foo bar biz", diffToHTML(nil, []dmp.Diff{ {Type: dmp.DiffEqual, Text: "foo "}, {Type: dmp.DiffInsert, Text: "bar"}, {Type: dmp.DiffDelete, Text: " baz"}, {Type: dmp.DiffEqual, Text: " biz"}, - }, DiffLineAdd).Content) + }, DiffLineAdd)) - assertEqual(t, "foo bar biz", diffToHTML("", []dmp.Diff{ + assert.Equal(t, "foo bar biz", diffToHTML(nil, []dmp.Diff{ {Type: dmp.DiffEqual, Text: "foo "}, {Type: dmp.DiffDelete, Text: "bar"}, {Type: dmp.DiffInsert, Text: " baz"}, {Type: dmp.DiffEqual, Text: " biz"}, - }, DiffLineDel).Content) - - assertEqual(t, "if !nohl && (lexer != nil || r.GuessLanguage) {", diffToHTML("", []dmp.Diff{ - {Type: dmp.DiffEqual, Text: "if !nohl && (lexer != nil"}, - {Type: dmp.DiffInsert, Text: " || r.GuessLanguage)"}, - {Type: dmp.DiffEqual, Text: " {"}, - }, DiffLineAdd).Content) - - assertEqual(t, "tagURL := fmt.Sprintf("## [%s](%s/%s/%s/%s?q=&type=all&state=closed&milestone=%d) - %s", ge.Milestone\", ge.BaseURL, ge.Owner, ge.Repo, from, milestoneID, time.Now().Format("2006-01-02"))", diffToHTML("", []dmp.Diff{ - {Type: dmp.DiffEqual, Text: "tagURL := fmt.Sprintf("## [%s](%s/%s/%s/%s?q=&type=all&state=closed&milestone=%d) - %s", ge.Milestone\""}, - {Type: dmp.DiffInsert, Text: "f\">getGiteaTagURL(client"}, - {Type: dmp.DiffEqual, Text: ", ge.BaseURL, ge.Owner, ge.Repo, "}, - {Type: dmp.DiffDelete, Text: "from, milestoneID, time.Now().Format("2006-01-02")"}, - {Type: dmp.DiffInsert, Text: "ge.Milestone, from, milestoneID"}, - {Type: dmp.DiffEqual, Text: ")"}, - }, DiffLineDel).Content) - - assertEqual(t, "r.WrapperRenderer(w, language, true, attrs, false)", diffToHTML("", []dmp.Diff{ - {Type: dmp.DiffEqual, Text: "r.WrapperRenderer(w, "}, - {Type: dmp.DiffDelete, Text: "language, true, attrs"}, - {Type: dmp.DiffEqual, Text: ", false)"}, - }, DiffLineDel).Content) - - assertEqual(t, "language, true, attrs, false)", diffToHTML("", []dmp.Diff{ - {Type: dmp.DiffInsert, Text: "language, true, attrs"}, - {Type: dmp.DiffEqual, Text: ", false)"}, - }, DiffLineAdd).Content) - - assertEqual(t, "print("// ", sys.argv)", diffToHTML("", []dmp.Diff{ - {Type: dmp.DiffEqual, Text: "print"}, - {Type: dmp.DiffInsert, Text: "("}, - {Type: dmp.DiffEqual, Text: ""// ", sys.argv"}, - {Type: dmp.DiffInsert, Text: ")"}, - }, DiffLineAdd).Content) - - assertEqual(t, "sh 'useradd -u $(stat -c "%u" .gitignore) jenkins'", diffToHTML("", []dmp.Diff{ - {Type: dmp.DiffEqual, Text: "sh "}, - {Type: dmp.DiffDelete, Text: "4;useradd -u 111 jenkins""}, - {Type: dmp.DiffInsert, Text: "9;useradd -u $(stat -c "%u" .gitignore) jenkins'"}, - {Type: dmp.DiffEqual, Text: ";"}, - }, DiffLineAdd).Content) - - assertEqual(t, " <h4 class="release-list-title df ac">", diffToHTML("", []dmp.Diff{ - {Type: dmp.DiffEqual, Text: " <h"}, - {Type: dmp.DiffInsert, Text: "4 class=&#"}, - {Type: dmp.DiffEqual, Text: "3"}, - {Type: dmp.DiffInsert, Text: "4;release-list-title df ac""}, - {Type: dmp.DiffEqual, Text: ">"}, - }, DiffLineAdd).Content) + }, DiffLineDel)) } func TestParsePatch_skipTo(t *testing.T) { @@ -592,7 +525,6 @@ index 0000000..6bb8f39 if err != nil { t.Errorf("ParsePatch failed: %s", err) } - println(result) diff2 := `diff --git "a/A \\ B" "b/A \\ B" --- "a/A \\ B" @@ -712,18 +644,6 @@ func TestGetDiffRangeWithWhitespaceBehavior(t *testing.T) { } } -func TestDiffToHTML_14231(t *testing.T) { - setting.Cfg = ini.Empty() - diffRecord := diffMatchPatch.DiffMain(highlight.Code("main.v", "", " run()\n"), highlight.Code("main.v", "", " run(db)\n"), true) - diffRecord = diffMatchPatch.DiffCleanupEfficiency(diffRecord) - - expected := ` run(db) -` - output := diffToHTML("main.v", diffRecord, DiffLineAdd) - - assertEqual(t, expected, output.Content) -} - func TestNoCrashes(t *testing.T) { type testcase struct { gitdiff string diff --git a/services/gitdiff/highlightdiff.go b/services/gitdiff/highlightdiff.go new file mode 100644 index 0000000000000..4ceada4d7ec95 --- /dev/null +++ b/services/gitdiff/highlightdiff.go @@ -0,0 +1,223 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package gitdiff + +import ( + "strings" + + "code.gitea.io/gitea/modules/highlight" + + "github.com/sergi/go-diff/diffmatchpatch" +) + +// token is a html tag or entity, eg: "", "", "<" +func extractHTMLToken(s string) (before, token, after string, valid bool) { + for pos1 := 0; pos1 < len(s); pos1++ { + if s[pos1] == '<' { + pos2 := strings.IndexByte(s[pos1:], '>') + if pos2 == -1 { + return "", "", s, false + } + return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true + } else if s[pos1] == '&' { + pos2 := strings.IndexByte(s[pos1:], ';') + if pos2 == -1 { + return "", "", s, false + } + return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true + } + } + return "", "", s, true +} + +// highlightCodeDiff is used to do diff with highlighted HTML code. +// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. +// The HTML tags and entities will be replaced by Unicode placeholders: "{TEXT}" => "\uE000{TEXT}\uE001" +// These Unicode placeholders are friendly to the diff. +// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities. +// It's guaranteed that the tags in final diff result are paired correctly. +type highlightCodeDiff struct { + placeholderBegin rune + placeholderMaxCount int + placeholderIndex int + placeholderTokenMap map[rune]string + tokenPlaceholderMap map[string]rune + + placeholderOverflowCount int + + lineWrapperTags []string +} + +func newHighlightCodeDiff() *highlightCodeDiff { + return &highlightCodeDiff{ + placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD) + placeholderMaxCount: 64000, + placeholderTokenMap: map[rune]string{}, + tokenPlaceholderMap: map[string]rune{}, + } +} + +// nextPlaceholder returns 0 if no more placeholder can be used +// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line +// so the placeholderMaxCount is impossible to be exhausted in real cases. +func (hcd *highlightCodeDiff) nextPlaceholder() rune { + for hcd.placeholderIndex < hcd.placeholderMaxCount { + r := hcd.placeholderBegin + rune(hcd.placeholderIndex) + hcd.placeholderIndex++ + // only use non-existing (not used by code) rune as placeholders + if _, ok := hcd.placeholderTokenMap[r]; !ok { + return r + } + } + return 0 // no more available placeholder +} + +func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool { + return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount) +} + +func (hcd *highlightCodeDiff) collectUsedRunes(code string) { + for _, r := range code { + if hcd.isInPlaceholderRange(r) { + // put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore. + hcd.placeholderTokenMap[r] = "" + } + } +} + +func (hcd *highlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff { + hcd.collectUsedRunes(codeA) + hcd.collectUsedRunes(codeB) + + highlightCodeA := highlight.Code(filename, language, codeA) + highlightCodeB := highlight.Code(filename, language, codeB) + + highlightCodeA = hcd.convertToPlaceholders(highlightCodeA) + highlightCodeB = hcd.convertToPlaceholders(highlightCodeB) + + diffs := diffMatchPatch.DiffMain(highlightCodeA, highlightCodeB, true) + diffs = diffMatchPatch.DiffCleanupEfficiency(diffs) + + for i := range diffs { + hcd.recoverOneDiff(&diffs[i]) + } + return diffs +} + +// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes. +func (hcd *highlightCodeDiff) convertToPlaceholders(htmlCode string) string { + var tagStack []string + res := strings.Builder{} + + firstRunForLineTags := hcd.lineWrapperTags == nil + + var beforeToken, token string + var valid bool + + // the standard chroma highlight HTML is " ... " + for { + beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode) + if !valid || token == "" { + break + } + // write the content before the token into result string, and consume the token in the string + res.WriteString(beforeToken) + + // the line wrapper tags should be removed before diff + if strings.HasPrefix(token, `") + continue + } + + var tokenInMap string + if strings.HasSuffix(token, "" for "" + tokenInMap = token + "" + tagStack = tagStack[:len(tagStack)-1] + } else if token[0] == '<' { // for opening tag + tokenInMap = token + tagStack = append(tagStack, token) + } else if token[0] == '&' { // for html entity + tokenInMap = token + } // else: impossible + + // remember the placeholder and token in the map + placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap] + if !ok { + placeholder = hcd.nextPlaceholder() + if placeholder != 0 { + hcd.tokenPlaceholderMap[tokenInMap] = placeholder + hcd.placeholderTokenMap[placeholder] = tokenInMap + } + } + + if placeholder != 0 { + res.WriteRune(placeholder) // use the placeholder to replace the token + } else { + // unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting + // usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma. + hcd.placeholderOverflowCount++ + if strings.HasPrefix(token, "&") { + // when the token is a html entity, something must be outputted even if there is no placeholder. + res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully? + res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result. + } + } + } + + // write the remaining string + res.WriteString(htmlCode) + return res.String() +} + +func (hcd *highlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) { + sb := strings.Builder{} + var tagStack []string + + for _, r := range diff.Text { + token, ok := hcd.placeholderTokenMap[r] + if !ok || token == "" { + sb.WriteRune(r) // if the rune is not a placeholder, write it as it is + continue + } + var tokenToRecover string + if strings.HasPrefix(token, "')+1] + if len(tagStack) == 0 { + continue // if no opening tag in stack yet, skip the closing tag + } + tagStack = tagStack[:len(tagStack)-1] + } else if token[0] == '<' { // for opening tag + tokenToRecover = token + tagStack = append(tagStack, token) + } else if token[0] == '&' { // for html entity + tokenToRecover = token + } // else: impossible + sb.WriteString(tokenToRecover) + } + + if len(tagStack) > 0 { + // close all opening tags + for i := len(tagStack) - 1; i >= 0; i-- { + tagToClose := tagStack[i] + // get the closing tag "" from "" or "" + pos := strings.IndexAny(tagToClose, " >") + if pos != -1 { + sb.WriteString("") + } // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag + } + } + + diff.Text = sb.String() +} diff --git a/services/gitdiff/highlightdiff_test.go b/services/gitdiff/highlightdiff_test.go new file mode 100644 index 0000000000000..1cd78bc94272d --- /dev/null +++ b/services/gitdiff/highlightdiff_test.go @@ -0,0 +1,126 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package gitdiff + +import ( + "fmt" + "strings" + "testing" + + "github.com/sergi/go-diff/diffmatchpatch" + "github.com/stretchr/testify/assert" +) + +func TestDiffWithHighlight(t *testing.T) { + hcd := newHighlightCodeDiff() + diffs := hcd.diffWithHighlight( + "main.v", "", + " run('<>')\n", + " run(db)\n", + ) + + expected := ` run('<>')` + "\n" + output := diffToHTML(nil, diffs, DiffLineDel) + assert.Equal(t, expected, output) + + expected = ` run(db)` + "\n" + output = diffToHTML(nil, diffs, DiffLineAdd) + assert.Equal(t, expected, output) + + hcd = newHighlightCodeDiff() + hcd.placeholderTokenMap['O'] = "" + hcd.placeholderTokenMap['C'] = "" + diff := diffmatchpatch.Diff{} + + diff.Text = "OC" + hcd.recoverOneDiff(&diff) + assert.Equal(t, "", diff.Text) + + diff.Text = "O" + hcd.recoverOneDiff(&diff) + assert.Equal(t, "", diff.Text) + + diff.Text = "C" + hcd.recoverOneDiff(&diff) + assert.Equal(t, "", diff.Text) +} + +func TestDiffWithHighlightPlaceholder(t *testing.T) { + hcd := newHighlightCodeDiff() + diffs := hcd.diffWithHighlight( + "main.js", "", + "a='\U00100000'", + "a='\U0010FFFD''", + ) + assert.Equal(t, "", hcd.placeholderTokenMap[0x00100000]) + assert.Equal(t, "", hcd.placeholderTokenMap[0x0010FFFD]) + + expected := fmt.Sprintf(`a='%s'`, "\U00100000") + output := diffToHTML(hcd.lineWrapperTags, diffs, DiffLineDel) + assert.Equal(t, expected, output) + + hcd = newHighlightCodeDiff() + diffs = hcd.diffWithHighlight( + "main.js", "", + "a='\U00100000'", + "a='\U0010FFFD'", + ) + expected = fmt.Sprintf(`a='%s'`, "\U0010FFFD") + output = diffToHTML(nil, diffs, DiffLineAdd) + assert.Equal(t, expected, output) +} + +func TestDiffWithHighlightPlaceholderExhausted(t *testing.T) { + hcd := newHighlightCodeDiff() + hcd.placeholderMaxCount = 0 + diffs := hcd.diffWithHighlight( + "main.js", "", + "'", + ``, + ) + output := diffToHTML(nil, diffs, DiffLineDel) + expected := fmt.Sprintf(`%s#39;`, "\uFFFD") + assert.Equal(t, expected, output) + + hcd = newHighlightCodeDiff() + hcd.placeholderMaxCount = 0 + diffs = hcd.diffWithHighlight( + "main.js", "", + "a < b", + "a > b", + ) + output = diffToHTML(nil, diffs, DiffLineDel) + expected = fmt.Sprintf(`a %slt; b`, "\uFFFD") + assert.Equal(t, expected, output) + + output = diffToHTML(nil, diffs, DiffLineAdd) + expected = fmt.Sprintf(`a %sgt; b`, "\uFFFD") + assert.Equal(t, expected, output) +} + +func TestDiffWithHighlightTagMatch(t *testing.T) { + totalOverflow := 0 + for i := 0; i < 100; i++ { + hcd := newHighlightCodeDiff() + hcd.placeholderMaxCount = i + diffs := hcd.diffWithHighlight( + "main.js", "", + "a='1'", + "b='2'", + ) + totalOverflow += hcd.placeholderOverflowCount + + output := diffToHTML(nil, diffs, DiffLineDel) + c1 := strings.Count(output, "