Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert files to utf-8 for indexing #7814

Merged
merged 12 commits into from
Aug 15, 2019
Merged
3 changes: 2 additions & 1 deletion models/repo_indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,14 +207,15 @@ func addUpdate(update fileUpdate, repo *Repository, batch rupture.FlushingBatch)
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}
indexerUpdate := indexer.RepoIndexerUpdate{
Filepath: update.Filename,
Op: indexer.RepoIndexerOpUpdate,
Data: &indexer.RepoIndexerData{
RepoID: repo.ID,
Content: string(fileContents),
Content: string(base.ToUTF8DropErrors(fileContents)),
},
}
return indexerUpdate.AddToFlushingBatch(batch)
Expand Down
99 changes: 99 additions & 0 deletions modules/base/encoding.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2014 The Gogs Authors. All rights reserved.
guillep2k marked this conversation as resolved.
Show resolved Hide resolved
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package base

import (
"fmt"

"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

// ToUTF8WithErr converts content to UTF8 encoding
func ToUTF8WithErr(content []byte) (string, error) {
charsetLabel, err := DetectEncoding(content)
if err != nil {
return "", err
} else if charsetLabel == "UTF-8" {
return string(RemoveBOMIfPresent(content)), nil
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel)
}

// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
result = append(result, content[n:]...)
}

result = RemoveBOMIfPresent(result)

return string(result), err
}

// ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible
func ToUTF8WithFallback(content []byte) []byte {
charsetLabel, err := DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return RemoveBOMIfPresent(content)
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return content
}

// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
return append(result, content[n:]...)
}

return RemoveBOMIfPresent(result)
}

// ToUTF8 converts content to UTF8 encoding and ignore error
func ToUTF8(content string) string {
res, _ := ToUTF8WithErr([]byte(content))
return res
}

// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
func ToUTF8DropErrors(content []byte) []byte {
charsetLabel, err := DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return RemoveBOMIfPresent(content)
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return content
}

// We ignore any non-decodable parts from the file.
// Some parts might be lost
var decoded []byte
decoder := encoding.NewDecoder()
idx := 0
for {
result, n, err := transform.Bytes(decoder, content[idx:])
decoded = append(decoded, result...)
if err == nil {
break
}
decoded = append(decoded, ' ')
idx = idx + n + 1
if idx >= len(content) {
break
}
}

return RemoveBOMIfPresent(decoded)
}
2 changes: 1 addition & 1 deletion modules/indexer/repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"

repoIndexerLatestVersion = 2
repoIndexerLatestVersion = 3
)

// repoIndexer (thread-safe) index for repository contents
Expand Down
56 changes: 0 additions & 56 deletions modules/templates/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ import (
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/setting"

"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"gopkg.in/editorconfig/editorconfig-core-go.v1"
)

Expand Down Expand Up @@ -274,60 +272,6 @@ func Sha1(str string) string {
return base.EncodeSha1(str)
}

// ToUTF8WithErr converts content to UTF8 encoding
func ToUTF8WithErr(content []byte) (string, error) {
charsetLabel, err := base.DetectEncoding(content)
if err != nil {
return "", err
} else if charsetLabel == "UTF-8" {
return string(base.RemoveBOMIfPresent(content)), nil
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel)
}

// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
result = append(result, content[n:]...)
}

result = base.RemoveBOMIfPresent(result)

return string(result), err
}

// ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible
func ToUTF8WithFallback(content []byte) []byte {
charsetLabel, err := base.DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return base.RemoveBOMIfPresent(content)
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return content
}

// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
return append(result, content[n:]...)
}

return base.RemoveBOMIfPresent(result)
}

// ToUTF8 converts content to UTF8 encoding and ignore error
func ToUTF8(content string) string {
res, _ := ToUTF8WithErr([]byte(content))
return res
}

// ReplaceLeft replaces all prefixes 'oldS' in 's' with 'newS'.
func ReplaceLeft(s, oldS, newS string) string {
oldLen, newLen, i, n := len(oldS), len(newS), 0, 0
Expand Down
3 changes: 1 addition & 2 deletions routers/repo/commit.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import (
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/templates"
)

const (
Expand Down Expand Up @@ -251,7 +250,7 @@ func Diff(ctx *context.Context) {
note := &git.Note{}
err = git.GetNote(ctx.Repo.GitRepo, commitID, note)
if err == nil {
ctx.Data["Note"] = string(templates.ToUTF8WithFallback(note.Message))
ctx.Data["Note"] = string(base.ToUTF8WithFallback(note.Message))
ctx.Data["NoteCommit"] = note.Commit
ctx.Data["NoteAuthor"] = models.ValidateCommitWithEmail(note.Commit)
}
Expand Down
3 changes: 1 addition & 2 deletions routers/repo/editor.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/repofiles"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/templates"
"code.gitea.io/gitea/modules/upload"
"code.gitea.io/gitea/modules/util"
)
Expand Down Expand Up @@ -118,7 +117,7 @@ func editFile(ctx *context.Context, isNewFile bool) {

d, _ := ioutil.ReadAll(dataRc)
buf = append(buf, d...)
if content, err := templates.ToUTF8WithErr(buf); err != nil {
if content, err := base.ToUTF8WithErr(buf); err != nil {
log.Error("ToUTF8WithErr: %v", err)
ctx.Data["FileContent"] = string(buf)
} else {
Expand Down
7 changes: 3 additions & 4 deletions routers/repo/view.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/templates"
)

const (
Expand Down Expand Up @@ -160,7 +159,7 @@ func renderDirectory(ctx *context.Context, treeLink string) {
ctx.Data["FileSize"] = fileSize
} else {
d, _ := ioutil.ReadAll(dataRc)
buf = templates.ToUTF8WithFallback(append(buf, d...))
buf = base.ToUTF8WithFallback(append(buf, d...))

if markup.Type(readmeFile.Name()) != "" {
ctx.Data["IsMarkup"] = true
Expand Down Expand Up @@ -278,7 +277,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st
}

d, _ := ioutil.ReadAll(dataRc)
buf = templates.ToUTF8WithFallback(append(buf, d...))
buf = base.ToUTF8WithFallback(append(buf, d...))

readmeExist := markup.IsReadmeFile(blob.Name())
ctx.Data["ReadmeExist"] = readmeExist
Expand All @@ -293,7 +292,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st
} else {
// Building code view blocks with line number on server side.
var fileContent string
if content, err := templates.ToUTF8WithErr(buf); err != nil {
if content, err := base.ToUTF8WithErr(buf); err != nil {
log.Error("ToUTF8WithErr: %v", err)
fileContent = string(buf)
} else {
Expand Down