Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert files to utf-8 for indexing #7814

Merged
merged 12 commits into from
Aug 15, 2019
38 changes: 37 additions & 1 deletion models/repo_indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import (
"code.gitea.io/gitea/modules/setting"

"github.com/ethantkoenig/rupture"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

// RepoIndexerStatus status of a repo's entry in the repo indexer
Expand Down Expand Up @@ -207,14 +209,15 @@ func addUpdate(update fileUpdate, repo *Repository, batch rupture.FlushingBatch)
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}
indexerUpdate := indexer.RepoIndexerUpdate{
Filepath: update.Filename,
Op: indexer.RepoIndexerOpUpdate,
Data: &indexer.RepoIndexerData{
RepoID: repo.ID,
Content: string(fileContents),
Content: string(toUTF8DropErrors(fileContents)),
},
}
return indexerUpdate.AddToFlushingBatch(batch)
Expand Down Expand Up @@ -360,3 +363,36 @@ func addOperationToQueue(op repoIndexerOperation) {
}()
}
}

// toUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
func toUTF8DropErrors(content []byte) []byte {
lafriks marked this conversation as resolved.
Show resolved Hide resolved
charsetLabel, err := base.DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return base.RemoveBOMIfPresent(content)
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return content
}

// We ignore any non-decodable parts from the file.
// Some parts might be lost
var decoded []byte
decoder := encoding.NewDecoder()
idx := 0
for {
result, n, err := transform.Bytes(decoder, content[idx:])
decoded = append(decoded, result...)
if err == nil {
break
}
decoded = append(decoded, ' ')
idx = idx + n + 1
if idx >= len(content) {
break
}
}

return base.RemoveBOMIfPresent(decoded)
}