Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix synchronization bug in repo indexer #3455

Merged
merged 3 commits into from
Feb 5, 2018
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 100 additions & 62 deletions models/repo_indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
package models

import (
"io/ioutil"
"os"
"path"
"fmt"
"strconv"
"strings"

Expand All @@ -16,8 +14,6 @@ import (
"code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"

"github.com/Unknwon/com"
)

// RepoIndexerStatus status of a repo's entry in the repo indexer
Expand Down Expand Up @@ -132,77 +128,86 @@ func populateRepoIndexer(maxRepoID int64) {
}

func updateRepoIndexer(repo *Repository) error {
changes, err := getRepoChanges(repo)
sha, err := getDefaultBranchSha(repo)
if err != nil {
return err
}
changes, err := getRepoChanges(repo, sha)
if err != nil {
return err
} else if changes == nil {
return nil
}

batch := indexer.RepoIndexerBatch()
for _, filename := range changes.UpdatedFiles {
if err := addUpdate(filename, repo, batch); err != nil {
for _, update := range changes.Updates {
if err := addUpdate(update, repo, batch); err != nil {
return err
}
}
for _, filename := range changes.RemovedFiles {
for _, filename := range changes.RemovedFilenames {
if err := addDelete(filename, repo, batch); err != nil {
return err
}
}
if err = batch.Flush(); err != nil {
return err
}
return updateLastIndexSync(repo)
return repo.updateIndexerStatus(sha)
}

// repoChanges changes (file additions/updates/removals) to a repo
type repoChanges struct {
UpdatedFiles []string
RemovedFiles []string
Updates []fileUpdate
RemovedFilenames []string
}

// getRepoChanges returns changes to repo since last indexer update
func getRepoChanges(repo *Repository) (*repoChanges, error) {
repoWorkingPool.CheckIn(com.ToStr(repo.ID))
defer repoWorkingPool.CheckOut(com.ToStr(repo.ID))
// fileUpdate describes a single file whose contents should be (re)indexed.
type fileUpdate struct {
	// Filename is the file's path as reported by `git ls-tree`.
	Filename string
	// BlobSha is the object name of the blob holding the file's contents;
	// it is what gets handed to `git cat-file` when the file is read.
	BlobSha string
}

if err := repo.UpdateLocalCopyBranch(""); err != nil {
return nil, err
} else if !git.IsBranchExist(repo.LocalCopyPath(), repo.DefaultBranch) {
// repo does not have any commits yet, so nothing to update
return nil, nil
} else if err = repo.UpdateLocalCopyBranch(repo.DefaultBranch); err != nil {
return nil, err
} else if err = repo.getIndexerStatus(); err != nil {
// getDefaultBranchSha returns the object name that repo's default branch
// currently points to, as reported by `git show-ref -s` run in the
// repository directory. Any error from the git command is returned as-is.
func getDefaultBranchSha(repo *Repository) (string, error) {
	// Ask for the fully qualified branch ref. show-ref matches a pattern
	// against the trailing components of every ref, so a bare name such as
	// "master" would also match refs/tags/master or
	// refs/remotes/origin/master and yield several SHAs, one per line.
	stdout, err := git.NewCommand("show-ref", "-s", "refs/heads/"+repo.DefaultBranch).
		RunInDir(repo.RepoPath())
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(stdout), nil
}

// getRepoChanges returns changes to repo since last indexer update
func getRepoChanges(repo *Repository, revision string) (*repoChanges, error) {
if err := repo.getIndexerStatus(); err != nil {
return nil, err
}

if len(repo.IndexerStatus.CommitSha) == 0 {
return genesisChanges(repo)
return genesisChanges(repo, revision)
}
return nonGenesisChanges(repo)
return nonGenesisChanges(repo, revision)
}

func addUpdate(filename string, repo *Repository, batch *indexer.Batch) error {
filepath := path.Join(repo.LocalCopyPath(), filename)
if stat, err := os.Stat(filepath); err != nil {
func addUpdate(update fileUpdate, repo *Repository, batch *indexer.Batch) error {
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
return err
} else if stat.Size() > setting.Indexer.MaxIndexerFileSize {
return nil
} else if stat.IsDir() {
// file could actually be a directory, if it is the root of a submodule.
// We do not index submodule contents, so don't do anything.
}
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %v", err)
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
return nil
}
fileContents, err := ioutil.ReadFile(filepath)

fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
RunInDirBytes(repo.RepoPath())
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
return nil
}
return batch.Add(indexer.RepoIndexerUpdate{
Filepath: filename,
Filepath: update.Filename,
Op: indexer.RepoIndexerOpUpdate,
Data: &indexer.RepoIndexerData{
RepoID: repo.ID,
Expand All @@ -221,42 +226,76 @@ func addDelete(filename string, repo *Repository, batch *indexer.Batch) error {
})
}

// genesisChanges get changes to add repo to the indexer for the first time
func genesisChanges(repo *Repository) (*repoChanges, error) {
var changes repoChanges
stdout, err := git.NewCommand("ls-files").RunInDir(repo.LocalCopyPath())
if err != nil {
return nil, err
}
for _, line := range strings.Split(stdout, "\n") {
filename := strings.TrimSpace(line)
if len(filename) == 0 {
// parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command
func parseGitLsTreeOutput(stdout string) ([]fileUpdate, error) {
lines := strings.Split(stdout, "\n")
updates := make([]fileUpdate, 0, len(lines))
for _, line := range lines {
// expect line to be "<mode> <object-type> <object-sha>\t<filename>"
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
} else if filename[0] == '"' {
}
firstSpaceIndex := strings.IndexByte(line, ' ')
if firstSpaceIndex < 0 {
log.Error(4, "Misformatted git ls-tree output: %s", line)
continue
}
tabIndex := strings.IndexByte(line, '\t')
if tabIndex < 42+firstSpaceIndex || tabIndex == len(line)-1 {
log.Error(4, "Misformatted git ls-tree output: %s", line)
continue
}
if objectType := line[firstSpaceIndex+1 : tabIndex-41]; objectType != "blob" {
// submodules appear as commit objects, we do not index submodules
continue
}

blobSha := line[tabIndex-40 : tabIndex]
filename := line[tabIndex+1:]
if filename[0] == '"' {
var err error
filename, err = strconv.Unquote(filename)
if err != nil {
return nil, err
}
}
changes.UpdatedFiles = append(changes.UpdatedFiles, filename)
updates = append(updates, fileUpdate{
Filename: filename,
BlobSha: blobSha,
})
}
return updates, nil
}

// genesisChanges get changes to add repo to the indexer for the first time
func genesisChanges(repo *Repository, revision string) (*repoChanges, error) {
var changes repoChanges
stdout, err := git.NewCommand("ls-tree", "--full-tree", "-r", revision).
RunInDir(repo.RepoPath())
if err != nil {
return nil, err
}
return &changes, nil
changes.Updates, err = parseGitLsTreeOutput(stdout)
return &changes, err
}

// nonGenesisChanges get changes since the previous indexer update
func nonGenesisChanges(repo *Repository) (*repoChanges, error) {
func nonGenesisChanges(repo *Repository, revision string) (*repoChanges, error) {
diffCmd := git.NewCommand("diff", "--name-status",
repo.IndexerStatus.CommitSha, "HEAD")
stdout, err := diffCmd.RunInDir(repo.LocalCopyPath())
repo.IndexerStatus.CommitSha, revision)
stdout, err := diffCmd.RunInDir(repo.RepoPath())
if err != nil {
// previous commit sha may have been removed by a force push, so
// try rebuilding from scratch
log.Warn("git diff: %v", err)
if err = indexer.DeleteRepoFromIndexer(repo.ID); err != nil {
return nil, err
}
return genesisChanges(repo)
return genesisChanges(repo, revision)
}
var changes repoChanges
updatedFilenames := make([]string, 0, 10)
for _, line := range strings.Split(stdout, "\n") {
line = strings.TrimSpace(line)
if len(line) == 0 {
Expand All @@ -274,23 +313,22 @@ func nonGenesisChanges(repo *Repository) (*repoChanges, error) {

switch status := line[0]; status {
case 'M', 'A':
changes.UpdatedFiles = append(changes.UpdatedFiles, filename)
updatedFilenames = append(updatedFilenames, filename)
case 'D':
changes.RemovedFiles = append(changes.RemovedFiles, filename)
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
default:
log.Warn("Unrecognized status: %c (line=%s)", status, line)
}
}
return &changes, nil
}

func updateLastIndexSync(repo *Repository) error {
stdout, err := git.NewCommand("rev-parse", "HEAD").RunInDir(repo.LocalCopyPath())
cmd := git.NewCommand("ls-tree", "--full-tree", revision, "--")
cmd.AddArguments(updatedFilenames...)
stdout, err = cmd.RunInDir(repo.RepoPath())
if err != nil {
return err
return nil, err
}
sha := strings.TrimSpace(stdout)
return repo.updateIndexerStatus(sha)
changes.Updates, err = parseGitLsTreeOutput(stdout)
return &changes, err
}

func processRepoIndexerOperationQueue() {
Expand Down