Skip to content

Commit

Permalink
Optimize pushes of multiple refs
Browse files Browse the repository at this point in the history
When pushing multiple refs, we know any Git objects on the remote side
can be excluded from the objects that refer to LFS objects we need to
push, since if the remote side already has the Git objects, it should
have the corresponding LFS objects as well.

However, when traversing Git objects, we traverse them on a per-ref
basis, which is required since any LFS objects which spawn a batch
request will need the ref to be placed in the batch request as part of
the protocol.

Let's collect the remote (right) side of every ref update up front, before
traversing any Git objects, and exclude the objects reachable from those
sides in every traversal.  As a result, we can traverse far, far fewer
objects, especially when pushing new refs in a large repository.

Note that we exclude the case when the left and right sides are the same
because our code sets them to the same thing in some cases even though
Git does not, so we cannot reason about the values in that case.
  • Loading branch information
bk2204 committed Jan 16, 2020
1 parent 1b3cf7a commit d0e950d
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 3 deletions.
13 changes: 10 additions & 3 deletions commands/uploader.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,19 @@ func uploadForRefUpdates(ctx *uploadContext, updates []*git.RefUpdate, pushAll b
}()

verifyLocksForUpdates(ctx.lockVerifier, updates)
rightSides := make([]string, 0, len(updates))
for _, update := range updates {
right := update.Right().Sha
if update.LeftCommitish() != right {
rightSides = append(rightSides, right)
}
}
for _, update := range updates {
// initialized here to prevent looped defer
q := ctx.NewQueue(
tq.RemoteRef(update.Right()),
)
err := uploadLeftOrAll(gitscanner, ctx, q, update, pushAll)
err := uploadLeftOrAll(gitscanner, ctx, q, rightSides, update, pushAll)
ctx.CollectErrors(q)

if err != nil {
Expand All @@ -47,7 +54,7 @@ func uploadForRefUpdates(ctx *uploadContext, updates []*git.RefUpdate, pushAll b
return nil
}

func uploadLeftOrAll(g *lfs.GitScanner, ctx *uploadContext, q *tq.TransferQueue, update *git.RefUpdate, pushAll bool) error {
func uploadLeftOrAll(g *lfs.GitScanner, ctx *uploadContext, q *tq.TransferQueue, bases []string, update *git.RefUpdate, pushAll bool) error {
cb := ctx.gitScannerCallback(q)
if pushAll {
if err := g.ScanRefWithDeleted(update.LeftCommitish(), cb); err != nil {
Expand All @@ -59,7 +66,7 @@ func uploadLeftOrAll(g *lfs.GitScanner, ctx *uploadContext, q *tq.TransferQueue,
if left == right {
right = ""
}
if err := g.ScanRangeToRemote(left, right, cb); err != nil {
if err := g.ScanMultiRangeToRemote(left, bases, cb); err != nil {
return err
}
}
Expand Down
19 changes: 19 additions & 0 deletions lfs/gitscanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,25 @@ func (s *GitScanner) ScanRangeToRemote(left, right string, cb GitScannerFoundPoi
return scanLeftRightToChan(s, callback, left, right, s.cfg.OSEnv(), s.opts(ScanRangeToRemoteMode))
}

// ScanMultiRangeToRemote scans through all commits starting at the left ref
// but not including any of the right refs (if given) that the given remote
// does not have. The scan runs in ScanRangeToRemoteMode, so a remote must
// already be set on the scanner; if none is set, an error is returned and
// nothing is scanned. See RemoteForPush().
func (s *GitScanner) ScanMultiRangeToRemote(left string, rights []string, cb GitScannerFoundPointer) error {
	callback, err := firstGitScannerCallback(cb, s.FoundPointer)
	if err != nil {
		return err
	}

	// Only the emptiness check needs the lock; take a snapshot and release it
	// before deciding what to do.
	s.mu.Lock()
	remoteMissing := len(s.remote) == 0
	s.mu.Unlock()

	if remoteMissing {
		return fmt.Errorf("unable to scan starting at %q: no remote set", left)
	}

	return scanMultiLeftRightToChan(s, callback, left, rights, s.cfg.OSEnv(), s.opts(ScanRangeToRemoteMode))
}

// ScanRefs through all commits reachable by refs contained in "include" and
// not reachable by any refs included in "excluded"
func (s *GitScanner) ScanRefs(include, exclude []string, cb GitScannerFoundPointer) error {
Expand Down
8 changes: 8 additions & 0 deletions lfs/gitscanner_refs.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,14 @@ func scanLeftRightToChan(scanner *GitScanner, pointerCb GitScannerFoundPointer,
return scanRefsToChan(scanner, pointerCb, []string{refLeft}, []string{refRight}, osEnv, opt)
}

// scanMultiLeftRightToChan takes a single ref and a set of bases and hands
// them to scanRefsToChan, with refLeft as the only entry of the first ref list
// and bases as the second (presumably the include and exclude lists,
// mirroring scanLeftRightToChan). The pointer callback, environment and
// options are forwarded unchanged, and the error from scanRefsToChan is
// returned as-is.
// Reports unique oids once only, not multiple times if >1 file uses the same
// content
func scanMultiLeftRightToChan(scanner *GitScanner, pointerCb GitScannerFoundPointer, refLeft string, bases []string, osEnv config.Environment, opt *ScanRefsOptions) error {
	include := []string{refLeft}
	return scanRefsToChan(scanner, pointerCb, include, bases, osEnv, opt)
}

// revListShas uses git rev-list to return the list of object sha1s
// for the given ref. If all is true, ref is ignored. It returns a
// channel from which sha1 strings can be read.
Expand Down
50 changes: 50 additions & 0 deletions t/t-push.sh
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,56 @@ begin_test 'push with data the server already has'
)
end_test

# Pushes a branch and a new annotated tag together, by URL, after deleting the
# local copy of an LFS object that was already pushed. The test passes when the
# already-pushed object's OID never appears in the traced push output and the
# newly added object ends up on the server.
begin_test 'push with multiple refs and data the server already has'
(
set -e

reponame="push-multi-ref-server-data"
setup_remote_repo "$reponame"
clone_repo "$reponame" "$reponame"

git lfs track "*.dat"
git add .gitattributes
git commit -m "initial commit"

# First object: committed and pushed on master so the server holds it.
contents="abc123"
contents_oid="$(calc_oid "$contents")"
printf "%s" "$contents" > a.dat
git add a.dat
git commit -m "add a.dat"

git push origin master

assert_server_object "$reponame" "$contents_oid"

# Second object: committed locally only; this is what the next push must upload.
contents2="def456"
contents2_oid="$(calc_oid "$contents2")"
printf "%s" "$contents2" > b.dat
git add b.dat
git commit -m "add b.dat"

# Create a tag. Normally this would cause the entire history to be traversed
# since it's a new ref, but we no longer do that since we're pushing multiple
# refs.
git tag -m v1.0.0 -a v1.0.0

# We remove the original object. The server already has this.
delete_local_object "$contents_oid"

# We use the URL so that we cannot take advantage of the existing "origin/*"
# refs that we know the server must have.
# The combined stdout/stderr trace is saved to push.log for the check below.
GIT_TRACE=1 GIT_TRANSFER_TRACE=1 GIT_CURL_VERBOSE=1 \
git push "$(git config remote.origin.url)" master v1.0.0 2>&1 | tee push.log

# We should not find a batch request for the object which is in the earlier
# version of master, since we know the remote side has it.
# grep -c prints the number of matching lines, which must be zero here.
[ "$(grep -c "$contents_oid" push.log)" = 0 ]

# Yet we should have pushed the new object successfully.
assert_server_object "$reponame" "$contents2_oid"
)
end_test

begin_test "push custom reference"
(
set -e
Expand Down

0 comments on commit d0e950d

Please sign in to comment.