From 8d018c27662b128dca7631b01e016d20eb8b93cb Mon Sep 17 00:00:00 2001 From: Shinnosuke Sawada-Dazai Date: Fri, 13 Dec 2024 12:27:18 +0900 Subject: [PATCH] Use git partial clone and worktree to reduce network/file io (#5412) * Use git worktree and partial clone to reduce network io Signed-off-by: Shinnosuke Sawada-Dazai * Add defer statement to clean up cloned git repositories in detector implementations Signed-off-by: Shinnosuke Sawada-Dazai * Change the repo.Copy to use worktree and implement CopyToModify Signed-off-by: Shinnosuke Sawada-Dazai * Add tests for Copy and CopyToModify methods in repo Signed-off-by: Shinnosuke Sawada-Dazai * repo.Copy and related methods updated to use git.Worktree instead of git.Repo Signed-off-by: Shinnosuke Sawada-Dazai * MockRepo.Copy method updated to return git.Worktree instead of git.Repo Signed-off-by: Shinnosuke Sawada-Dazai * Update CopyToModify method to clone repository using git clone command Signed-off-by: Shinnosuke Sawada-Dazai * Test: Update TestCopy to use repo.Copy instead of CopyToModify Signed-off-by: Shinnosuke Sawada-Dazai * Fix comment in CopyToModify to clarify remote URL setting after local cloning Signed-off-by: Shinnosuke Sawada-Dazai * Fetch the latest changes from remote after local cloning Signed-off-by: Shinnosuke Sawada-Dazai * Remove .git directory from copied deploy source to avoid the git ops Signed-off-by: Shinnosuke Sawada-Dazai * Update TestCopyToModify to use a mock remote directory for testing Signed-off-by: Shinnosuke Sawada-Dazai * Copy deploy source using tar to exclude .git directory and improve performance Signed-off-by: Shinnosuke Sawada-Dazai --------- Signed-off-by: Shinnosuke Sawada-Dazai Signed-off-by: pipecd-bot --- pkg/app/piped/deploysource/deploysource.go | 11 +- .../piped/driftdetector/cloudrun/detector.go | 4 +- pkg/app/piped/driftdetector/ecs/detector.go | 4 +- .../driftdetector/kubernetes/detector.go | 4 +- .../piped/driftdetector/lambda/detector.go | 4 +- .../piped/driftdetector/terraform/detector.go | 4 +- pkg/app/piped/eventwatcher/eventwatcher.go | 4 +- pkg/app/pipedv1/deploysource/deploysource.go | 11 +- pkg/app/pipedv1/eventwatcher/eventwatcher.go | 4 +- pkg/git/client.go | 9 +- pkg/git/gittest/git.mock.go | 19 ++- pkg/git/repo.go | 112 ++++++++++++++++-- pkg/git/repo_test.go | 39 ++++++ 13 files changed, 203 insertions(+), 26 deletions(-) diff --git a/pkg/app/piped/deploysource/deploysource.go b/pkg/app/piped/deploysource/deploysource.go index 5809e459d8..ba62a6a86f 100644 --- a/pkg/app/piped/deploysource/deploysource.go +++ b/pkg/app/piped/deploysource/deploysource.go @@ -203,8 +203,17 @@ func (p *provider) prepare(ctx context.Context, lw io.Writer) (*DeploySource, er func (p *provider) copy(lw io.Writer) (*DeploySource, error) { p.copyNum++ + src := p.source.RepoDir dest := fmt.Sprintf("%s-%d", p.source.RepoDir, p.copyNum) - cmd := exec.Command("cp", "-rf", p.source.RepoDir, dest) + + // use tar to exclude the .git directory + // the tar command does not create the destination directory if it does not exist. + // so we need to create it before running the command. + if err := os.MkdirAll(dest, 0700); err != nil { + fmt.Fprintf(lw, "Unable to create the directory to store the copied deploy source (%v)\n", err) + return nil, err + } + cmd := exec.Command("sh", "-c", fmt.Sprintf("tar c -f - -C '%s' --exclude='.git' . | tar x -f - -C '%s'", src, dest)) out, err := cmd.CombinedOutput() if err != nil { fmt.Fprintf(lw, "Unable to copy deploy source data (%v, %s)\n", err, string(out)) diff --git a/pkg/app/piped/driftdetector/cloudrun/detector.go b/pkg/app/piped/driftdetector/cloudrun/detector.go index 465068a242..fc0386c995 100644 --- a/pkg/app/piped/driftdetector/cloudrun/detector.go +++ b/pkg/app/piped/driftdetector/cloudrun/detector.go @@ -221,7 +221,7 @@ func (d *detector) checkApplication(ctx context.Context, app *model.Application, return d.reporter.ReportApplicationSyncState(ctx, app.Id, state) } -func (d *detector) loadHeadServiceManifest(app *model.Application, repo git.Repo, headCommit git.Commit) (provider.ServiceManifest, error) { +func (d *detector) loadHeadServiceManifest(app *model.Application, repo git.Worktree, headCommit git.Commit) (provider.ServiceManifest, error) { var ( manifestCache = provider.ServiceManifestCache{ AppID: app.Id, @@ -263,6 +263,8 @@ func (d *detector) loadHeadServiceManifest(app *model.Application, repo git.Repo if err != nil { return provider.ServiceManifest{}, fmt.Errorf("failed to copy the cloned git repository (%w)", err) } + defer repo.Clean() + repoDir := repo.GetPath() appDir = filepath.Join(repoDir, app.GitPath.Path) } diff --git a/pkg/app/piped/driftdetector/ecs/detector.go b/pkg/app/piped/driftdetector/ecs/detector.go index 43ff8acabb..ca7d094054 100644 --- a/pkg/app/piped/driftdetector/ecs/detector.go +++ b/pkg/app/piped/driftdetector/ecs/detector.go @@ -343,7 +343,7 @@ func ignoreParameters(liveManifests provider.ECSManifests, headManifests provide return live, head } -func (d *detector) loadConfigs(app *model.Application, repo git.Repo, headCommit git.Commit) (provider.ECSManifests, error) { +func (d *detector) loadConfigs(app *model.Application, repo git.Worktree, headCommit git.Commit) (provider.ECSManifests, error) { var ( manifestCache = provider.ECSManifestsCache{ AppID: app.Id, @@ -387,6 +387,8 @@ func (d *detector) loadConfigs(app *model.Application, repo git.Repo, headCommit if err != nil { return provider.ECSManifests{}, fmt.Errorf("failed to copy the cloned git repository (%w)", err) } + defer repo.Clean() + repoDir := repo.GetPath() appDir = filepath.Join(repoDir, app.GitPath.Path) } diff --git a/pkg/app/piped/driftdetector/kubernetes/detector.go b/pkg/app/piped/driftdetector/kubernetes/detector.go index 5330876894..b1ae0f8195 100644 --- a/pkg/app/piped/driftdetector/kubernetes/detector.go +++ b/pkg/app/piped/driftdetector/kubernetes/detector.go @@ -236,7 +236,7 @@ func (d *detector) checkApplication(ctx context.Context, app *model.Application, return d.reporter.ReportApplicationSyncState(ctx, app.Id, state) } -func (d *detector) loadHeadManifests(ctx context.Context, app *model.Application, repo git.Repo, headCommit git.Commit, watchingResourceKinds []provider.APIVersionKind) ([]provider.Manifest, error) { +func (d *detector) loadHeadManifests(ctx context.Context, app *model.Application, repo git.Worktree, headCommit git.Commit, watchingResourceKinds []provider.APIVersionKind) ([]provider.Manifest, error) { var ( manifestCache = provider.AppManifestsCache{ AppID: app.Id, @@ -278,6 +278,8 @@ func (d *detector) loadHeadManifests(ctx context.Context, app *model.Application if err != nil { return nil, fmt.Errorf("failed to copy the cloned git repository (%w)", err) } + defer repo.Clean() + repoDir = repo.GetPath() appDir = filepath.Join(repoDir, app.GitPath.Path) } diff --git a/pkg/app/piped/driftdetector/lambda/detector.go b/pkg/app/piped/driftdetector/lambda/detector.go index fcfa57accf..0f7ca0c342 100644 --- a/pkg/app/piped/driftdetector/lambda/detector.go +++ b/pkg/app/piped/driftdetector/lambda/detector.go @@ -270,7 +270,7 @@ func ignoreAndSortParameters(headSpec provider.FunctionManifestSpec) provider.Fu return cloneSpec } -func (d *detector) loadHeadFunctionManifest(app *model.Application, repo git.Repo, headCommit git.Commit) (provider.FunctionManifest, error) { +func (d *detector) loadHeadFunctionManifest(app *model.Application, repo git.Worktree, headCommit git.Commit) (provider.FunctionManifest, error) { var ( manifestCache = provider.FunctionManifestCache{ AppID: app.Id, @@ -312,6 +312,8 @@ func (d *detector) loadHeadFunctionManifest(app *model.Application, repo git.Rep if err != nil { return provider.FunctionManifest{}, fmt.Errorf("failed to copy the cloned git repository (%w)", err) } + defer repo.Clean() + repoDir := repo.GetPath() appDir = filepath.Join(repoDir, app.GitPath.Path) } diff --git a/pkg/app/piped/driftdetector/terraform/detector.go b/pkg/app/piped/driftdetector/terraform/detector.go index 03ea36bf0d..43bb32c6db 100644 --- a/pkg/app/piped/driftdetector/terraform/detector.go +++ b/pkg/app/piped/driftdetector/terraform/detector.go @@ -169,7 +169,7 @@ func (d *detector) check(ctx context.Context) { } } -func (d *detector) checkApplication(ctx context.Context, app *model.Application, repo git.Repo, headCommit git.Commit) error { +func (d *detector) checkApplication(ctx context.Context, app *model.Application, repo git.Worktree, headCommit git.Commit) error { var ( repoDir = repo.GetPath() appDir = filepath.Join(repoDir, app.GitPath.Path) @@ -206,6 +206,8 @@ func (d *detector) checkApplication(ctx context.Context, app *model.Application, if err != nil { return fmt.Errorf("failed to copy the cloned git repository (%w)", err) } + defer repo.Clean() + repoDir = repo.GetPath() appDir = filepath.Join(repoDir, app.GitPath.Path) } diff --git a/pkg/app/piped/eventwatcher/eventwatcher.go b/pkg/app/piped/eventwatcher/eventwatcher.go index 8689e04497..97a9786a7a 100644 --- a/pkg/app/piped/eventwatcher/eventwatcher.go +++ b/pkg/app/piped/eventwatcher/eventwatcher.go @@ -319,7 +319,7 @@ func (w *watcher) execute(ctx context.Context, repo git.Repo, repoID string, eve if err != nil { return fmt.Errorf("failed to create a new temporary directory: %w", err) } - tmpRepo, err := repo.Copy(filepath.Join(tmpDir, "tmp-repo")) + tmpRepo, err := repo.CopyToModify(filepath.Join(tmpDir, "tmp-repo")) if err != nil { return fmt.Errorf("failed to copy the repository to the temporary directory: %w", err) } @@ -495,7 +495,7 @@ func (w *watcher) updateValues(ctx context.Context, repo git.Repo, repoID string if err != nil { return fmt.Errorf("failed to create a new temporary directory: %w", err) } - tmpRepo, err := repo.Copy(filepath.Join(tmpDir, "tmp-repo")) + tmpRepo, err := repo.CopyToModify(filepath.Join(tmpDir, "tmp-repo")) if err != nil { return fmt.Errorf("failed to copy the repository to the temporary directory: %w", err) } diff --git a/pkg/app/pipedv1/deploysource/deploysource.go b/pkg/app/pipedv1/deploysource/deploysource.go index e50e9b20f4..2637291ccd 100644 --- a/pkg/app/pipedv1/deploysource/deploysource.go +++ b/pkg/app/pipedv1/deploysource/deploysource.go @@ -183,8 +183,17 @@ func (p *provider) prepare(ctx context.Context, lw io.Writer) (*DeploySource, er func (p *provider) copy(lw io.Writer) (*DeploySource, error) { p.copyNum++ + src := p.source.RepoDir dest := fmt.Sprintf("%s-%d", p.source.RepoDir, p.copyNum) - cmd := exec.Command("cp", "-rf", p.source.RepoDir, dest) + + // use tar to exclude the .git directory + // the tar command does not create the destination directory if it does not exist. + // so we need to create it before running the command. + if err := os.MkdirAll(dest, 0700); err != nil { + fmt.Fprintf(lw, "Unable to create the directory to store the copied deploy source (%v)\n", err) + return nil, err + } + cmd := exec.Command("sh", "-c", fmt.Sprintf("tar c -f - -C '%s' --exclude='.git' . | tar x -f - -C '%s'", src, dest)) out, err := cmd.CombinedOutput() if err != nil { fmt.Fprintf(lw, "Unable to copy deploy source data (%v, %s)\n", err, string(out)) diff --git a/pkg/app/pipedv1/eventwatcher/eventwatcher.go b/pkg/app/pipedv1/eventwatcher/eventwatcher.go index 46755ae528..8aad4976f7 100644 --- a/pkg/app/pipedv1/eventwatcher/eventwatcher.go +++ b/pkg/app/pipedv1/eventwatcher/eventwatcher.go @@ -317,7 +317,7 @@ func (w *watcher) execute(ctx context.Context, repo git.Repo, repoID string, eve if err != nil { return fmt.Errorf("failed to create a new temporary directory: %w", err) } - tmpRepo, err := repo.Copy(filepath.Join(tmpDir, "tmp-repo")) + tmpRepo, err := repo.CopyToModify(filepath.Join(tmpDir, "tmp-repo")) if err != nil { return fmt.Errorf("failed to copy the repository to the temporary directory: %w", err) } @@ -478,7 +478,7 @@ func (w *watcher) updateValues(ctx context.Context, repo git.Repo, repoID string if err != nil { return fmt.Errorf("failed to create a new temporary directory: %w", err) } - tmpRepo, err := repo.Copy(filepath.Join(tmpDir, "tmp-repo")) + tmpRepo, err := repo.CopyToModify(filepath.Join(tmpDir, "tmp-repo")) if err != nil { return fmt.Errorf("failed to copy the repository to the temporary directory: %w", err) } diff --git a/pkg/git/client.go b/pkg/git/client.go index b741797500..56a9b13a84 100644 --- a/pkg/git/client.go +++ b/pkg/git/client.go @@ -166,7 +166,7 @@ func (c *client) Clone(ctx context.Context, repoID, remote, branch, destination return nil, err } out, err := retryCommand(3, time.Second, logger, func() ([]byte, error) { - args := []string{"clone", "--mirror", remote, repoCachePath} + args := []string{"clone", "--mirror", "--filter=blob:none", remote, repoCachePath} args = append(authArgs, args...) return runGitCommand(ctx, c.gitPath, "", c.envsForRepo(remote), args...) }) @@ -214,11 +214,12 @@ func (c *client) Clone(ctx context.Context, repoID, remote, branch, destination } } - args := []string{"clone"} + // git worktree add [-f] [--detach] [--checkout] [--lock [--reason ]] + // [--orphan] [(-b | -B) ] [] + args := []string{"-C", repoCachePath, "worktree", "add", "--detach", destination} if branch != "" { - args = append(args, "-b", branch) + args = append(args, branch) } - args = append(args, repoCachePath, destination) logger.Info("cloning a repo from cached one in local", zap.String("src", repoCachePath), diff --git a/pkg/git/gittest/git.mock.go b/pkg/git/gittest/git.mock.go index 7f01dabde3..eb4dea61db 100644 --- a/pkg/git/gittest/git.mock.go +++ b/pkg/git/gittest/git.mock.go @@ -121,10 +121,10 @@ func (mr *MockRepoMockRecorder) CommitChanges(arg0, arg1, arg2, arg3, arg4, arg5 } // Copy mocks base method. -func (m *MockRepo) Copy(arg0 string) (git.Repo, error) { +func (m *MockRepo) Copy(arg0 string) (git.Worktree, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "Copy", arg0) - ret0, _ := ret[0].(git.Repo) + ret0, _ := ret[0].(git.Worktree) ret1, _ := ret[1].(error) return ret0, ret1 } @@ -135,6 +135,21 @@ func (mr *MockRepoMockRecorder) Copy(arg0 interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Copy", reflect.TypeOf((*MockRepo)(nil).Copy), arg0) } +// CopyToModify mocks base method. +func (m *MockRepo) CopyToModify(arg0 string) (git.Repo, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CopyToModify", arg0) + ret0, _ := ret[0].(git.Repo) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CopyToModify indicates an expected call of CopyToModify. +func (mr *MockRepoMockRecorder) CopyToModify(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CopyToModify", reflect.TypeOf((*MockRepo)(nil).CopyToModify), arg0) +} + // GetClonedBranch mocks base method. func (m *MockRepo) GetClonedBranch() string { m.ctrl.T.Helper() diff --git a/pkg/git/repo.go b/pkg/git/repo.go index de458274dc..0fc06f041d 100644 --- a/pkg/git/repo.go +++ b/pkg/git/repo.go @@ -35,7 +35,8 @@ var ( type Repo interface { GetPath() string GetClonedBranch() string - Copy(dest string) (Repo, error) + Copy(dest string) (Worktree, error) + CopyToModify(dest string) (Repo, error) ListCommits(ctx context.Context, visionRange string) ([]Commit, error) GetLatestCommit(ctx context.Context) (Commit, error) @@ -52,6 +53,17 @@ type Repo interface { CommitChanges(ctx context.Context, branch, message string, newBranch bool, changes map[string][]byte, trailers map[string]string) error } +// Worktree provides functions to get and handle git worktree. +// It is a separate checkout of the repository. +// It is used to make changes to the repository without affecting the main repository. +// Worktree always does checkout with the detached HEAD, so it doesn't affect the main repository. +type Worktree interface { + GetPath() string + Clean() error + Copy(dest string) (Worktree, error) + Checkout(ctx context.Context, commitish string) error +} + type repo struct { dir string gitPath string @@ -60,6 +72,55 @@ type repo struct { gitEnvs []string } +// worktree is a git worktree. +// It is a separate checkout of the repository. +type worktree struct { + base *repo + worktreePath string +} + +func (r *worktree) runGitCommand(ctx context.Context, args ...string) ([]byte, error) { + cmd := exec.CommandContext(ctx, r.base.gitPath, args...) + cmd.Dir = r.worktreePath + cmd.Env = append(os.Environ(), r.base.gitEnvs...) + return cmd.CombinedOutput() +} + +func (r *worktree) Copy(dest string) (Worktree, error) { + // garbage collecting worktrees + if _, err := r.runGitCommand(context.Background(), "worktree", "prune"); err != nil { + // ignore the error + } + + if out, err := r.runGitCommand(context.Background(), "worktree", "add", "--detach", dest); err != nil { + return nil, formatCommandError(err, out) + } + + return &worktree{ + base: r.base, + worktreePath: dest, + }, nil +} + +func (r *worktree) GetPath() string { + return r.worktreePath +} + +func (r *worktree) Clean() error { + if out, err := r.base.runGitCommand(context.Background(), "worktree", "remove", r.worktreePath); err != nil { + return formatCommandError(err, out) + } + return nil +} + +func (r *worktree) Checkout(ctx context.Context, commitish string) error { + out, err := r.runGitCommand(ctx, "checkout", "--detach", commitish) + if err != nil { + return formatCommandError(err, out) + } + return nil +} + // NewRepo creates a new Repo instance. func NewRepo(dir, gitPath, remote, clonedBranch string, gitEnvs []string) *repo { return &repo{ @@ -81,22 +142,55 @@ func (r *repo) GetClonedBranch() string { return r.clonedBranch } -// Copy does copying the repository to the given destination. +// Copy does copying the repository to the given destination using git worktree. +// The repository is cloned to the given destination with the detached HEAD. // NOTE: the given “dest” must be a path that doesn’t exist yet. -// If you don't, it copies the repo root itself to the given dest as a subdirectory. -func (r *repo) Copy(dest string) (Repo, error) { - cmd := exec.Command("cp", "-rf", r.dir, dest) - out, err := cmd.CombinedOutput() - if err != nil { +// If you don't, you will get an error. +func (r *repo) Copy(dest string) (Worktree, error) { + // garbage collecting worktrees + if _, err := r.runGitCommand(context.Background(), "worktree", "prune"); err != nil { + // ignore the error + } + + if out, err := r.runGitCommand(context.Background(), "worktree", "add", "--detach", dest); err != nil { return nil, formatCommandError(err, out) } - return &repo{ + return &worktree{ + base: r, + worktreePath: dest, + }, nil +} + +// CopyToModify does cloning the repository to the given destination. +// The repository is cloned to the given destination with the . +// NOTE: the given “dest” must be a path that doesn’t exist yet. +// If you don't, you will get an error. +func (r *repo) CopyToModify(dest string) (Repo, error) { + cmd := exec.Command(r.gitPath, "clone", r.dir, dest) + if out, err := cmd.CombinedOutput(); err != nil { + return nil, formatCommandError(err, out) + } + + cloned := &repo{ dir: dest, gitPath: r.gitPath, remote: r.remote, clonedBranch: r.clonedBranch, - }, nil + gitEnvs: r.gitEnvs, + } + + // because we did a local cloning so set the remote url of origin + if err := cloned.setRemote(context.Background(), r.remote); err != nil { + return nil, err + } + + // fetch the latest changes which doesn't exist in the local repository + if out, err := cloned.runGitCommand(context.Background(), "fetch"); err != nil { + return nil, formatCommandError(err, out) + } + + return cloned, nil } // ListCommits returns a list of commits in a given revision range. diff --git a/pkg/git/repo_test.go b/pkg/git/repo_test.go index ae09f21bf9..d36cb0c3f9 100644 --- a/pkg/git/repo_test.go +++ b/pkg/git/repo_test.go @@ -271,11 +271,50 @@ func TestCopy(t *testing.T) { assert.NotEqual(t, r, newRepo) + assert.NoError(t, newRepo.Clean()) +} + +func TestCopyToModify(t *testing.T) { + faker, err := newFaker() + require.NoError(t, err) + defer faker.clean() + + var ( + org = "test-repo-org" + repoName = "repo-copy" + ctx = context.Background() + ) + + err = faker.makeRepo(org, repoName) + require.NoError(t, err) + r := &repo{ + dir: faker.repoDir(org, repoName), + gitPath: faker.gitPath, + remote: faker.repoDir(org, repoName), // use the same directory as remote, it's not a real remote. it's strange but it's ok for testing. + } + + commits, err := r.ListCommits(ctx, "") + require.NoError(t, err) + assert.Equal(t, 1, len(commits)) + + tmpDir := filepath.Join(faker.dir, "tmp-repo") + newRepo, err := r.CopyToModify(tmpDir) + require.NoError(t, err) + + // we can copy the repo to another directory multiple times + tmpDir2 := filepath.Join(faker.dir, "tmp-repo2") + newRepo2, err := r.CopyToModify(tmpDir2) + require.NoError(t, err) + + assert.NotEqual(t, r, newRepo) + newRepoCommits, err := newRepo.ListCommits(ctx, "") require.NoError(t, err) assert.Equal(t, 1, len(newRepoCommits)) assert.Equal(t, commits, newRepoCommits) + assert.NoError(t, newRepo.Clean()) + assert.NoError(t, newRepo2.Clean()) } func TestGetCommitForRev(t *testing.T) {