diff --git a/bfg-library/src/main/scala/com/madgag/git/bfg/cleaner/LfsBlobConverter.scala b/bfg-library/src/main/scala/com/madgag/git/bfg/cleaner/LfsBlobConverter.scala new file mode 100644 index 00000000..ce07008f --- /dev/null +++ b/bfg-library/src/main/scala/com/madgag/git/bfg/cleaner/LfsBlobConverter.scala @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2015 Roberto Tyley + * + * This file is part of 'BFG Repo-Cleaner' - a tool for removing large + * or troublesome blobs from Git repositories. + * + * BFG Repo-Cleaner is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * BFG Repo-Cleaner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ . 
+ */ + +package com.madgag.git.bfg.cleaner + +import java.nio.charset.Charset +import java.security.{DigestInputStream, MessageDigest} + +import com.google.common.io.ByteStreams +import com.madgag.git.ThreadLocalObjectDatabaseResources +import com.madgag.git.bfg.model.{FileName, TreeBlobEntry} +import org.apache.commons.codec.binary.Hex.encodeHexString +import org.eclipse.jgit.lib.Constants.OBJ_BLOB +import org.eclipse.jgit.lib.ObjectLoader + +import scala.util.Try +import scalax.file.Path +import scalax.file.Path.createTempFile +import scalax.io.Resource +
+/**
+ * A [[TreeBlobModifier]] that swaps the blob of every file accepted by `lfsSuitableFiles`
+ * for a Git LFS pointer blob, after storing the original content under `lfsObjectsDir`
+ * in a file named after the SHA-256 hex digest of that content.
+ */
+trait LfsBlobConverter extends TreeBlobModifier {
+
+  /** Supplies the object reader and inserter used to load blobs and write pointer blobs. */
+  val threadLocalObjectDBResources: ThreadLocalObjectDatabaseResources
+
+  /** Selects, by file name, the tree entries whose blobs should be converted. */
+  val lfsSuitableFiles: (FileName => Boolean)
+
+  /** Charset used to encode the text of the pointer blob. */
+  val charset = Charset.forName("UTF-8")
+
+  /** Directory that receives the content files, one per SHA-256 digest. */
+  val lfsObjectsDir: Path
+
+  /**
+   * Returns the entry's mode paired with the id of a freshly inserted pointer blob, or
+   * with the entry's original object id when the file name is not selected by
+   * `lfsSuitableFiles` or the content could not be stored by `buildLfsFileFrom`.
+   */
+  override def fix(entry: TreeBlobEntry) = {
+    val oid = (for {
+      _ <- Some(entry.filename) filter lfsSuitableFiles
+      loader = threadLocalObjectDBResources.reader().open(entry.objectId)
+      (shaHex, _) <- buildLfsFileFrom(loader)
+    } yield {
+      val pointer =
+        s"""|version https://git-lfs.github.com/spec/v1
+            |oid sha256:$shaHex
+            |size ${loader.getSize}
+            |""".stripMargin
+
+      threadLocalObjectDBResources.inserter().insert(OBJ_BLOB, pointer.getBytes(charset))
+    }).getOrElse(entry.objectId)
+
+    (entry.mode, oid)
+  }
+
+  /**
+   * Copies the object's content into a temporary file while computing its SHA-256 digest,
+   * then moves that file to `lfsObjectsDir / <sha256 hex>` unless that path already exists.
+   *
+   * @param loader loader for the blob whose content is to be stored
+   * @return the hex digest and destination path, or `None` when the move failed and the
+   *         destination does not hold a file of exactly `loader.getSize` bytes
+   */
+  def buildLfsFileFrom(loader: ObjectLoader): Option[(String, Path)] = {
+    val tmpFile = createTempFile()
+
+    try {
+      val digest = MessageDigest.getInstance("SHA-256")
+
+      for {
+        inStream <- Resource.fromInputStream(new DigestInputStream(loader.openStream(), digest))
+        outStream <- tmpFile.outputStream()
+      } ByteStreams.copy(inStream, outStream)
+
+      val shaHex = encodeHexString(digest.digest())
+
+      val lfsPath = lfsObjectsDir / shaHex
+
+      // A failed move is tolerated only if the destination already has the expected size
+      // (presumably because the same content was stored concurrently - TODO confirm).
+      // The outcome is kept as a Boolean so that a failed size check yields None instead of
+      // being discarded. Try only captures non-fatal throwables, hence the catch-all case.
+      val lfsFileInPlace: Boolean = Try {
+        if (!lfsPath.exists) tmpFile.moveTo(lfsPath)
+        true
+      }.recover {
+        case _ => lfsPath.size.contains(loader.getSize)
+      }.getOrElse(false)
+
+      if (lfsFileInPlace) Some(shaHex -> lfsPath) else None
+    } finally {
+      // Best-effort clean-up, also reached when the copy above throws.
+      Try(tmpFile.delete(force = true))
+    }
+  }
+}
\ No newline at end of file diff --git a/bfg/src/main/scala/com/madgag/git/bfg/cli/CLIConfig.scala b/bfg/src/main/scala/com/madgag/git/bfg/cli/CLIConfig.scala index abbdb087..8976a15e 100644 --- a/bfg/src/main/scala/com/madgag/git/bfg/cli/CLIConfig.scala +++ b/bfg/src/main/scala/com/madgag/git/bfg/cli/CLIConfig.scala @@ -74,6 +74,9 @@ object CLIConfig { fileMatcher("delete-folders").text("delete folders with the specified names (eg '.svn', '*-tmp' - matches on folder name, not path within repo)").action { (v, c) => c.copy(deleteFolders = Some(v)) } + opt[Unit]("convert-to-git-lfs").text("experimental support for Git LFS, use with '-fi' to specify files").hidden().action { + (_, c) => c.copy(lfsConversion = true) + } opt[File]("replace-text").abbr("rt").valueName("").text("filter content of files, replacing matched text. Match expressions should be listed in the file, one expression per line - " + "by default, each expression is treated as a literal, but 'regex:' & 'glob:' prefixes are supported, with '==>' to specify a replacement " + "string other than the default of '***REMOVED***'.").action { @@ -129,6 +132,7 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None, filterSizeThreshold: Int = BlobTextModifier.DefaultSizeThreshold, textReplacementExpressions: Traversable[String] = List.empty, stripBlobsWithIds: Option[Set[ObjectId]] = None, + lfsConversion: Boolean = false, strictObjectChecking: Boolean = false, sensitiveData: Option[Boolean] = None, massiveNonFileObjects: Option[Int] = None, @@ -172,6 +176,16 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None, } } + lazy val lfsBlobConverter: Option[LfsBlobConverter] = if (lfsConversion) Some { + new LfsBlobConverter { + val lfsObjectsDir = repo.getDirectory / "lfs" / "objects" + + val lfsSuitableFiles = filterContentPredicate + + val threadLocalObjectDBResources = repo.getObjectDatabase.threadLocalResources + } + } else None + lazy val privateDataRemoval =
sensitiveData.getOrElse(Seq(fileDeletion, folderDeletion, blobTextModifier).flatten.nonEmpty) lazy val objectIdSubstitutor = if (privateDataRemoval) ObjectIdSubstitutor.OldIdsPrivate else ObjectIdSubstitutor.OldIdsPublic @@ -209,7 +223,7 @@ case class CLIConfig(stripBiggestBlobs: Option[Int] = None, } } - Seq(blobsByIdRemover, blobRemover, fileDeletion, blobTextModifier).flatten + Seq(blobsByIdRemover, blobRemover, fileDeletion, blobTextModifier, lfsBlobConverter).flatten } lazy val definesNoWork = treeBlobCleaners.isEmpty && folderDeletion.isEmpty && treeEntryListCleaners.isEmpty diff --git a/bfg/src/test/resources/sample-repos/repoWithBigBlobs.git.zip b/bfg/src/test/resources/sample-repos/repoWithBigBlobs.git.zip new file mode 100644 index 00000000..5648182c Binary files /dev/null and b/bfg/src/test/resources/sample-repos/repoWithBigBlobs.git.zip differ diff --git a/bfg/src/test/scala/com/madgag/git/bfg/cli/MainSpec.scala b/bfg/src/test/scala/com/madgag/git/bfg/cli/MainSpec.scala index aeb5bcfc..8eb9d66b 100644 --- a/bfg/src/test/scala/com/madgag/git/bfg/cli/MainSpec.scala +++ b/bfg/src/test/scala/com/madgag/git/bfg/cli/MainSpec.scala @@ -20,10 +20,13 @@ package com.madgag.git.bfg.cli +import com.madgag.git._ +import com.madgag.git.bfg.cli.test.unpackedRepo import org.specs2.mutable._ + +import scalax.file.ImplicitConversions._ import scalax.file.Path -import com.madgag.git._ -import bfg.cli.test.unpackedRepo + class MainSpec extends Specification { @@ -52,6 +55,15 @@ class MainSpec extends Specification { } } + "convert big blobs to the Git LFS format" in new unpackedRepo("/sample-repos/repoWithBigBlobs.git.zip") { + ensureRemovalOfBadEggs(packedBlobsOfSize(11238), contain(exactly(abbrId("596c")))) { + run("--convert-to-git-lfs --filter-content-including *.png --no-blob-protection") + } + val lfsFile = repo.getDirectory / "lfs" / "objects" / "e0ebd49837a1cced34b9e7d3ff2fa68a8100df8f158f165ce139e366a941ba6e" + + lfsFile.size must beSome(11238) + } + "remove bad 
folder named '.git'" in new unpackedRepo("/sample-repos/badRepoContainingDotGitFolder.git.zip") { ensureRemovalOf(commitHistory(haveFolder(".git").atLeastOnce)) { run("--delete-folders .git --no-blob-protection")