diff --git a/build.sbt b/build.sbt index 1cda078284d..ac313e05c64 100644 --- a/build.sbt +++ b/build.sbt @@ -322,20 +322,35 @@ val icebergSparkRuntimeArtifactName = { } lazy val testDeltaIcebergJar = (project in file("testDeltaIcebergJar")) + // delta-iceberg depends on delta-spark! So, we need to include it during our test. + .dependsOn(spark % "test") .settings( name := "test-delta-iceberg-jar", commonSettings, skipReleaseSettings, + exportJars := true, Compile / unmanagedJars += (iceberg / assembly).value, libraryDependencies ++= Seq( "org.apache.hadoop" % "hadoop-client" % hadoopVersion, "org.scalatest" %% "scalatest" % scalaTestVersion % "test", - "org.apache.spark" %% "spark-core" % sparkVersion + "org.apache.spark" %% "spark-core" % sparkVersion % "test" ) ) +val deltaIcebergSparkIncludePrefixes = Seq( + // We want everything from this package + "org/apache/spark/sql/delta/icebergShaded", + + // We only want the files in this project from this package. e.g. we want to exclude + // org/apache/spark/sql/delta/commands/convert/ConvertTargetFile.class (from delta-spark project). + "org/apache/spark/sql/delta/commands/convert/IcebergFileManifest", + "org/apache/spark/sql/delta/commands/convert/IcebergSchemaUtils", + "org/apache/spark/sql/delta/commands/convert/IcebergTable" +) + // Build using: build/sbt clean icebergShaded/compile iceberg/compile // It will fail the first time, just re-run it. +// scalastyle:off println lazy val iceberg = (project in file("iceberg")) .dependsOn(spark % "compile->compile;test->test;provided->provided") .settings ( @@ -358,7 +373,7 @@ lazy val iceberg = (project in file("iceberg")) assembly / test := {}, assembly / assemblyExcludedJars := { // Note: the input here is only `libraryDependencies` jars, not `.dependsOn(_)` jars. - val includes = Seq( + val allowedJars = Seq( "iceberg-shaded_2.12-3.0.0-SNAPSHOT.jar", "scala-library-2.12.15.jar", "scala-collection-compat_2.12-2.1.1.jar", @@ -372,10 +387,8 @@ lazy val iceberg = (project in file("iceberg")) // Return `true` when we want the jar `f` to be excluded from the assembly jar cp.filter { f => - val doExclude = !includes.contains(f.data.getName) - // scalastyle:off println - println(s"${f.data.getName} :: exclude? $doExclude") - // scalastyle:on println + val doExclude = !allowedJars.contains(f.data.getName) + println(s"Excluding jar: ${f.data.getName} ? $doExclude") doExclude } }, @@ -387,22 +400,24 @@ lazy val iceberg = (project in file("iceberg")) // - delta-storage will bring in classes: io/delta/storage // - delta-spark will bring in classes: io/delta/exceptions/, io/delta/implicits, // io/delta/package, io/delta/sql, io/delta/tables, - println(s"BBBB io/delta/$xs") + println(s"Discarding class: io/delta/${xs.mkString("/")}") + MergeStrategy.discard + case PathList("com", "databricks", xs @ _*) => + // delta-spark will bring in com/databricks/spark/util + println(s"Discarding class: com/databricks/${xs.mkString("/")}") MergeStrategy.discard - case PathList("org", "apache", "spark", xs @ _*) => - val includes = Seq("sql/delta/icebergShaded", "org.apache.spark.sql.delta.commands.convert") - val includes = Set( - List("sql", "delta", "icebergShaded"), - List("sql", "delta", "commands", "convert") - ) - println(s"------> ${xs.mkString("/")}") + case PathList("org", "apache", "spark", xs @ _*) + if !deltaIcebergSparkIncludePrefixes.exists { prefix => + s"org/apache/spark/${xs.mkString("/")}".startsWith(prefix) } => + println(s"Discarding class: org/apache/spark/${xs.mkString("/")}") MergeStrategy.discard case x => - println(s"AAAA $x") + println(s"Including class: $x") (assembly / assemblyMergeStrategy).value(x) }, assemblyPackageScala / assembleArtifact := false ) +// scalastyle:on println lazy val generateIcebergJarsTask = TaskKey[Unit]("generateIcebergJars", "Generate Iceberg JARs") diff --git a/testDeltaIcebergJar/src/test/scala/JarSuite.scala b/testDeltaIcebergJar/src/test/scala/JarSuite.scala index 9069cd4a223..2f8dced74dc 100644 --- a/testDeltaIcebergJar/src/test/scala/JarSuite.scala +++ b/testDeltaIcebergJar/src/test/scala/JarSuite.scala @@ -25,23 +25,53 @@ import scala.collection.JavaConverters._ import org.scalatest.funsuite.AnyFunSuite class JarSuite extends AnyFunSuite { + + val allowedClassPrefixes = Seq( + // e.g. shadedForDelta/org/apache/iceberg/BaseTable.class + "shadedForDelta/", + // e.g. scala/collection/compat/immutable/ArraySeq.class + // e.g. scala/jdk/CollectionConverters.class + "scala/", + // e.g. org/apache/spark/sql/delta/icebergShaded/IcebergTransactionUtils.class + "org/apache/spark/sql/delta/icebergShaded/", + // We explicitly include all the /delta/commands/convert classes we want, to ensure we don't + // accidentally pull in some from delta-spark package. + "org/apache/spark/sql/delta/commands/convert/IcebergFileManifest", + "org/apache/spark/sql/delta/commands/convert/IcebergSchemaUtils", + "org/apache/spark/sql/delta/commands/convert/IcebergTable", + // e.g. org/apache/iceberg/transforms/IcebergPartitionUtil.class + "org/apache/iceberg/", + "com/github/benmanes/caffeine/" + ) + test("audit files in assembly jar") { // Step 1: load the jar (and make sure it exists) // scalastyle:off classforname val classUrl = Class.forName("org.apache.spark.sql.delta.icebergShaded.IcebergConverter").getResource("IcebergConverter.class") // scalastyle:on classforname assert(classUrl != null, "Could not find delta-iceberg jar") - println(classUrl) val connection = classUrl.openConnection().asInstanceOf[JarURLConnection] - - val url = connection.getJarFileURL() + val url = connection.getJarFileURL val jarFile = new JarFile(new File(url.toURI)) - // Step 2: Verify the JAR has the classes we want it to ahve + // Step 2: Verify the JAR has the classes we want it to have try { - val set = jarFile.entries().asScala.filter(!_.isDirectory).map(_.toString).toSet - println(set.toList.sorted.mkString("\n")) + val jarClasses = jarFile + .entries() + .asScala + .filter(!_.isDirectory) + .map(_.toString) + .filter(_.endsWith(".class")) // let's ignore any .properties or META-INF files for now + .toSet + + val prohibitedJarClasses = jarClasses + .filter { clazz => !allowedClassPrefixes.exists(prefix => clazz.startsWith(prefix)) } + if (prohibitedJarClasses.nonEmpty) { + throw new Exception( + s"Prohibited jar class(es) found:\n- ${prohibitedJarClasses.mkString("\n- ")}" + ) + } } finally { jarFile.close() }