Merge remote-tracking branch 'origin/master' into better-unidoc
tdas committed Aug 2, 2023
2 parents 3c3d624 + 115abb3 commit 86d2221
Showing 117 changed files with 1,715 additions and 990 deletions.
7 changes: 6 additions & 1 deletion PROTOCOL.md
@@ -383,6 +383,7 @@ dataChange | Boolean | When `false` the records in the removed file must be cont
extendedFileMetadata | Boolean | When `true` the fields `partitionValues`, `size`, and `tags` are present | optional
partitionValues| Map[String, String] | A map from partition column to value for this file. See also [Partition Value Serialization](#Partition-Value-Serialization) | optional
size| Long | The size of this data file in bytes | optional
stats | [Statistics Struct](#Per-file-Statistics) | Contains statistics (e.g., count, min/max values for columns) about the data in this logical file | optional
tags | Map[String, String] | Map containing metadata about this file | optional
deletionVector | [DeletionVectorDescriptor Struct](#Deletion-Vectors) | Either null (or absent in JSON) when no DV is associated with this data file, or a struct (described below) that contains necessary information about the DV that is part of this logical file. | optional
baseRowId | Long | Default generated Row ID of the first row in the file. The default generated Row IDs of the other rows in the file can be reconstructed by adding the physical index of the row within the file to the base Row ID. See also [Row IDs](#row-ids) | optional
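
The `baseRowId` reconstruction described in the row above reduces to simple addition; a minimal illustrative sketch (the values are hypothetical, only the arithmetic comes from the protocol text):

```python
# Reconstructing a default generated Row ID, as described for `baseRowId` above.
base_row_id = 1000        # baseRowId recorded for the data file (hypothetical value)
physical_row_index = 7    # zero-based position of the row within the file (hypothetical value)

default_generated_row_id = base_row_id + physical_row_index  # -> 1007
```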
@@ -990,6 +991,8 @@ Within the checkpoint, the `add` struct may or may not contain the following col
- stats: Column level statistics can be stored as a JSON string in the checkpoint. This field needs to be written when statistics are available and the table property: `delta.checkpoint.writeStatsAsJson` is set to `true` (which is the default). When this property is set to `false`, this field should be omitted from the checkpoint.
- stats_parsed: The stats can be stored in their [original format](#Per-file-Statistics). This field needs to be written when statistics are available and the table property: `delta.checkpoint.writeStatsAsStruct` is set to `true`. When this property is set to `false` (which is the default), this field should be omitted from the checkpoint.

Within the checkpoint, the `remove` struct does not contain the `stats` and `tags` fields because the `remove` actions stored in checkpoints act only as tombstones for VACUUM operations, and VACUUM tombstones do not require `stats` or `tags`. These fields are only stored in Delta JSON commit files.
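
A hedged sketch of how a checkpoint writer might honor the two table properties above when emitting an `add` entry; the function and variable names here are invented for illustration and are not taken from any Delta implementation:

```python
import json

def checkpoint_add_entry(add_action, stats_dict, table_properties):
    """Assemble a checkpoint `add` entry; a sketch only, assuming dict-shaped inputs."""
    entry = dict(add_action)  # path, partitionValues, size, modificationTime, ...

    write_json = table_properties.get("delta.checkpoint.writeStatsAsJson", "true") == "true"
    write_struct = table_properties.get("delta.checkpoint.writeStatsAsStruct", "false") == "true"

    if stats_dict is not None:
        if write_json:
            entry["stats"] = json.dumps(stats_dict)   # stats as a JSON string
        if write_struct:
            entry["stats_parsed"] = stats_dict        # stats in their original struct form
    return entry
```

Per the paragraph above, a corresponding `remove` entry in the checkpoint would carry neither `stats` nor `tags`.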

Refer to the [appendix](#checkpoint-schema) for an example on the schema of the checkpoint.

## Data Files
@@ -1187,7 +1190,7 @@ Bytes | Name | Description
`<start of i> + 4 + dataSize` to `<start of i> + 4 + dataSize + 3` | checksum | CRC-32 checksum of `bitmapData`
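
The checksum in the row above is a plain CRC-32 over the serialized bitmap bytes; a minimal sketch, assuming `bitmap_data` already holds the serialized bitmap and assuming big-endian 4-byte integers for illustration:

```python
import struct
import zlib

bitmap_data = b"\x00" * 16  # placeholder for the serialized bitmap bytes

data_size = struct.pack(">i", len(bitmap_data))                     # 4-byte dataSize
checksum = struct.pack(">I", zlib.crc32(bitmap_data) & 0xFFFFFFFF)  # 4-byte CRC-32 of bitmapData

dv_entry = data_size + bitmap_data + checksum
```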

## Per-file Statistics
`add` actions can optionally contain statistics about the data in the file being added to the table.
`add` and `remove` actions can optionally contain statistics about the data in the file being added to or removed from the table.
These statistics can be used for eliminating files based on query predicates or as inputs to query optimization.

Global statistics record information about the entire file.
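
As an illustration, a statistics value might serialize to JSON along these lines; the column names and numbers are invented, while the `numRecords`, `minValues`, `maxValues`, and `nullCount` field names follow the per-file statistics schema:

```python
import json

# Invented example of a per-file statistics value for a file with two columns.
stats = {
    "numRecords": 1000,
    "minValues": {"id": 1, "event_date": "2023-01-01"},
    "maxValues": {"id": 1000, "event_date": "2023-07-31"},
    "nullCount": {"id": 0, "event_date": 4},
}
stats_json = json.dumps(stats)  # the string stored in the `stats` field of add/remove actions
```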
@@ -1466,6 +1469,8 @@ Observe that `readerFeatures` and `writerFeatures` fields should comply with:
- If a table has Writer Version 7, then a writer must write checkpoints with a not-null `writerFeatures` in the schema.
- If a table has neither of the above, then a writer chooses whether to write `readerFeatures` and/or `writerFeatures` into the checkpoint schema. But if it does, their values must be null.
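
A hedged sketch of the version-based rule in the list above; the function name and return shape are invented, and the Reader Version 3 case is assumed here to mirror the visible Writer Version 7 rule:

```python
def checkpoint_feature_columns(min_reader_version, min_writer_version):
    """Which feature columns a checkpoint writer must include; a sketch, names invented."""
    columns = {}
    if min_reader_version == 3:   # assumed to mirror the Writer Version 7 rule below
        columns["readerFeatures"] = "required, not null"
    if min_writer_version == 7:
        columns["writerFeatures"] = "required, not null"
    # Otherwise the writer may still include either column, but its value must then be null.
    return columns
```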

Note that `remove` actions in the checkpoint are tombstones used only by VACUUM, and do not contain the `stats` and `tags` fields.

For a table that uses column mapping, whether in `id` or `name` mode, the schema of the `add` column will look as follows.

Schema definition:
25 changes: 18 additions & 7 deletions build.sbt
@@ -212,6 +212,14 @@ lazy val kernelApi = (project in file("kernel/kernel-api"))
"com.novocode" % "junit-interface" % "0.11" % "test"
),

// Can be run explicitly via: build/sbt $module/checkstyle
// Will automatically be run during compilation (e.g. build/sbt compile)
// and during tests (e.g. build/sbt test)
checkstyleConfigLocation := CheckstyleConfigLocation.File("kernel/dev/checkstyle.xml"),
checkstyleSeverityLevel := Some(CheckstyleSeverityLevel.Error),
(Compile / checkstyle) := (Compile / checkstyle).triggeredBy(Compile / compile).value,
(Test / checkstyle) := (Test / checkstyle).triggeredBy(Test / compile).value,

// Unidoc settings
unidocSourceFilePatterns := Seq(SourceFilePattern("io/delta/kernel/")),
).configureUnidoc(docTitle = "Delta Kernel")
@@ -235,7 +243,15 @@ lazy val kernelDefault = (project in file("kernel/kernel-default"))
"com.novocode" % "junit-interface" % "0.11" % "test"
),

// Unidoc settings
// Can be run explicitly via: build/sbt $module/checkstyle
// Will automatically be run during compilation (e.g. build/sbt compile)
// and during tests (e.g. build/sbt test)
checkstyleConfigLocation := CheckstyleConfigLocation.File("kernel/dev/checkstyle.xml"),
checkstyleSeverityLevel := Some(CheckstyleSeverityLevel.Error),
(Compile / checkstyle) := (Compile / checkstyle).triggeredBy(Compile / compile).value,
(Test / checkstyle) := (Test / checkstyle).triggeredBy(Test / compile).value,

// Unidoc settings
unidocSourceFilePatterns += SourceFilePattern("io/delta/kernel/"),
).configureUnidoc(docTitle = "Delta Kernel Defaults")

@@ -284,10 +300,6 @@ lazy val storageS3DynamoDB = (project in file("storage-s3-dynamodb"))
)
).configureUnidoc()

/*
// TODO: Investigate a smarter way to pull the Iceberg github.
// Make sure to add `iceberg` back to `sparkGroup` below.
val icebergSparkRuntimeArtifactName = {
val (expMaj, expMin, _) = getMajorMinorPatch(sparkVersion)
s"iceberg-spark-runtime-$expMaj.$expMin"
@@ -349,7 +361,6 @@ lazy val icebergShaded = (project in file("icebergShaded"))
assemblyPackageScala / assembleArtifact := false,
// Make the 'compile' invoke the 'assembly' task to generate the uber jar.
)
*/

lazy val hive = (project in file("connectors/hive"))
.dependsOn(standaloneCosmetic)
@@ -972,7 +983,7 @@ val createTargetClassesDir = taskKey[Unit]("create target classes dir")

// Don't use these groups for any other projects
lazy val sparkGroup = project
.aggregate(spark, contribs, storage, storageS3DynamoDB) /* iceberg */
.aggregate(spark, contribs, storage, storageS3DynamoDB, iceberg)
.settings(
// crossScalaVersions must be set to Nil on the aggregating project
crossScalaVersions := Nil,
29 changes: 29 additions & 0 deletions dev/checkstyle-suppressions.xml
@@ -0,0 +1,29 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<!DOCTYPE suppressions PUBLIC
"-//Puppy Crawl//DTD Suppressions 1.1//EN"
"https://checkstyle.org/dtds/suppressions_1_1.dtd">

<!--
This file contains suppression rules for Checkstyle checks.
Ideally only files that cannot be modified (e.g. third-party code)
should be added here. All other violations should be fixed.
-->

<suppressions>
</suppressions>
87 changes: 61 additions & 26 deletions icebergShaded/generate_iceberg_jars.py
@@ -29,13 +29,17 @@
iceberg_patches_dir_name = "iceberg_src_patches"

iceberg_src_commit_hash = "ede085d0f7529f24acd0c81dd0a43f7bb969b763"
iceberg_src_branch_with_commit_hash = "master" # only this branch will be downloaded
iceberg_src_compiled_jar_rel_paths = [ # related to `iceberg_src_dir_name`
"bundled-guava/build/libs/iceberg-bundled-guava-1.2.0-SNAPSHOT.jar",
"common/build/libs/iceberg-common-1.2.0-SNAPSHOT.jar",
"api/build/libs/iceberg-api-1.2.0-SNAPSHOT.jar",
"core/build/libs/iceberg-core-1.2.0-SNAPSHOT.jar",
"parquet/build/libs/iceberg-parquet-1.2.0-SNAPSHOT.jar",
iceberg_src_branch = "master" # only this branch will be downloaded

# Relative to iceberg_src directory.
# We use * because after applying the patches, a random git hash will be appended to each jar name.
# Thus, for all usages below, we must search for these jar files using `glob.glob(pattern)`
iceberg_src_compiled_jar_rel_glob_patterns = [
"bundled-guava/build/libs/iceberg-bundled-guava-*.jar",
"common/build/libs/iceberg-common-*.jar",
"api/build/libs/iceberg-api-*.jar",
"core/build/libs/iceberg-core-*.jar",
"parquet/build/libs/iceberg-parquet-*.jar",
]

iceberg_root_dir = path.abspath(path.dirname(__file__))
@@ -44,32 +48,50 @@
iceberg_lib_dir = path.join(iceberg_root_dir, iceberg_lib_dir_name)


def compile_jar_rel_path_to_lib_jar_path(jar_rel_path):
jar_file_name = path.basename(path.normpath(jar_rel_path))
jar_file_name_splits = path.splitext(jar_file_name)
new_jar_file_name = "%s_%s%s" % (jar_file_name_splits[0], iceberg_src_commit_hash, jar_file_name_splits[1])
return path.join(iceberg_lib_dir, new_jar_file_name)


def iceberg_jars_exists():
for jar_rel_path in iceberg_src_compiled_jar_rel_paths:
if not path.exists(compile_jar_rel_path_to_lib_jar_path(jar_rel_path)):
for compiled_jar_rel_glob_pattern in iceberg_src_compiled_jar_rel_glob_patterns:
jar_file_name_pattern = path.basename(path.normpath(compiled_jar_rel_glob_pattern))
lib_jar_abs_pattern = path.join(iceberg_lib_dir, jar_file_name_pattern)
results = glob.glob(lib_jar_abs_pattern)

if len(results) > 1:
raise Exception("More jars than expected: " + str(results))

if len(results) == 0:
return False

return True


def set_git_config_if_empty(config_key, default_value):
curr_val = None
try:
(_, curr_val, _) = run_cmd("git config --get user.%s" % config_key)
curr_val = curr_val.decode("utf-8")
except:
print("Error getting user.%s" % config_key)
if not curr_val:
run_cmd("git config user.%s \"%s\"" % (config_key, default_value))


def prepare_iceberg_source():
with WorkingDirectory(iceberg_root_dir):
print(">>> Cloning Iceberg repo")
shutil.rmtree(iceberg_src_dir_name, ignore_errors=True)
run_cmd("git config user.email \"<>\"")
run_cmd("git config user.name \"Anonymous\"")
run_cmd("git clone --branch %s https://github.com/apache/iceberg.git %s" %
(iceberg_src_branch_with_commit_hash, iceberg_src_dir_name))

set_git_config_if_empty("email", "<>")
set_git_config_if_empty("name", "Anonymous")

# We just want the shallowest, smallest iceberg clone. We will check out the commit later.
run_cmd("git clone --depth 1 --branch %s https://github.com/apache/iceberg.git %s" %
(iceberg_src_branch, iceberg_src_dir_name))

with WorkingDirectory(iceberg_src_dir):
run_cmd("git config user.email \"<>\"")
run_cmd("git config user.name \"Anonymous\"")

# Fetch just the single commit (shallow)
run_cmd("git fetch origin %s --depth 1" % iceberg_src_commit_hash)
run_cmd("git checkout %s" % iceberg_src_commit_hash)

print(">>> Applying patch files")
@@ -95,12 +117,25 @@ def generate_iceberg_jars():
shutil.rmtree(iceberg_lib_dir, ignore_errors=True)
os.mkdir(iceberg_lib_dir)

for compiled_jar_rel_path in iceberg_src_compiled_jar_rel_paths:
compiled_jar_full_path = path.join(iceberg_src_dir, compiled_jar_rel_path)
if not path.exists(compiled_jar_full_path):
raise Exception("Could not find the jar " + compiled_jar_full_path)
lib_jar_full_path = compile_jar_rel_path_to_lib_jar_path(compiled_jar_rel_path)
shutil.copyfile(compiled_jar_full_path, lib_jar_full_path)
# For each relative pattern p ...
for compiled_jar_rel_glob_pattern in iceberg_src_compiled_jar_rel_glob_patterns:
# Get the absolute pattern
compiled_jar_abs_pattern = path.join(iceberg_src_dir, compiled_jar_rel_glob_pattern)
# Search for all glob results
results = glob.glob(compiled_jar_abs_pattern)
# Compiled jars will include tests, sources, javadocs; exclude them
results = list(filter(lambda result: all(x not in result for x in ["test", "source", "javadoc"]), results))

if len(results) == 0:
raise Exception("Could not find the jar: " + compled_jar_rel_glob_pattern)
if len(results) > 1:
raise Exception("More jars created than expected: " + str(results))

# Copy the one jar result into the <iceberg root>/lib directory
compiled_jar_abs_path = results[0]
compiled_jar_name = path.basename(path.normpath(compiled_jar_abs_path))
lib_jar_abs_path = path.join(iceberg_lib_dir, compiled_jar_name)
shutil.copyfile(compiled_jar_abs_path, lib_jar_abs_path)

if not iceberg_jars_exists():
raise Exception("JAR copying failed")
2 changes: 1 addition & 1 deletion kernel/kernel-api/src/main/java/io/delta/kernel/Scan.java
@@ -32,7 +32,6 @@
import io.delta.kernel.data.Row;
import io.delta.kernel.expressions.Expression;
import io.delta.kernel.expressions.Literal;
import io.delta.kernel.internal.data.SelectionColumnVector;
import io.delta.kernel.types.StructField;
import io.delta.kernel.types.StructType;
import io.delta.kernel.utils.CloseableIterator;
@@ -42,6 +41,7 @@
import io.delta.kernel.internal.actions.DeletionVectorDescriptor;
import io.delta.kernel.internal.data.AddFileColumnarBatch;
import io.delta.kernel.internal.data.ScanStateRow;
import io.delta.kernel.internal.data.SelectionColumnVector;
import io.delta.kernel.internal.deletionvectors.DeletionVectorUtils;
import io.delta.kernel.internal.deletionvectors.RoaringBitmapArray;
import io.delta.kernel.internal.util.PartitionUtils;
@@ -16,8 +16,8 @@

package io.delta.kernel.client;

import java.io.FileNotFoundException;
import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import io.delta.kernel.fs.FileStatus;
@@ -22,14 +22,14 @@
import io.delta.kernel.ScanBuilder;
import io.delta.kernel.client.TableClient;
import io.delta.kernel.expressions.Expression;
import io.delta.kernel.internal.fs.Path;
import io.delta.kernel.types.StructType;
import io.delta.kernel.utils.CloseableIterator;
import io.delta.kernel.utils.Tuple2;

import io.delta.kernel.internal.actions.AddFile;
import io.delta.kernel.internal.actions.Metadata;
import io.delta.kernel.internal.actions.Protocol;
import io.delta.kernel.internal.fs.Path;
import io.delta.kernel.internal.lang.Lazy;

/**
@@ -27,7 +27,6 @@
import io.delta.kernel.data.ColumnarBatch;
import io.delta.kernel.data.Row;
import io.delta.kernel.expressions.Expression;
import io.delta.kernel.expressions.Literal;
import io.delta.kernel.types.StructType;
import io.delta.kernel.utils.CloseableIterator;
import io.delta.kernel.utils.Tuple2;
@@ -18,12 +18,12 @@
import io.delta.kernel.ScanBuilder;
import io.delta.kernel.Snapshot;
import io.delta.kernel.client.TableClient;
import io.delta.kernel.internal.fs.Path;
import io.delta.kernel.types.StructType;
import io.delta.kernel.utils.Tuple2;

import io.delta.kernel.internal.actions.Metadata;
import io.delta.kernel.internal.actions.Protocol;
import io.delta.kernel.internal.fs.Path;
import io.delta.kernel.internal.lang.Lazy;
import io.delta.kernel.internal.replay.LogReplay;
import io.delta.kernel.internal.snapshot.LogSegment;
@@ -24,10 +24,10 @@
import io.delta.kernel.types.MapType;
import io.delta.kernel.types.StringType;
import io.delta.kernel.types.StructType;
import static io.delta.kernel.utils.Utils.requireNonNull;

import io.delta.kernel.internal.fs.Path;

import static io.delta.kernel.utils.Utils.requireNonNull;

/**
* Delta log action representing an `AddCDCFile`
@@ -20,16 +20,16 @@
import java.util.Optional;

import io.delta.kernel.data.Row;
import io.delta.kernel.internal.fs.Path;
import io.delta.kernel.types.BooleanType;
import io.delta.kernel.types.LongType;
import io.delta.kernel.types.MapType;
import io.delta.kernel.types.StringType;
import io.delta.kernel.types.StructType;
import io.delta.kernel.utils.Utils;

import static io.delta.kernel.utils.Utils.requireNonNull;

import io.delta.kernel.internal.fs.Path;


/**
* Delta log action representing an `AddFile`
*/
@@ -26,16 +26,17 @@
import java.util.function.Function;

import io.delta.kernel.data.Row;
import io.delta.kernel.internal.data.PojoRow;
import io.delta.kernel.internal.deletionvectors.Base85Codec;
import io.delta.kernel.internal.fs.Path;
import io.delta.kernel.types.IntegerType;
import io.delta.kernel.types.LongType;
import io.delta.kernel.types.StringType;
import io.delta.kernel.types.StructType;
import static io.delta.kernel.internal.util.InternalUtils.checkArgument;
import static io.delta.kernel.utils.Utils.requireNonNull;

import io.delta.kernel.internal.data.PojoRow;
import io.delta.kernel.internal.deletionvectors.Base85Codec;
import io.delta.kernel.internal.fs.Path;
import static io.delta.kernel.internal.util.InternalUtils.checkArgument;

/** Information about a deletion vector attached to a file action. */
public class DeletionVectorDescriptor {

@@ -239,7 +240,8 @@ public Row toRow() {
// Static fields to create PojoRows
////////////////////////////////////////////////////////////////////////////////

private static final Map<Integer, Function<DeletionVectorDescriptor, Object>> ordinalToAccessor = new HashMap<>();
private static final Map<Integer, Function<DeletionVectorDescriptor, Object>>
ordinalToAccessor = new HashMap<>();

static {
ordinalToAccessor.put(0, (a) -> a.getStorageType());
@@ -17,12 +17,10 @@

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Objects;
import static java.util.Objects.requireNonNull;

import io.delta.kernel.internal.lang.Lazy;

import static java.util.Objects.requireNonNull;

public abstract class FileAction implements Action
{
protected final String path;
@@ -19,7 +19,6 @@
import io.delta.kernel.types.MapType;
import io.delta.kernel.types.StringType;
import io.delta.kernel.types.StructType;

import static io.delta.kernel.utils.Utils.requireNonNull;

public class Format
