Merge branch 'develop'
kupferk committed Sep 9, 2022
2 parents 8d5e019 + 2e909a9 commit db59d45
Showing 236 changed files with 4,405 additions and 689 deletions.
33 changes: 23 additions & 10 deletions BUILDING.md
@@ -94,6 +94,7 @@ using the correct version. The following profiles are available:
* spark-3.0
* spark-3.1
* spark-3.2
* spark-3.3
* hadoop-2.6
* hadoop-2.7
* hadoop-2.8
@@ -123,61 +124,73 @@ mvn install -Djava.version=1.8
### Spark 2.4 and Hadoop 2.6:

```shell
mvn clean install -Pspark-2.4 -Phadoop-2.6
mvn clean install -Pspark-2.4 -Phadoop-2.6 -DskipTests
```

### Spark 2.4 and Hadoop 2.7:

```shell
mvn clean install -Pspark-2.4 -Phadoop-2.7
mvn clean install -Pspark-2.4 -Phadoop-2.7 -DskipTests
```

### Spark 2.4 and Hadoop 2.8:

```shell
mvn clean install -Pspark-2.4 -Phadoop-2.8
mvn clean install -Pspark-2.4 -Phadoop-2.8 -DskipTests
```

### Spark 2.4 and Hadoop 2.9:

```shell
mvn clean install -Pspark-2.4 -Phadoop-2.9
mvn clean install -Pspark-2.4 -Phadoop-2.9 -DskipTests
```

### Spark 3.0 and Hadoop 3.1

```shell
mvn clean install -Pspark-3.0 -Phadoop-3.1
mvn clean install -Pspark-3.0 -Phadoop-3.1 -DskipTests
```

### Spark 3.0 and Hadoop 3.2

```shell
mvn clean install -Pspark-3.0 -Phadoop-3.2
mvn clean install -Pspark-3.0 -Phadoop-3.2 -DskipTests
```

### Spark 3.1 and Hadoop 2.7

```shell
mvn clean install -Pspark-3.1 -Phadoop-2.7
mvn clean install -Pspark-3.1 -Phadoop-2.7 -DskipTests
```

### Spark 3.1 and Hadoop 3.2

```shell
mvn clean install -Pspark-3.1 -Phadoop-3.2
mvn clean install -Pspark-3.1 -Phadoop-3.2 -DskipTests
```

### Spark 3.2 and Hadoop 2.7

```shell
mvn clean install -Pspark-3.2 -Phadoop-2.7
mvn clean install -Pspark-3.2 -Phadoop-2.7 -DskipTests
```

### Spark 3.2 and Hadoop 3.3

```shell
mvn clean install -Pspark-3.2 -Phadoop-3.3
mvn clean install -Pspark-3.2 -Phadoop-3.3 -Dhadoop.version=3.3.1 -DskipTests
```

### Spark 3.3 and Hadoop 2.7

```shell
mvn clean install -Pspark-3.3 -Phadoop-2.7 -DskipTests
```

### Spark 3.3 and Hadoop 3.3

```shell
mvn clean install -Pspark-3.3 -Phadoop-3.3 -Dhadoop.version=3.3.2 -DskipTests
```
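
Where a specific JDK is required, the profile flags above can be combined with the `java.version` property shown earlier in this file. A minimal sketch follows; the exact Spark/Hadoop/JDK combination is only an assumed example:

```shell
# Hypothetical example: build against JDK 8 for Spark 2.4 / Hadoop 2.7
mvn clean install -Pspark-2.4 -Phadoop-2.7 -Djava.version=1.8 -DskipTests
```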


18 changes: 18 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,21 @@
# Version 0.27.0 - 2022-09-09

* github-232: [BUG] Column descriptions should be propagated in UNIONs
* github-233: [BUG] Missing Hadoop dependencies for S3, Delta, etc
* github-235: Implement new `rest` hook with fine control
* github-229: A build target should not fail if Impala "COMPUTE STATS" fails
* github-236: 'copy' target should not apply output schema
* github-237: jdbcQuery relation should use fields "sql" and "file" instead of "query"
* github-239: Allow optional SQL statement for creating jdbcTable
* github-238: Implement new 'jdbcCommand' target
* github-240: [BUG] Data quality checks in documentation should not fail on NULL values
* github-241: Throw an error on duplicate entity definitions
* github-220: Upgrade Delta-Lake to 2.0 / 2.1
* github-242: Switch to Spark 3.3 as default
* github-243: Use alternative Spark MS SQL Connector for Spark 3.3
* github-244: Generate project HTML documentation with optional external CSS file


# Version 0.26.1 - 2022-08-03

* github-226: Upgrade to Spark 3.2.2
30 changes: 13 additions & 17 deletions build-release.sh
@@ -10,11 +10,7 @@ mkdir -p release


build_profile() {
profiles=""
for p in $@
do
profiles="$profiles -P$p"
done
profiles=$@

# Set new version
HADOOP_DIST=$(mvn $profiles -q -N help:evaluate -Dexpression=hadoop.dist -DforceStdout)
@@ -35,22 +31,22 @@ build_profile() {


export JAVA_HOME=/usr/lib/jvm/java-1.8.0
build_profile hadoop-2.6 spark-2.4
build_profile hadoop-2.7 spark-2.4
build_profile -phadoop-2.6 -pspark-2.4
build_profile -phadoop-2.7 -pspark-2.4

export JAVA_HOME=
build_profile hadoop-2.7 spark-3.0
build_profile hadoop-3.2 spark-3.0
build_profile hadoop-2.7 spark-3.1
build_profile hadoop-3.2 spark-3.1
build_profile hadoop-2.7 spark-3.2
build_profile hadoop-3.3 spark-3.2
build_profile hadoop-2.7 spark-3.3
build_profile hadoop-3.3 spark-3.3
build_profile -phadoop-2.7 -pspark-3.0
build_profile -phadoop-3.2 -pspark-3.0
build_profile -phadoop-2.7 -pspark-3.1
build_profile -phadoop-3.2 -pspark-3.1
build_profile -phadoop-2.7 -pspark-3.2
build_profile -phadoop-3.3 -pspark-3.2 -Dhadoop.version=3.3.1
build_profile -phadoop-2.7 -pspark-3.3
build_profile -phadoop-3.3 -pspark-3.3 -Dhadoop.version=3.3.2

export JAVA_HOME=/usr/lib/jvm/java-1.8.0
build_profile CDH-6.3
build_profile CDP-7.1
build_profile -pCDH-6.3
build_profile -pCDP-7.1

# Finally build default version
export JAVA_HOME=
6 changes: 5 additions & 1 deletion docker/Dockerfile
@@ -42,7 +42,11 @@ RUN tar -C /opt --owner=root --group=root -xzf /tmp/repo/flowman-dist.tar.gz \
&& chown -R flowman:flowman ~flowman/examples \
# Create new var directory with write permissions for "flowman" user
&& mkdir ${FLOWMAN_HOME}/var \
&& chown flowman:flowman ${FLOWMAN_HOME}/var
&& chown flowman:flowman ${FLOWMAN_HOME}/var \
# Install missing Hadoop dependencies, which are required for S3, DeltaLake etc
&& ${FLOWMAN_HOME}/bin/install-hadoop-dependencies \
&& rm -rf ~/.m2 \
&& chown -R root:root $SPARK_HOME/jars

USER flowman
WORKDIR /home/flowman
8 changes: 4 additions & 4 deletions docker/pom.xml
@@ -10,7 +10,7 @@
<parent>
<groupId>com.dimajix.flowman</groupId>
<artifactId>flowman-root</artifactId>
<version>0.26.1</version>
<version>0.27.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -33,16 +33,16 @@
</profile>
<profile>
<id>spark-3.2</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<properties>
<!-- The Spark 3.2 archives continue to have a wrong file name -->
<spark-hadoop-archive.version>3.2</spark-hadoop-archive.version>
</properties>
</profile>
<profile>
<id>spark-3.3</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<properties>
<!-- The Spark 3.3 archives have a generic Hadoop version-->
<spark-hadoop-archive.version>3</spark-hadoop-archive.version>
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -61,9 +61,9 @@
# built documents.
#
# The short X.Y version.
version = '0.26'
version = '0.27'
# The full version, including alpha/beta/rc tags.
release = '0.26.0'
release = '0.27.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
118 changes: 118 additions & 0 deletions docs/cookbook/advanced-jdbc.md
@@ -0,0 +1,118 @@
# Using advanced Features of JDBC Databases

Flowman already provides very robust support for dealing with relational databases, both as data sources and as data sinks.
But when writing into a relational database, you might eventually find yourself in a situation where Flowman does
not support some special or exotic feature of the database which you require. In this case you need
more control than the standard approach of using the [`jdbcTable`](../spec/relation/jdbcTable.md),
[`jdbcQuery`](../spec/relation/jdbcQuery.md) and [`jdbcView`](../spec/relation/jdbcView.md) relations provides.

There are two main situations where the abstraction provided by Flowman might hide special features that you require:
* You need more control over the `CREATE TABLE` statement
* You need to execute additional commands as part of your data build process

Flowman offers support for these two scenarios.


## Full Control for `CREATE TABLE` Statements

Starting with Flowman 0.27.0, you can explicitly specify the `CREATE TABLE` statement(s) in a [`jdbcTable`](../spec/relation/jdbcTable.md)
relation, which will then be used instead of Flowman's standard mechanism for assembling the SQL statements:

```yaml
relations:
  frontend_users:
    kind: jdbcTable
    # Directly embed a connection
    connection:
      kind: jdbc
      driver: "$frontend_db_driver"
      url: "$frontend_db_url"
      username: "$frontend_db_username"
      password: "$frontend_db_password"
    # Specify the table
    table: "frontend_users"
    sql:
      - |
        CREATE TABLE dbo.frontend_users(
            "id" BIGINT,
            "description" CLOB,
            "flags" INTEGER,
            "name" VARCHAR(32)
        )
      - CREATE CLUSTERED COLUMNSTORE INDEX CI_frontend_users ON dbo.frontend_users
      - ALTER TABLE dbo.frontend_users ADD CONSTRAINT PK_frontend_users PRIMARY KEY NONCLUSTERED(id);
```

In this case, Flowman will only use the given SQL statements for creating the table. This gives you full control, but at the
same time it completely disables automatic migrations.

## Executing arbitrary SQL Statements

In addition to giving you more control over the `CREATE TABLE` statement, since version 0.27.0 Flowman has also
implemented a generic [`jdbcCommand` target](../spec/target/jdbcCommand.md) for executing arbitrary SQL statements.

The following example will create and manage an MS SQL Server full-text catalog and index:
```yaml
targets:
  fulltext-catalog:
    kind: jdbcCommand
    connection: sql_server
    # Create Fulltext Catalog
    create:
      # Check that catalog does not already exist
      condition: |
        SELECT 1 FROM sys.fulltext_catalogs
        WHERE name = 'ftcat'
        HAVING COUNT(*) = 0
      sql: |
        CREATE FULLTEXT CATALOG ftcat
    # Remove fulltext catalog
    destroy:
      # Check that catalog really exists
      condition: |
        SELECT 1 FROM sys.fulltext_catalogs
        WHERE name = 'ftcat'
        HAVING COUNT(*) = 1
      sql: |
        DROP FULLTEXT CATALOG ftcat

  tweets-index:
    kind: jdbcCommand
    connection: sql_server
    # We require both the fulltext catalog and the base table
    after:
      - fulltext-catalog
      - tweets-mssql
    # Create Index
    create:
      # Check that index does not already exist
      condition: |
        SELECT 1 FROM sys.fulltext_indexes i
        WHERE i.object_id = OBJECT_ID('dbo.tweets')
        HAVING COUNT(*) = 0
      sql: |
        CREATE FULLTEXT INDEX ON dbo.tweets
        (
            text,
            user_description
        )
        KEY INDEX PK_tweets_id ON ftcat
        WITH CHANGE_TRACKING OFF
    # Fill index by starting background indexing process
    build:
      sql: |
        ALTER FULLTEXT INDEX ON dbo.tweets START FULL POPULATION
    # Delete index
    destroy:
      # Check that index really exists
      condition: |
        SELECT 1 FROM sys.fulltext_indexes i
        WHERE i.object_id = OBJECT_ID('dbo.tweets')
        HAVING COUNT(*) = 1
      sql: |
        DROP FULLTEXT INDEX ON dbo.tweets
```

As you can see, the `jdbcCommand` target supports different SQL commands for each lifecycle phase
(`CREATE`, `BUILD`, ...), and you can also specify an optional `condition` which decides whether the
corresponding SQL should be executed. This way you can avoid creating tables multiple times, etc., by first
checking whether the table already exists.
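
For example, a minimal sketch of a guarded `CREATE TABLE` following the same pattern (the connection name, table layout and metadata query are assumptions and depend on your database):

```yaml
targets:
  history-table:
    kind: jdbcCommand
    connection: sql_server
    create:
      # Only create the table if it does not exist yet (assumed MS SQL Server metadata query)
      condition: |
        SELECT 1 FROM information_schema.tables
        WHERE table_schema = 'dbo' AND table_name = 'history'
        HAVING COUNT(*) = 0
      sql: |
        CREATE TABLE dbo.history(
            "id" BIGINT,
            "event" VARCHAR(64),
            "ts" DATETIME2
        )
```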
27 changes: 27 additions & 0 deletions docs/cookbook/hadoop-dependencies.md
@@ -0,0 +1,27 @@
# Installing additional Hadoop Dependencies

Starting with version 3.2, Spark has reduced the number of Hadoop libraries which are part of the downloadable Spark
distribution. Unfortunately, some of the libraries which have been removed are required by some Flowman plugins (for
example, the S3 and Delta plugins need the `hadoop-common` library). Since Flowman, for good reasons, does not bundle
these missing libraries itself, you have to install them yourself and put them into the
`$SPARK_HOME/jars` folder.
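
If you prefer a manual installation, the procedure is simply copying the missing jars into that folder. A minimal sketch follows; the jar list, the Hadoop version and the download URL are assumptions and must match your Flowman build:

```shell
# Hypothetical manual installation of one missing Hadoop library
HADOOP_VERSION=3.3.2   # assumed version matching your Flowman build
curl -L -O "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar"
cp "hadoop-common-${HADOOP_VERSION}.jar" "$SPARK_HOME/jars/"
```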


## Automated Installation

In order to simplify getting the appropriate Hadoop libraries and placing them into the correct Spark directory,
Flowman provides a small script called `install-hadoop-dependencies`, which will download and install the missing
jars:

```shell
export SPARK_HOME=your-spark-home

cd $FLOWMAN_HOME
bin/install-hadoop-dependencies
```

Note that you need appropriate write permissions for the `$SPARK_HOME/jars` directory, so you may need
to execute this script with super-user privileges.

Also note that this script will download and install the Hadoop libraries matching the version that Flowman was built
with, not the version of the Hadoop libraries that are already installed.
4 changes: 2 additions & 2 deletions docs/cookbook/override-jars.md
@@ -18,8 +18,8 @@ You need to add the following lines to your custom `flowman-env.sh` file which i
# Add MS SQL JDBC Driver. Normally this is handled by the plugin mechanism, but Cloudera already provides some
# old version of the JDBC driver, and this is the only place where we can force to use our JDBC driver
SPARK_JARS="$FLOWMAN_HOME/plugins/flowman-mssqlserver/mssql-jdbc-9.2.1.jre8.jar"
SPARK_OPTS="--conf spark.executor.extraClassPath=mssql-jdbc-9.2.1.jre8.jar"
SPARK_OPTS="--conf spark.executor.extraClassPath=mssql-jdbc-9.2.1.jre8.jar --conf spark.driver.extraClassPath=$FLOWMAN_HOME/plugins/flowman-mssqlserver/mssql-jdbc-9.2.1.jre8.jar"
```
The first line will explicitly add the plugin jar to the list of jars as passed to `spark-submit`. But this is still
not enough, we also have to set `spark.executor.extraClassPath` which will *prepend* the specified jars to the
not enough, we also have to set `spark.executor.extraClassPath` and `spark.driver.extraClassPath` which will *prepend* the specified jars to the
classpath of the executor.
7 changes: 7 additions & 0 deletions docs/documenting/config.md
@@ -24,6 +24,7 @@ generators:
  # Create an output file in the project directory
  - kind: file
    location: ${project.basedir}/doc
    template: html+css
    # This will exclude all mappings
    excludeMappings: ".*"
    excludeRelations:
@@ -49,6 +50,12 @@ differently configured documentations.

* `kind` **(mandatory)** *(type: string)*: `file`

* `template` **(optional)** *(type: string)* *(default: html)*:
  Specifies the template to use. Currently, Flowman provides three predefined templates:
  * `text` - Single file plain text documentation
  * `html` - Single file HTML page with inline CSS
  * `html+css` - Single file HTML page with an additional CSS file

* `location` **(mandatory)** *(type: string)*: Specifies the output location

* `includeMappings` **(optional)** *(type: list:regex)* *(default: ".*")*: