[SC-5599] Inline spark-redshift sources into databricks/spark
This patch ports `spark-redshift` as of databricks/spark-avro@b01a034 and updates it to run with Spark 2.1.0.

I didn't make any attempt to clean up the library, remove dead code, or modernize its dependencies. Each of these tasks will take some time, and I'm hoping to delegate them once we have the infrastructure pieces in place.

Credentials are stored securely in Jenkins. If you need a copy of them to do local development, talk to Josh and he'll share them via ZeroBin (this will be done via LastPass in the future).

Author: Josh Rosen <joshrosen@databricks.com>

Closes apache#171 from JoshRosen/add-spark-redshift.
JoshRosen committed Jan 17, 2017
1 parent 227ff64 commit f9c9cd8
Showing 54 changed files with 6,128 additions and 6 deletions.
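For context, the snippet below is a minimal usage sketch of the ported data source; it is not part of this patch. It assumes the data source is registered under its package name and takes the url/dbtable/tempdir options, as in the upstream spark-redshift library; the object name, cluster endpoint, table name, and S3 bucket are placeholders.

import org.apache.spark.sql.SparkSession

object RedshiftReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("spark-redshift sketch").master("local[*]").getOrCreate()

    // S3 credentials for the temp directory; the integration suite in this patch instead
    // encodes them directly into the tempdir URI (s3n://KEY:SECRET@bucket/path).
    spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", sys.env("AWS_ACCESS_KEY_ID"))
    spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", sys.env("AWS_SECRET_ACCESS_KEY"))

    // Read a Redshift table by unloading it through the S3 tempdir; writes use the same
    // options via df.write.format("com.databricks.spark.redshift").
    val df = spark.read
      .format("com.databricks.spark.redshift")
      .option("url", "jdbc:redshift://example-cluster:5439/dev?user=user&password=pass") // placeholder endpoint
      .option("dbtable", "my_table")                                                     // or option("query", ...)
      .option("tempdir", "s3n://my-bucket/tmp/spark-redshift/")                          // placeholder scratch space
      .load()

    df.show()
  }
}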
@@ -0,0 +1,19 @@
/*
* Copyright (C) 2016 Databricks, Inc.
*
* Portions of this software incorporate or are derived from software contained within Apache Spark,
* and this modified software differs from the Apache Spark software provided under the Apache
* License, Version 2.0, a copy of which you may obtain at
* http://www.apache.org/licenses/LICENSE-2.0
*/

package org.apache.spark.tags;

import java.lang.annotation.*;

import org.scalatest.TagAnnotation;

@TagAnnotation
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.METHOD, ElementType.TYPE})
public @interface ExtendedRedshiftTest { }
1 change: 1 addition & 0 deletions dev/.rat-excludes
@@ -103,3 +103,4 @@ org.apache.spark.scheduler.ExternalClusterManager
org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
spark-warehouse
structured-streaming/*
install-redshift-jdbc.sh
21 changes: 21 additions & 0 deletions dev/install-redshift-jdbc.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Download the Amazon Redshift JDBC driver and install it into the local Maven
# repository so that the redshift-integration-tests module can resolve it.

set -e

SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"

cd /tmp

VERSION='1.1.7.1007'
FILENAME="RedshiftJDBC4-$VERSION.jar"

wget "https://s3.amazonaws.com/redshift-downloads/drivers/$FILENAME"

"$SPARK_ROOT_DIR/build/mvn" install:install-file \
  -Dfile="$FILENAME" \
  -DgroupId=com.amazonaws \
  -DartifactId=redshift.jdbc4 \
  -Dversion="$VERSION" \
  -Dpackaging=jar

5 changes: 4 additions & 1 deletion dev/run-tests.py
@@ -111,7 +111,8 @@ def determine_modules_to_test(changed_modules):
>>> x = [x.name for x in determine_modules_to_test([modules.sql])]
>>> x # doctest: +NORMALIZE_WHITESPACE
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'sql-kafka-0-8', 'examples',
'hive-thriftserver', 'pyspark-sql', 'sparkr', 'pyspark-mllib', 'pyspark-ml']
'hive-thriftserver', 'pyspark-sql', 'redshift', 'sparkr', 'pyspark-mllib',
'redshift-integration-tests', 'pyspark-ml']
"""
modules_to_test = set()
for module in changed_modules:
@@ -512,6 +513,8 @@ def main():
test_env = "amplab_jenkins"
# add path for Python3 in Jenkins if we're calling from a Jenkins machine
os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
# Install Redshift JDBC
run_cmd([os.path.join(SPARK_HOME, "dev", "install-redshift-jdbc.sh")])
else:
# else we're running locally and can use local settings
build_tool = "sbt"
25 changes: 25 additions & 0 deletions dev/sparktestsupport/modules.py
@@ -168,6 +168,31 @@ def __hash__(self):
]
)

redshift = Module(
name="redshift",
dependencies=[avro, sql],
source_file_regexes=[
"external/redshift",
],
sbt_test_goals=[
"redshift/test",
],
test_tags=[
"org.apache.spark.tags.ExtendedRedshiftTest"
],
)

redshift_integration_tests = Module(
name="redshift-integration-tests",
dependencies=[redshift],
source_file_regexes=[
"external/redshift-integration-tests",
],
sbt_test_goals=[
"redshift-integration-tests/test",
],
)

sql_kafka = Module(
name="sql-kafka-0-10",
dependencies=[sql],
144 changes: 144 additions & 0 deletions external/redshift-integration-tests/pom.xml
@@ -0,0 +1,144 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
/*
* Copyright (C) 2016 Databricks, Inc.
*
* Portions of this software incorporate or are derived from software contained within Apache Spark,
* and this modified software differs from the Apache Spark software provided under the Apache
* License, Version 2.0, a copy of which you may obtain at
* http://www.apache.org/licenses/LICENSE-2.0
*/
-->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.1.0</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<groupId>com.databricks</groupId>
<artifactId>spark-redshift-integration-tests_2.11</artifactId>
<properties>
<sbt.project.name>redshift-integration-tests</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Redshift Integration Tests</name>
<url>http://spark.apache.org/</url>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-avro_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-redshift_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-redshift_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<!-- The AWS Java SDK version here should match (or be below) the version used in DBC images -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-core</artifactId>
<version>1.9.40</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>1.9.40</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-sts</artifactId>
<version>1.9.40</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.eclipsesource.minimal-json</groupId>
<artifactId>minimal-json</artifactId>
<version>0.9.4</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<!-- This dependency is installed using ./dev/install-redshift-jdbc.sh -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>redshift.jdbc4</artifactId>
<version>1.1.7.1007</version>
<type>jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-tags_${scala.binary.version}</artifactId>
<type>test-jar</type>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
</build>
</project>
@@ -0,0 +1,52 @@
/*
* Copyright (C) 2016 Databricks, Inc.
*
* Portions of this software incorporate or are derived from software contained within Apache Spark,
* and this modified software differs from the Apache Spark software provided under the Apache
* License, Version 2.0, a copy of which you may obtain at
* http://www.apache.org/licenses/LICENSE-2.0
*/

package com.databricks.spark.redshift

import java.net.URI

import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.tags.ExtendedRedshiftTest

/**
* This suite performs basic integration tests where the AWS credentials have been
* encoded into the tempdir URI rather than being set in the Hadoop configuration.
*/
@ExtendedRedshiftTest
class AWSCredentialsInUriIntegrationSuite extends IntegrationSuiteBase {

  override protected val tempDir: String = {
    // Embed the AWS credentials into the userinfo component of the scratch-space URI
    // (e.g. s3n://ACCESS_KEY:SECRET_KEY@bucket/path/) rather than the Hadoop configuration.
    val uri = new URI(AWS_S3_SCRATCH_SPACE + randomSuffix + "/")
    new URI(
      uri.getScheme,
      s"$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY",
      uri.getHost,
      uri.getPort,
      uri.getPath,
      uri.getQuery,
      uri.getFragment).toString
  }


// Override this method so that we do not set the credentials in sc.hadoopConf.
override def beforeAll(): Unit = {
assert(tempDir.contains("AKIA"), "tempdir did not contain AWS credentials")
assert(!AWS_SECRET_ACCESS_KEY.contains("/"), "AWS secret key should not contain slash")
sc = new SparkContext("local", getClass.getSimpleName)
conn = DefaultJDBCWrapper.getConnector(None, jdbcUrl, None)
}

test("roundtrip save and load") {
val df = sqlContext.createDataFrame(sc.parallelize(Seq(Row(1)), 1),
StructType(StructField("foo", IntegerType) :: Nil))
testRoundtripSaveAndLoad(s"roundtrip_save_and_load_$randomSuffix", df)
}
}