[SC-5599] Inline spark-redshift sources into databricks/spark
This patch ports `spark-redshift` as of databricks/spark-avro@b01a034 and updates it to run with Spark 2.1.0.

I didn't make any attempt to clean up the library, remove dead code, or modernize its dependencies. Each of these tasks will take some time, and I'm hoping to delegate them once we have the infrastructure pieces in place.

Credentials are stored securely in Jenkins. If you need a copy of them to do local development, talk to Josh and he'll share them via ZeroBin (this will be done via LastPass in the future).

Author: Josh Rosen <joshrosen@databricks.com>

Closes apache#171 from JoshRosen/add-spark-redshift.
JoshRosen committed Jan 17, 2017
1 parent 227ff64 commit f9c9cd8
Showing 54 changed files with 6,128 additions and 6 deletions.
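For context, the snippet below is a minimal usage sketch of the ported data source; it is not part of this patch. It assumes the data source is registered under its package name and takes the url/dbtable/tempdir options, as in the upstream spark-redshift library; the object name, cluster endpoint, table name, and S3 bucket are placeholders.

import org.apache.spark.sql.SparkSession

object RedshiftReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("spark-redshift sketch").master("local[*]").getOrCreate()

    // S3 credentials for the temp directory; the integration suite in this patch instead
    // encodes them directly into the tempdir URI (s3n://KEY:SECRET@bucket/path).
    spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", sys.env("AWS_ACCESS_KEY_ID"))
    spark.sparkContext.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", sys.env("AWS_SECRET_ACCESS_KEY"))

    // Read a Redshift table by unloading it through the S3 tempdir; writes use the same
    // options via df.write.format("com.databricks.spark.redshift").
    val df = spark.read
      .format("com.databricks.spark.redshift")
      .option("url", "jdbc:redshift://example-cluster:5439/dev?user=user&password=pass") // placeholder endpoint
      .option("dbtable", "my_table")                                                     // or option("query", ...)
      .option("tempdir", "s3n://my-bucket/tmp/spark-redshift/")                          // placeholder scratch space
      .load()

    df.show()
  }
}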
@@ -0,0 +1,19 @@
/*
* Copyright (C) 2016 Databricks, Inc.
*
* Portions of this software incorporate or are derived from software contained within Apache Spark,
* and this modified software differs from the Apache Spark software provided under the Apache
* License, Version 2.0, a copy of which you may obtain at
* http://www.apache.org/licenses/LICENSE-2.0
*/

package org.apache.spark.tags;

import java.lang.annotation.*;

import org.scalatest.TagAnnotation;

@TagAnnotation
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.METHOD, ElementType.TYPE})
public @interface ExtendedRedshiftTest { }
1 change: 1 addition & 0 deletions dev/.rat-excludes
@@ -103,3 +103,4 @@ org.apache.spark.scheduler.ExternalClusterManager
org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
spark-warehouse
structured-streaming/*
install-redshift-jdbc.sh
21 changes: 21 additions & 0 deletions dev/install-redshift-jdbc.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Download the Amazon Redshift JDBC driver and install it into the local Maven
# repository so that the redshift-integration-tests module can resolve it.

set -e

SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"

cd /tmp

VERSION='1.1.7.1007'
FILENAME="RedshiftJDBC4-$VERSION.jar"

wget "https://s3.amazonaws.com/redshift-downloads/drivers/$FILENAME"

"$SPARK_ROOT_DIR/build/mvn" install:install-file \
  -Dfile="$FILENAME" \
  -DgroupId=com.amazonaws \
  -DartifactId=redshift.jdbc4 \
  -Dversion="$VERSION" \
  -Dpackaging=jar

5 changes: 4 additions & 1 deletion dev/run-tests.py
@@ -111,7 +111,8 @@ def determine_modules_to_test(changed_modules):
>>> x = [x.name for x in determine_modules_to_test([modules.sql])]
>>> x # doctest: +NORMALIZE_WHITESPACE
['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'sql-kafka-0-8', 'examples',
'hive-thriftserver', 'pyspark-sql', 'sparkr', 'pyspark-mllib', 'pyspark-ml']
'hive-thriftserver', 'pyspark-sql', 'redshift', 'sparkr', 'pyspark-mllib',
'redshift-integration-tests', 'pyspark-ml']
"""
modules_to_test = set()
for module in changed_modules:
@@ -512,6 +513,8 @@ def main():
test_env = "amplab_jenkins"
# add path for Python3 in Jenkins if we're calling from a Jenkins machine
os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
# Install Redshift JDBC
run_cmd([os.path.join(SPARK_HOME, "dev", "install-redshift-jdbc.sh")])
else:
# else we're running locally and can use local settings
build_tool = "sbt"
25 changes: 25 additions & 0 deletions dev/sparktestsupport/modules.py
@@ -168,6 +168,31 @@ def __hash__(self):
]
)

redshift = Module(
name="redshift",
dependencies=[avro, sql],
source_file_regexes=[
"external/redshift",
],
sbt_test_goals=[
"redshift/test",
],
test_tags=[
"org.apache.spark.tags.ExtendedRedshiftTest"
],
)

redshift_integration_tests = Module(
name="redshift-integration-tests",
dependencies=[redshift],
source_file_regexes=[
"external/redshift-integration-tests",
],
sbt_test_goals=[
"redshift-integration-tests/test",
],
)

sql_kafka = Module(
name="sql-kafka-0-10",
dependencies=[sql],
144 changes: 144 additions & 0 deletions external/redshift-integration-tests/pom.xml
@@ -0,0 +1,144 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
/*
* Copyright (C) 2016 Databricks, Inc.
*
* Portions of this software incorporate or are derived from software contained within Apache Spark,
* and this modified software differs from the Apache Spark software provided under the Apache
* License, Version 2.0, a copy of which you may obtain at
* http://www.apache.org/licenses/LICENSE-2.0
*/
-->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.1.0</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<groupId>com.databricks</groupId>
<artifactId>spark-redshift-integration-tests_2.11</artifactId>
<properties>
<sbt.project.name>redshift-integration-tests</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Spark Redshift Integration Tests</name>
<url>http://spark.apache.org/</url>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-avro_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-redshift_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-redshift_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<!-- The AWS Java SDK version here should match (or be below) the version used in DBC images -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-core</artifactId>
<version>1.9.40</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>1.9.40</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-sts</artifactId>
<version>1.9.40</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.eclipsesource.minimal-json</groupId>
<artifactId>minimal-json</artifactId>
<version>0.9.4</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<!-- This dependency is installed using ./dev/install-redshift-jdbc.sh -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>redshift.jdbc4</artifactId>
<version>1.1.7.1007</version>
<type>jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-tags_${scala.binary.version}</artifactId>
<type>test-jar</type>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
</build>
</project>
@@ -0,0 +1,52 @@
/*
* Copyright (C) 2016 Databricks, Inc.
*
* Portions of this software incorporate or are derived from software contained within Apache Spark,
* and this modified software differs from the Apache Spark software provided under the Apache
* License, Version 2.0, a copy of which you may obtain at
* http://www.apache.org/licenses/LICENSE-2.0
*/

package com.databricks.spark.redshift

import java.net.URI

import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.tags.ExtendedRedshiftTest

/**
* This suite performs basic integration tests where the AWS credentials have been
* encoded into the tempdir URI rather than being set in the Hadoop configuration.
*/
@ExtendedRedshiftTest
class AWSCredentialsInUriIntegrationSuite extends IntegrationSuiteBase {

  override protected val tempDir: String = {
    // Embed the AWS credentials into the userinfo component of the scratch-space URI
    // (e.g. s3n://ACCESS_KEY:SECRET_KEY@bucket/path/) rather than the Hadoop configuration.
    val uri = new URI(AWS_S3_SCRATCH_SPACE + randomSuffix + "/")
    new URI(
      uri.getScheme,
      s"$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY",
      uri.getHost,
      uri.getPort,
      uri.getPath,
      uri.getQuery,
      uri.getFragment).toString
  }


// Override this method so that we do not set the credentials in sc.hadoopConf.
override def beforeAll(): Unit = {
assert(tempDir.contains("AKIA"), "tempdir did not contain AWS credentials")
assert(!AWS_SECRET_ACCESS_KEY.contains("/"), "AWS secret key should not contain slash")
sc = new SparkContext("local", getClass.getSimpleName)
conn = DefaultJDBCWrapper.getConnector(None, jdbcUrl, None)
}

test("roundtrip save and load") {
val df = sqlContext.createDataFrame(sc.parallelize(Seq(Row(1)), 1),
StructType(StructField("foo", IntegerType) :: Nil))
testRoundtripSaveAndLoad(s"roundtrip_save_and_load_$randomSuffix", df)
}
}