From 5de38320df58a60146a5208c0fc67fde4068506d Mon Sep 17 00:00:00 2001
From: Yash Sharma
Date: Fri, 24 Mar 2017 14:42:10 +1100
Subject: [PATCH 1/3] Hoodie operability with S3

---
 README.md                                    |  1 +
 docs/s3_filesystem.md                        | 47 +++++++++++++++++
 .../io/storage/HoodieWrapperFileSystem.java  |  3 +-
 .../src/test/java/HoodieClientExample.java   | 51 +++++++++++++++----
 4 files changed, 91 insertions(+), 11 deletions(-)
 create mode 100644 docs/s3_filesystem.md

diff --git a/README.md b/README.md
index 89edb4b60021..7d298971a1f0 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+# Hoodie
 Hoodie manages storage of large analytical datasets on [HDFS](http://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) and serve them out via two types of tables
 
  * **Read Optimized Table** - Provides excellent query performance via purely columnar storage (e.g. [Parquet](https://parquet.apache.org/))
diff --git a/docs/s3_filesystem.md b/docs/s3_filesystem.md
new file mode 100644
index 000000000000..c1bdd2fb1dae
--- /dev/null
+++ b/docs/s3_filesystem.md
@@ -0,0 +1,47 @@
+---
+title: S3 Filesystem (experimental)
+keywords: sql hive s3 spark presto
+sidebar: mydoc_sidebar
+permalink: s3_hoodie.html
+toc: false
+summary: In this page, we go over how to configure Hoodie with the S3 filesystem.
+---
+Hoodie works with HDFS by default. Experimental work on Hoodie-S3 compatibility is in progress.
+
+## S3 configs
+
+Add the required configs to your core-site.xml, from where Hoodie can fetch them. Replace the `fs.defaultFS` value with your S3 bucket name, and Hoodie should be able to read from and write to the bucket.
+
+```
+<property>
+  <name>fs.defaultFS</name>
+  <value>s3://ysharma</value>
+</property>
+
+<property>
+  <name>fs.s3.impl</name>
+  <value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value>
+</property>
+
+<property>
+  <name>fs.s3.awsAccessKeyId</name>
+  <value>AWS_KEY</value>
+</property>
+
+<property>
+  <name>fs.s3.awsSecretAccessKey</name>
+  <value>AWS_SECRET</value>
+</property>
+
+<property>
+  <name>fs.s3n.awsAccessKeyId</name>
+  <value>AWS_KEY</value>
+</property>
+
+<property>
+  <name>fs.s3n.awsSecretAccessKey</name>
+  <value>AWS_SECRET</value>
+</property>
+```
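Once the properties above are in place, a quick probe of the Hadoop filesystem API confirms that the bucket resolves before any Hoodie job runs. A minimal sketch, assuming core-site.xml is on the classpath; the class name and bucket are illustrative, not part of the patch:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative probe: verifies that fs.defaultFS and the S3 credentials in
// core-site.xml are picked up by the Hadoop FileSystem machinery.
public class S3ConfigProbe {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration(); // loads core-site.xml from the classpath
    FileSystem fs = FileSystem.get(conf);
    System.out.println("default FS: " + fs.getUri());              // expect s3://<your-bucket>
    System.out.println("root exists: " + fs.exists(new Path("/"))); // simple round trip to S3
  }
}
```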
diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java
index 64034b4d24d0..d413fc5c38da 100644
--- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java
+++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java
@@ -49,9 +49,10 @@ public class HoodieWrapperFileSystem extends FileSystem {
   public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
 
   static {
-    SUPPORT_SCHEMES = new HashSet<>(2);
+    SUPPORT_SCHEMES = new HashSet<>();
     SUPPORT_SCHEMES.add("file");
     SUPPORT_SCHEMES.add("hdfs");
+    SUPPORT_SCHEMES.add("s3");
   }
 
   private ConcurrentMap<String, SizeAwareFSDataOutputStream> openStreams =
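SUPPORT_SCHEMES is a whitelist of the filesystems the wrapper knows how to wrap, so the single added entry above is what unlocks `s3://` paths. A rough sketch of that kind of guard, purely illustrative; the exact check inside HoodieWrapperFileSystem may be structured differently:

```java
// Hypothetical guard, not the actual Hoodie code: rejects any URI whose
// scheme is not in the SUPPORT_SCHEMES whitelist ("file", "hdfs", "s3").
private static void assertSupportedScheme(java.net.URI uri) {
  if (!SUPPORT_SCHEMES.contains(uri.getScheme())) {
    throw new IllegalArgumentException(
        "Unsupported scheme " + uri.getScheme() + "; supported: " + SUPPORT_SCHEMES);
  }
}
```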
diff --git a/hoodie-client/src/test/java/HoodieClientExample.java b/hoodie-client/src/test/java/HoodieClientExample.java
index eb7e56f707c5..711b4bb04963 100644
--- a/hoodie-client/src/test/java/HoodieClientExample.java
+++ b/hoodie-client/src/test/java/HoodieClientExample.java
@@ -15,14 +15,20 @@
  */
 
 import com.uber.hoodie.HoodieWriteClient;
-import com.uber.hoodie.common.table.HoodieTableMetaClient;
-import com.uber.hoodie.common.util.FSUtils;
-import com.uber.hoodie.config.HoodieWriteConfig;
 import com.uber.hoodie.common.HoodieTestDataGenerator;
 import com.uber.hoodie.common.model.HoodieRecord;
+import com.uber.hoodie.common.table.HoodieTableMetaClient;
+import com.uber.hoodie.common.util.FSUtils;
 import com.uber.hoodie.config.HoodieIndexConfig;
+import com.uber.hoodie.config.HoodieWriteConfig;
 import com.uber.hoodie.index.HoodieIndex;
-
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.BasicParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.spark.SparkConf;
@@ -41,8 +47,34 @@ public class HoodieClientExample {
 
   private static Logger logger = LogManager.getLogger(HoodieClientExample.class);
 
+  private static final String DEFAULT_TABLE_PATH = "file:///tmp/hoodie/sample-table";
+  private static final String DEFAULT_TABLE_NAME = "sample-table";
+
   public static void main(String[] args) throws Exception {
-    String tablePath = args.length == 1 ? args[0] : "file:///tmp/hoodie/sample-table";
+    Options options = new Options();
+    Option path = new Option("p", "table-path", true, "input table path");
+    path.setRequired(false);
+    options.addOption(path);
+
+    Option name = new Option("n", "table-name", true, "input table name");
+    name.setRequired(false);
+    options.addOption(name);
+
+    CommandLineParser parser = new BasicParser();
+    HelpFormatter formatter = new HelpFormatter();
+    CommandLine cmd;
+
+    try {
+      cmd = parser.parse(options, args);
+    } catch (ParseException e) {
+      System.out.println(e.getMessage());
+      formatter.printHelp("HoodieClientExample", options);
+      System.exit(1);
+      return;
+    }
+
+    String inputTablePath = cmd.getOptionValue("table-path", DEFAULT_TABLE_PATH);
+    String inputTableName = cmd.getOptionValue("table-name", DEFAULT_TABLE_NAME);
 
     HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
@@ -54,16 +86,15 @@ public static void main(String[] args) throws Exception {
 
     // generate some records to be loaded in.
     HoodieWriteConfig cfg =
-        HoodieWriteConfig.newBuilder().withPath(tablePath)
+        HoodieWriteConfig.newBuilder().withPath(inputTablePath)
             .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
-            .forTable("sample-table").withIndexConfig(
+            .forTable(inputTableName).withIndexConfig(
                 HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
             .build();
     Properties properties = new Properties();
-    properties.put(HoodieWriteConfig.TABLE_NAME, "sample-table");
+    properties.put(HoodieWriteConfig.TABLE_NAME, inputTableName);
     HoodieTableMetaClient
-        .initializePathAsHoodieDataset(FSUtils.getFs(), tablePath,
-            properties);
+        .initializePathAsHoodieDataset(FSUtils.getFs(), inputTablePath, properties);
     HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
 
     /**
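With the options declared in the patch above, short and long flags are interchangeable and the default value applies when a flag is omitted. A self-contained illustration of that commons-cli behaviour; the class name and paths here are ours, not part of the patch:

```java
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;

public class CliDemo {
  public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option("p", "table-path", true, "input table path"));

    // "-p" and "--table-path" populate the same option.
    CommandLine cmd = new BasicParser().parse(options, new String[]{"-p", "s3://bucket/tbl"});
    System.out.println(cmd.getOptionValue("table-path", "file:///tmp/hoodie/sample-table"));
    // prints: s3://bucket/tbl

    // The second argument of getOptionValue supplies the fallback.
    CommandLine none = new BasicParser().parse(options, new String[]{});
    System.out.println(none.getOptionValue("table-path", "file:///tmp/hoodie/sample-table"));
    // prints: file:///tmp/hoodie/sample-table
  }
}
```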
From 657da0955d2ed0a77e0e7be6516dc21aaeb77316 Mon Sep 17 00:00:00 2001
From: Yash Sharma
Date: Mon, 27 Mar 2017 17:29:00 +1100
Subject: [PATCH 2/3] improve documentation

---
 docs/configurations.md                       |  3 ++
 docs/s3_filesystem.md                        |  8 ++++
 hoodie-client/pom.xml                        |  5 ++
 .../src/test/java/HoodieClientExample.java   | 47 ++++++-------------
 .../com/uber/hoodie/common/util/FSUtils.java |  1 -
 5 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/docs/configurations.md b/docs/configurations.md
index cf5c2d7b85e7..7042d8237529 100644
--- a/docs/configurations.md
+++ b/docs/configurations.md
@@ -76,4 +76,7 @@ summary: "Here we list all possible configurations and what they mean"
 - [usePrefix](#usePrefix) () <br/>
 Standard prefix for all metrics
 
+- [S3Configs](s3_hoodie.html) (Hoodie S3 Configs) <br/>
+Configurations required for S3 and Hoodie interoperability.
+
 {% include callout.html content="Hoodie is a young project. A lot of pluggable interfaces and configurations to support diverse workloads need to be created. Get involved [here](https://github.com/uber/hoodie)" type="info" %}
diff --git a/docs/s3_filesystem.md b/docs/s3_filesystem.md
index c1bdd2fb1dae..faa424033431 100644
--- a/docs/s3_filesystem.md
+++ b/docs/s3_filesystem.md
@@ -10,6 +10,10 @@ Hoodie works with HDFS by default. Experimental work on Hoodie-S3 compatibility is in progress.
 
 ## S3 configs
 
+There are two configurations required for Hoodie-S3 compatibility:
+- Adding AWS Credentials for Hoodie
+- Adding required Jars to classpath
+
 Add the required configs to your core-site.xml, from where Hoodie can fetch them. Replace the `fs.defaultFS` value with your S3 bucket name, and Hoodie should be able to read from and write to the bucket.
 
 ```
@@ -44,4 +48,8 @@ Add the required configs to your core-site.xml, from where Hoodie can fetch them.
 </property>
 ```
 
+AWS hadoop libraries to add to your classpath:
+ - com.amazonaws:aws-java-sdk:1.10.34
+ - org.apache.hadoop:hadoop-aws:2.7.3
+
diff --git a/hoodie-client/pom.xml b/hoodie-client/pom.xml
index 617dc9c4bf5f..11e6b85a30e6 100644
--- a/hoodie-client/pom.xml
+++ b/hoodie-client/pom.xml
@@ -118,6 +118,11 @@
     <dependency>
       <groupId>io.dropwizard.metrics</groupId>
      <artifactId>metrics-core</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.beust</groupId>
+      <artifactId>jcommander</artifactId>
+      <version>1.48</version>
+    </dependency>
   </dependencies>
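The jcommander dependency added above drives the rewritten argument parsing in HoodieClientExample below: JCommander populates annotated fields directly from argv. A standalone sketch of the pattern, with an illustrative demo class of our own:

```java
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;

// Illustrative only: shows how JCommander fills @Parameter fields from args.
public class JCommanderDemo {
  @Parameter(names = {"--table-path", "-p"}, description = "path for the sample table")
  private String tablePath = "file:///tmp/hoodie/sample-table"; // default when the flag is absent

  public static void main(String[] args) {
    JCommanderDemo demo = new JCommanderDemo();
    new JCommander(demo, args); // e.g. args = {"--table-path", "s3://bucket/table"}
    System.out.println(demo.tablePath);
  }
}
```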
diff --git a/hoodie-client/src/test/java/HoodieClientExample.java b/hoodie-client/src/test/java/HoodieClientExample.java
index 711b4bb04963..39724f67e488 100644
--- a/hoodie-client/src/test/java/HoodieClientExample.java
+++ b/hoodie-client/src/test/java/HoodieClientExample.java
@@ -14,6 +14,9 @@
  * limitations under the License.
  */
 
+
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.Parameter;
 import com.uber.hoodie.HoodieWriteClient;
 import com.uber.hoodie.common.HoodieTestDataGenerator;
 import com.uber.hoodie.common.model.HoodieRecord;
@@ -22,13 +25,6 @@
 import com.uber.hoodie.config.HoodieIndexConfig;
 import com.uber.hoodie.config.HoodieWriteConfig;
 import com.uber.hoodie.index.HoodieIndex;
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.BasicParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.ParseException;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.spark.SparkConf;
@@ -44,38 +40,23 @@
  */
 public class HoodieClientExample {
 
+  @Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table")
+  private String inputTablePath = "file:///tmp/hoodie/sample-table";
+
+  @Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table")
+  private String inputTableName = "sample-table";
 
   private static Logger logger = LogManager.getLogger(HoodieClientExample.class);
-  private static final String DEFAULT_TABLE_PATH = "file:///tmp/hoodie/sample-table";
-  private static final String DEFAULT_TABLE_NAME = "sample-table";
 
   public static void main(String[] args) throws Exception {
-    Options options = new Options();
-    Option path = new Option("p", "table-path", true, "input table path");
-    path.setRequired(false);
-    options.addOption(path);
-
-    Option name = new Option("n", "table-name", true, "input table name");
-    name.setRequired(false);
-    options.addOption(name);
-
-    CommandLineParser parser = new BasicParser();
-    HelpFormatter formatter = new HelpFormatter();
-    CommandLine cmd;
-
-    try {
-      cmd = parser.parse(options, args);
-    } catch (ParseException e) {
-      System.out.println(e.getMessage());
-      formatter.printHelp("HoodieClientExample", options);
-      System.exit(1);
-      return;
-    }
-
-    String inputTablePath = cmd.getOptionValue("table-path", DEFAULT_TABLE_PATH);
-    String inputTableName = cmd.getOptionValue("table-name", DEFAULT_TABLE_NAME);
+    HoodieClientExample cli = new HoodieClientExample();
+    new JCommander(cli, args);
+    cli.run();
+  }
+
+  public void run() throws Exception {
 
     HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
 
     SparkConf sparkConf = new SparkConf().setAppName("hoodie-client-example");
diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java
index 77448d058910..2f554aecb81a 100644
--- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java
+++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java
@@ -18,7 +18,6 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
-import com.uber.hoodie.common.table.HoodieTimeline;
 import com.uber.hoodie.common.table.log.HoodieLogFile;
 import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import com.uber.hoodie.exception.HoodieIOException;

From 465f58c58837c0bb405e63e3eca818d957918f67 Mon Sep 17 00:00:00 2001
From: Yash Sharma
Date: Tue, 28 Mar 2017 10:30:50 +1100
Subject: [PATCH 3/3] formatting for docs

---
 docs/s3_filesystem.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/docs/s3_filesystem.md b/docs/s3_filesystem.md
index faa424033431..adb1cefcb5a4 100644
--- a/docs/s3_filesystem.md
+++ b/docs/s3_filesystem.md
@@ -8,12 +8,15 @@ summary: In this page, we go over how to configure Hoodie with the S3 filesystem.
 ---
 Hoodie works with HDFS by default. Experimental work on Hoodie-S3 compatibility is in progress.
 
-## S3 configs
+## AWS configs
 
 There are two configurations required for Hoodie-S3 compatibility:
+
 - Adding AWS Credentials for Hoodie
 - Adding required Jars to classpath
 
+### AWS Credentials
+
 Add the required configs to your core-site.xml, from where Hoodie can fetch them. Replace the `fs.defaultFS` value with your S3 bucket name, and Hoodie should be able to read from and write to the bucket.
@@ -48,7 +51,10 @@ Add the required configs to your core-site.xml, from where Hoodie can fetch them.
 ```
 
-AWS hadoop libraries to add to your classpath:
+### AWS Libs
+
+AWS Hadoop libraries to add to the classpath:
+
  - com.amazonaws:aws-java-sdk:1.10.34
  - org.apache.hadoop:hadoop-aws:2.7.3
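With the `s3` scheme whitelisted, credentials in core-site.xml, and the two jars above on the classpath, the sample client can target a bucket directly. A hedged sketch reusing the builder API from HoodieClientExample; the bucket path is illustrative:

```java
// Assumes the AWS jars above are on the classpath and core-site.xml holds the
// credentials. The bucket and path are illustrative.
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
    .withPath("s3://my-bucket/hoodie/sample-table")
    .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
    .withParallelism(2, 2)
    .forTable("sample-table")
    .withIndexConfig(
        HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
    .build();
```

Equivalently, the rewritten example accepts the bucket from the command line, e.g. `--table-path s3://my-bucket/hoodie/sample-table`.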