diff --git a/README.md b/README.md
index 89edb4b60021..7d298971a1f0 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+# Hoodie
 Hoodie manages storage of large analytical datasets on [HDFS](http://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) and serves them out via two types of tables
 
 * **Read Optimized Table** - Provides excellent query performance via purely columnar storage (e.g. [Parquet](https://parquet.apache.org/))
diff --git a/docs/configurations.md b/docs/configurations.md
index cf5c2d7b85e7..7042d8237529 100644
--- a/docs/configurations.md
+++ b/docs/configurations.md
@@ -76,4 +76,7 @@ summary: "Here we list all possible configurations and what they mean"
 - [usePrefix](#usePrefix) ()
   Standard prefix for all metrics
 
+- [S3Configs](s3_hoodie.html) (Hoodie S3 Configs)
+  Configurations required for S3 and Hoodie interoperability.
+
 {% include callout.html content="Hoodie is a young project. A lot of pluggable interfaces and configurations to support diverse workloads need to be created. Get involved [here](https://github.com/uber/hoodie)" type="info" %}
diff --git a/docs/s3_filesystem.md b/docs/s3_filesystem.md
new file mode 100644
index 000000000000..adb1cefcb5a4
--- /dev/null
+++ b/docs/s3_filesystem.md
@@ -0,0 +1,61 @@
+---
+title: S3 Filesystem (experimental)
+keywords: sql hive s3 spark presto
+sidebar: mydoc_sidebar
+permalink: s3_hoodie.html
+toc: false
+summary: In this page, we go over how to configure Hoodie with the S3 filesystem.
+---
+Hoodie works with HDFS by default. Hoodie-S3 compatibility is experimental work in progress.
+
+## AWS configs
+
+There are two configurations required for Hoodie-S3 compatibility:
+
+- Adding AWS credentials for Hoodie
+- Adding the required jars to the classpath
+
+### AWS Credentials
+
+Add the required configs to your core-site.xml, from where Hoodie can fetch them. Replace `fs.defaultFS` with your S3 bucket name, and Hoodie should be able to read from and write to the bucket.
+
+```
+<property>
+  <name>fs.defaultFS</name>
+  <value>s3://ysharma</value>
+</property>
+
+<property>
+  <name>fs.s3.impl</name>
+  <value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value>
+</property>
+
+<property>
+  <name>fs.s3.awsAccessKeyId</name>
+  <value>AWS_KEY</value>
+</property>
+
+<property>
+  <name>fs.s3.awsSecretAccessKey</name>
+  <value>AWS_SECRET</value>
+</property>
+
+<property>
+  <name>fs.s3n.awsAccessKeyId</name>
+  <value>AWS_KEY</value>
+</property>
+
+<property>
+  <name>fs.s3n.awsSecretAccessKey</name>
+  <value>AWS_SECRET</value>
+</property>
+```
+
+### AWS Libs
+
+AWS Hadoop libraries to add to the classpath (one way to pull them in is shown below):
+
+ - com.amazonaws:aws-java-sdk:1.10.34
+ - org.apache.hadoop:hadoop-aws:2.7.3
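+
+For example, if you run Hoodie via Spark, one way to get these onto the classpath is at submit time (a sketch; the versions are the ones listed above, adjust them to match your Hadoop distribution):
+
+```
+spark-submit --packages com.amazonaws:aws-java-sdk:1.10.34,org.apache.hadoop:hadoop-aws:2.7.3 ...
+```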
diff --git a/hoodie-client/pom.xml b/hoodie-client/pom.xml
index 617dc9c4bf5f..11e6b85a30e6 100644
--- a/hoodie-client/pom.xml
+++ b/hoodie-client/pom.xml
@@ -118,6 +118,11 @@
     <dependency>
       <groupId>io.dropwizard.metrics</groupId>
       <artifactId>metrics-core</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.beust</groupId>
+      <artifactId>jcommander</artifactId>
+      <version>1.48</version>
+    </dependency>
diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java
index 64034b4d24d0..d413fc5c38da 100644
--- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java
+++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java
@@ -49,9 +49,10 @@ public class HoodieWrapperFileSystem extends FileSystem {
   public static final String HOODIE_SCHEME_PREFIX = "hoodie-";
 
   static {
-    SUPPORT_SCHEMES = new HashSet<>(2);
+    SUPPORT_SCHEMES = new HashSet<>();
     SUPPORT_SCHEMES.add("file");
     SUPPORT_SCHEMES.add("hdfs");
+    SUPPORT_SCHEMES.add("s3");
   }
 
   private ConcurrentMap<String, SizeAwareFSDataOutputStream> openStreams =
diff --git a/hoodie-client/src/test/java/HoodieClientExample.java b/hoodie-client/src/test/java/HoodieClientExample.java
index eb7e56f707c5..39724f67e488 100644
--- a/hoodie-client/src/test/java/HoodieClientExample.java
+++ b/hoodie-client/src/test/java/HoodieClientExample.java
@@ -14,15 +14,17 @@
  * limitations under the License.
  */
+
+import com.beust.jcommander.JCommander;
+import com.beust.jcommander.Parameter;
 import com.uber.hoodie.HoodieWriteClient;
-import com.uber.hoodie.common.table.HoodieTableMetaClient;
-import com.uber.hoodie.common.util.FSUtils;
-import com.uber.hoodie.config.HoodieWriteConfig;
 import com.uber.hoodie.common.HoodieTestDataGenerator;
 import com.uber.hoodie.common.model.HoodieRecord;
+import com.uber.hoodie.common.table.HoodieTableMetaClient;
+import com.uber.hoodie.common.util.FSUtils;
 import com.uber.hoodie.config.HoodieIndexConfig;
+import com.uber.hoodie.config.HoodieWriteConfig;
 import com.uber.hoodie.index.HoodieIndex;
-
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.spark.SparkConf;
@@ -38,12 +40,23 @@
  */
 public class HoodieClientExample {
 
+  @Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table")
+  private String inputTablePath = "file:///tmp/hoodie/sample-table";
+
+  @Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table")
+  private String inputTableName = "sample-table";
 
   private static Logger logger = LogManager.getLogger(HoodieClientExample.class);
 
+
   public static void main(String[] args) throws Exception {
-    String tablePath = args.length == 1 ? args[0] : "file:///tmp/hoodie/sample-table";
+    HoodieClientExample cli = new HoodieClientExample();
+    new JCommander(cli, args);
+    cli.run();
+  }
+
+  public void run() throws Exception {
     HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
 
     SparkConf sparkConf = new SparkConf().setAppName("hoodie-client-example");
@@ -54,16 +67,15 @@ public static void main(String[] args) throws Exception {
 
     // generate some records to be loaded in.
     HoodieWriteConfig cfg =
-        HoodieWriteConfig.newBuilder().withPath(tablePath)
+        HoodieWriteConfig.newBuilder().withPath(inputTablePath)
             .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
-            .forTable("sample-table").withIndexConfig(
+            .forTable(inputTableName).withIndexConfig(
                 HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
             .build();
     Properties properties = new Properties();
-    properties.put(HoodieWriteConfig.TABLE_NAME, "sample-table");
+    properties.put(HoodieWriteConfig.TABLE_NAME, inputTableName);
     HoodieTableMetaClient
-        .initializePathAsHoodieDataset(FSUtils.getFs(), tablePath,
-            properties);
+        .initializePathAsHoodieDataset(FSUtils.getFs(), inputTablePath, properties);
     HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
 
     /**
diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java
index 77448d058910..2f554aecb81a 100644
--- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java
+++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java
@@ -18,7 +18,6 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
-import com.uber.hoodie.common.table.HoodieTimeline;
 import com.uber.hoodie.common.table.log.HoodieLogFile;
 import com.uber.hoodie.common.table.timeline.HoodieInstant;
 import com.uber.hoodie.exception.HoodieIOException;
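
With the JCommander flags above, HoodieClientExample takes its table path and name on the command line. A sketch of an invocation (the jar path and bucket name are hypothetical; an S3 table path also assumes the core-site.xml settings from docs/s3_filesystem.md):

    spark-submit --class HoodieClientExample path/to/hoodie-client-tests.jar \
      --table-path s3://my-bucket/hoodie/sample-table \
      --table-name sample-table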