Add partitioned sources for Parquet thrift / scrooge #1590

Merged · Sep 16, 2016 · 10 commits

Changes from 9 commits

@@ -15,17 +15,10 @@
package com.twitter.scalding
package typed

import java.util.Properties
import java.io.{ InputStream, OutputStream }

import cascading.scheme.Scheme
import cascading.scheme.hadoop.TextDelimited
import cascading.scheme.local.{ TextDelimited => LocalTextDelimited }
import cascading.tap.{ Tap, SinkMode }
import cascading.tap.hadoop.{ Hfs, PartitionTap }
import cascading.tap.hadoop.PartitionTap
import cascading.tap.local.{ FileTap, PartitionTap => LocalPartitionTap }
import cascading.tap.partition.Partition
import cascading.tuple.{ Fields, Tuple, TupleEntry }
import cascading.tap.{ SinkMode, Tap }
import cascading.tuple.Fields

/**
* Trait to assist with creating partitioned sources.
5 changes: 5 additions & 0 deletions scalding-parquet-fixtures/src/test/resources/test.thrift
@@ -5,3 +5,8 @@ struct Name {
1: required string first_name,
2: optional string last_name
}

struct Address {
1: string street,
2: required string zip
}
@@ -2,11 +2,13 @@ package com.twitter.scalding.parquet.scrooge

import cascading.scheme.Scheme
import com.twitter.scalding._
import com.twitter.scalding.parquet.thrift.ParquetThriftBase
import com.twitter.scalding.parquet.thrift.ParquetThriftBaseFileSource
import com.twitter.scalding.source.{ DailySuffixSource, HourlySuffixSource }
import com.twitter.scrooge.ThriftStruct

trait ParquetScrooge[T <: ThriftStruct] extends ParquetThriftBase[T] {
import scala.reflect.ClassTag

trait ParquetScrooge[T <: ThriftStruct] extends ParquetThriftBaseFileSource[T] {

override def hdfsScheme = {
// See docs in Parquet346ScroogeScheme
@@ -18,13 +20,13 @@ trait ParquetScrooge[T <: ThriftStruct] extends ParquetThriftBase[T] {

class DailySuffixParquetScrooge[T <: ThriftStruct](
path: String,
dateRange: DateRange)(implicit override val mf: Manifest[T])
dateRange: DateRange)(implicit override val ct: ClassTag[T])
extends DailySuffixSource(path, dateRange) with ParquetScrooge[T]

class HourlySuffixParquetScrooge[T <: ThriftStruct](
path: String,
dateRange: DateRange)(implicit override val mf: Manifest[T])
dateRange: DateRange)(implicit override val ct: ClassTag[T])
extends HourlySuffixSource(path, dateRange) with ParquetScrooge[T]

class FixedPathParquetScrooge[T <: ThriftStruct](paths: String*)(implicit override val mf: Manifest[T])
class FixedPathParquetScrooge[T <: ThriftStruct](paths: String*)(implicit override val ct: ClassTag[T])
extends FixedPathSource(paths: _*) with ParquetScrooge[T]
@@ -0,0 +1,50 @@
package com.twitter.scalding.parquet.scrooge

import _root_.cascading.scheme.Scheme
import com.twitter.scalding._
import com.twitter.scalding.parquet.thrift.ParquetThriftBase
import com.twitter.scalding.typed.{ PartitionSchemed, PartitionUtil }
import com.twitter.scrooge.ThriftStruct

import scala.reflect.ClassTag

/**
* Scalding source to read or write partitioned Parquet scrooge data.
*
* For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and
* `T` is the scrooge object. `P` must be either a String or a tuple of Strings.
* Below is an example.
* {{{
* val data: TypedPipe[MyScroogeObject] = ???
* data.map { obj =>
* ( (obj.country, obj.city), obj)
* }.write(PartitionedParquetScroogeSource[(String, String), MyScroogeObject](path, "%s/%s"))
* }}}
*
* For reading it produces a pair `(P, T)` where `P` is the partition data, `T` is the corresponding
* scrooge object. Below is an example.
* {{{
* val in: TypedPipe[((String, String), MyScroogeObject)] =
* TypedPipe.from( PartitionedParquetScroogeSource[(String, String), MyScroogeObject](path, "%s/%s") )
* }}}
Contributor

for reads, is it required that P be a String or a TupleN of Strings? Seems like it'd have to be?

Collaborator Author

Can also be a String (if it's just "%s"). If you look at the unit tests, P ends up being a String there.

Collaborator Author

Wonder if we should clarify that in the docs? Could add an example for the %s case since it isn't a tuple?

Contributor

So P must be either a String or a tuple of Strings, right? That'd be good to clarify.

Collaborator Author

Ok, I'll update the docs to indicate this and also include an example
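
For what it's worth, the single-partition-field case being asked about might look like this (a sketch mirroring the scaladoc example above; MyScroogeObject and its country field are illustrative, not code from this PR):

val data: TypedPipe[MyScroogeObject] = ???
// one partition field, so P is a plain String and the template is just "%s"
data.map { obj => (obj.country, obj) }
  .write(PartitionedParquetScroogeSource[String, MyScroogeObject](path, "%s"))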

Collaborator

is it possible, maybe a geek out, to have a macro that checks the format string at compile time?

this would allow it to be really clear that the string matches the tuple type?

Collaborator Author

I can look into this. Not super familiar with Scala macros so this can be a learning opportunity :-). Currently we do check this (at runtime though) - in TemplatePartition

case class TemplatePartition(partitionFields: Fields, template: String) extends Partition {
  assert(
    partitionFields.size == "%s".r.findAllIn(template).length,
    "Number of partition fields %s does not correspond to template (%s)".format(partitionFields, template))
...
}

Collaborator Author

So if you do something like:

val partitionSource = PartitionedParquetScroogeSource[String, Address](path, "%s/%s") // should be just %s

you get an error:
assertion failed: Number of partition fields '1' does not correspond to template (%s/%s)
java.lang.AssertionError: assertion failed: Number of partition fields '1' does not correspond to template (%s/%s)

Collaborator

yeah, it would be nice to just call the code cascading uses to verify it at compile time.

Collaborator Author

Ok, I'll dig into this

*
*/
case class PartitionedParquetScroogeSource[P, T <: ThriftStruct](path: String, template: String)(implicit val ct: ClassTag[T],
val valueSetter: TupleSetter[T], val valueConverter: TupleConverter[T], val partitionSetter: TupleSetter[P], val partitionConverter: TupleConverter[P])
extends FixedPathSource(path) with ParquetThriftBase[T] with PartitionSchemed[P, T] with Serializable {

override val fields = PartitionUtil.toFields(0, implicitly[TupleSetter[T]].arity)

assert(
fields.size == valueSetter.arity,
"The number of fields needs to be the same as the arity of the value setter")

// Create the underlying scheme and explicitly set the source, sink fields to be only the specified fields
override def hdfsScheme = {
val scroogeScheme = new Parquet346ScroogeScheme[T](this.config)
val scheme = HadoopSchemeInstance(scroogeScheme.asInstanceOf[Scheme[_, _, _, _, _]])
scheme.setSinkFields(fields)
scheme.setSourceFields(fields)
scheme
}
}
@@ -0,0 +1,65 @@
package com.twitter.scalding.parquet.scrooge

import java.io.File

import com.twitter.scalding._
import com.twitter.scalding.parquet.scrooge.thrift_scala.test.Address
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetReader

import org.scalatest.{ Matchers, WordSpec }

object PartitionedParquetScroogeTestSources {
val path = "/a/path"
val partitionSource = PartitionedParquetScroogeSource[String, Address](path, "%s")
}

class PartitionedParquetScroogeWriteJob(args: Args) extends Job(args) {
import PartitionedParquetScroogeTestSources._
val input = Seq(Address("123 Embarcadero", "94111"), Address("123 E 79th St", "10075"), Address("456 W 80th St", "10075"))

TypedPipe.from(input)
.map { case Address(street, zipcode) => (zipcode, Address(street, zipcode)) }
.write(partitionSource)
}

class PartitionedParquetScroogeSourceTests extends WordSpec with Matchers {
import PartitionedParquetScroogeTestSources._

def validate(path: Path, expectedAddresses: Address*) = {
val conf: Configuration = new Configuration
conf.set("parquet.thrift.converter.class", classOf[ScroogeRecordConverter[Address]].getName)
val parquetReader: ParquetReader[Address] =
ParquetReader.builder[Address](new ScroogeReadSupport[Address], path)
.withConf(conf)
.build()

Stream.continually(parquetReader.read).takeWhile(_ != null).toArray shouldBe expectedAddresses
}

"PartitionedParquetScroogeSource" should {
"write out partitioned scrooge objects" in {
var job: Job = null;
def buildJob(args: Args): Job = {
job = new PartitionedParquetScroogeWriteJob(args)
job
}
JobTest(buildJob(_))
.runHadoop
.finish()

val testMode = job.mode.asInstanceOf[HadoopTest]

val directory = new File(testMode.getWritePathFor(partitionSource))

directory.listFiles().map({ _.getName() }).toSet shouldBe Set("94111", "10075")

// check that the partitioning is done correctly by zipcode
validate(new Path(directory.getPath + "/94111/part-00000-00000-m-00000.parquet"),
Address("123 Embarcadero", "94111"))
validate(new Path(directory.getPath + "/10075/part-00000-00001-m-00000.parquet"),
Address("123 E 79th St", "10075"), Address("456 W 80th St", "10075"))
}
}
}
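
Concretely, with the "%s" template and the zip code as the partition value, the write job above should produce a layout along these lines under the job's write path (partition directories named by zip code; the part-file names are whatever the Hadoop/Cascading test run assigns, as asserted above):

<write path>/
  94111/part-00000-00000-m-00000.parquet   // Address("123 Embarcadero", "94111")
  10075/part-00000-00001-m-00000.parquet   // Address("123 E 79th St", "10075"), Address("456 W 80th St", "10075")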
@@ -29,16 +29,19 @@ import com.twitter.scalding.source.{ DailySuffixSource, HourlySuffixSource }
import java.io.Serializable
import org.apache.thrift.{ TBase, TFieldIdEnum }

import scala.reflect.ClassTag

object ParquetThrift extends Serializable {
type ThriftBase = TBase[_ <: TBase[_, _], _ <: TFieldIdEnum]
}

trait ParquetThriftBase[T] extends FileSource with SingleMappable[T] with TypedSink[T] with LocalTapSource with HasFilterPredicate with HasColumnProjection {
trait ParquetThriftBase[T] extends LocalTapSource with HasFilterPredicate with HasColumnProjection {

def mf: Manifest[T]
implicit def ct: ClassTag[T]

def config: ParquetValueScheme.Config[T] = {
val config = new ParquetValueScheme.Config[T].withRecordClass(mf.runtimeClass.asInstanceOf[Class[T]])
def config(implicit ct: ClassTag[T]): ParquetValueScheme.Config[T] = {
Collaborator

why are we accepting ct as a parameter if we already have it on line 40?

Collaborator Author

Shall remove this; I added it on one of my prior passes over the code when I was seeing some compile errors. Not needed anymore.

val clazz = implicitly[ClassTag[T]].runtimeClass.asInstanceOf[Class[T]]
Collaborator

can't we just do val clazz = ct.runtimeClass.asInstanceOf[Class[T]] here

Collaborator Author

Hmm yeah, that's cleaner.

val config = new ParquetValueScheme.Config[T].withRecordClass(clazz)
val configWithFp = withFilter match {
case Some(fp) => config.withFilterPredicate(fp)
case None => config
@@ -52,11 +55,13 @@ trait ParquetThriftBase[T] extends FileSource with SingleMappable[T] with TypedS

configWithProjection
}
}

trait ParquetThriftBaseFileSource[T] extends FileSource with ParquetThriftBase[T] with SingleMappable[T] with TypedSink[T] {
override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T])
}

trait ParquetThrift[T <: ParquetThrift.ThriftBase] extends ParquetThriftBase[T] {
trait ParquetThrift[T <: ParquetThrift.ThriftBase] extends ParquetThriftBaseFileSource[T] {

override def hdfsScheme = {
// See docs in Parquet346TBaseScheme
@@ -108,13 +113,13 @@ trait ParquetThrift[T <: ParquetThrift.ThriftBase] extends ParquetThriftBase[T]
*/
class DailySuffixParquetThrift[T <: ParquetThrift.ThriftBase](
path: String,
dateRange: DateRange)(implicit override val mf: Manifest[T])
dateRange: DateRange)(implicit override val ct: ClassTag[T])
extends DailySuffixSource(path, dateRange) with ParquetThrift[T]

class HourlySuffixParquetThrift[T <: ParquetThrift.ThriftBase](
path: String,
dateRange: DateRange)(implicit override val mf: Manifest[T])
dateRange: DateRange)(implicit override val ct: ClassTag[T])
extends HourlySuffixSource(path, dateRange) with ParquetThrift[T]

class FixedPathParquetThrift[T <: ParquetThrift.ThriftBase](paths: String*)(implicit override val mf: Manifest[T])
class FixedPathParquetThrift[T <: ParquetThrift.ThriftBase](paths: String*)(implicit override val ct: ClassTag[T])
extends FixedPathSource(paths: _*) with ParquetThrift[T]
@@ -0,0 +1,49 @@
package com.twitter.scalding.parquet.thrift

import cascading.scheme.Scheme
import com.twitter.scalding.typed.{ PartitionSchemed, PartitionUtil }
import com.twitter.scalding.{ FixedPathSource, HadoopSchemeInstance, TupleConverter, TupleSetter }

import scala.reflect.ClassTag

/**
* Scalding source to read or write partitioned Parquet thrift data.
*
* For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and
* `T` is the thrift object. `P` must be either a String or a tuple of Strings.
* Below is an example.
* {{{
* val data: TypedPipe[MyThriftObject] = ???
* data.map{ obj =>
* ( (obj.country, obj.city), obj)
* }.write(PartitionedParquetThriftSource[(String, String), MyThriftObject](path, "%s/%s"))
* }}}
*
* For reading it produces a pair `(P, T)` where `P` is the partition data, `T` is the corresponding
* thrift object. Below is an example.
* {{{
* val in: TypedPipe[((String, String), MyThriftObject)] =
* TypedPipe.from( PartitionedParquetThriftSource[(String, String), MyThriftObject](path, "%s/%s") )
* }}}
*
*/
case class PartitionedParquetThriftSource[P, T <: ParquetThrift.ThriftBase](path: String, template: String)(implicit val ct: ClassTag[T],
val valueSetter: TupleSetter[T], val valueConverter: TupleConverter[T], val partitionSetter: TupleSetter[P], val partitionConverter: TupleConverter[P])
extends FixedPathSource(path) with ParquetThriftBase[T] with PartitionSchemed[P, T] with Serializable {

override val fields = PartitionUtil.toFields(0, implicitly[TupleSetter[T]].arity)

assert(
fields.size == valueSetter.arity,
"The number of fields needs to be the same as the arity of the value setter")

// Create the underlying scheme and explicitly set the source, sink fields to be only the specified fields
override def hdfsScheme = {
// See docs in Parquet346TBaseScheme
val baseScheme = new Parquet346TBaseScheme[T](this.config)
val scheme = HadoopSchemeInstance(baseScheme.asInstanceOf[Scheme[_, _, _, _, _]])
scheme.setSinkFields(fields)
scheme.setSourceFields(fields)
scheme
}
}
@@ -0,0 +1,60 @@
package com.twitter.scalding.parquet.thrift

import java.io.File

import com.twitter.scalding._
import com.twitter.scalding.parquet.thrift_java.test.Address
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetReader
import org.apache.parquet.thrift.ThriftParquetReader

import org.scalatest.{ Matchers, WordSpec }

object PartitionedParquetThriftTestSources {
val path = "/a/path"
val partitionSource = PartitionedParquetThriftSource[String, Address](path, "%s")
}

class PartitionedParquetThriftWriteJob(args: Args) extends Job(args) {
import PartitionedParquetThriftTestSources._
val input = Seq(new Address("123 Embarcadero", "94111"), new Address("123 E 79th St", "10075"), new Address("456 W 80th St", "10075"))

TypedPipe.from(input)
.map { address => (address.getZip, address) }
.write(partitionSource)
}

class PartitionedParquetThriftSourceTests extends WordSpec with Matchers {
import PartitionedParquetThriftTestSources._

def validate(path: Path, expectedAddresses: Address*) = {
val parquetReader: ParquetReader[Address] =
ThriftParquetReader.build(path).withThriftClass(classOf[Address]).build()
Contributor

is this the TBase reader? should be able to use the scrooge reader here right?

Collaborator Author

Yeah this is the TBase reader. Can try and hook up the scrooge reader. Do you want to use the scrooge reader in both the partitioned thrift & partitioned scrooge tests?

Contributor

Sorry, I misread; I didn't realize this was the thrift test, not the scrooge test.

Collaborator Author

Ok, so I updated the scrooge test to use the scrooge reader. Left this one as is

Contributor

cool, can you update the docs to explain the requirements on P being a String or tuple of Strings? I guess that might not be true if you provide a different TupleSetter, but at least that part should be clarified too (that P and the TupleSetter need to match, which is not tracked by the type system)
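
To illustrate the contract being described here, a sketch (hypothetical paths and names; it relies on Scalding's default implicit TupleSetter/TupleConverter instances, just as the tests in this PR do):

// P = String: arity 1, so the template needs exactly one "%s"
val byZip = PartitionedParquetThriftSource[String, Address]("/base/path", "%s")

// P = (String, String): arity 2, so the template needs two "%s" segments
val byStateAndZip = PartitionedParquetThriftSource[(String, String), Address]("/base/path", "%s/%s")

// Nothing in the types ties P to the template; a mismatch only surfaces at runtime
// via the arity assertion in TemplatePartition discussed earlier.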

Stream.continually(parquetReader.read).takeWhile(_ != null).toArray shouldBe expectedAddresses
}

"PartitionedParquetThriftSource" should {
"write out partitioned thrift objects" in {
var job: Job = null;
def buildJob(args: Args): Job = {
job = new PartitionedParquetThriftWriteJob(args)
job
}
JobTest(buildJob(_))
.runHadoop
.finish()

val testMode = job.mode.asInstanceOf[HadoopTest]

val directory = new File(testMode.getWritePathFor(partitionSource))

directory.listFiles().map({ _.getName() }).toSet shouldBe Set("94111", "10075")

// check that the partitioning is done correctly by zipcode
validate(new Path(directory.getPath + "/94111/part-00000-00000-m-00000.parquet"),
new Address("123 Embarcadero", "94111"))
validate(new Path(directory.getPath + "/10075/part-00000-00001-m-00000.parquet"),
new Address("123 E 79th St", "10075"), new Address("456 W 80th St", "10075"))
}
}
}