From 91bb160211485ccab978897980d4628f370741a7 Mon Sep 17 00:00:00 2001 From: Idrees Khan Date: Tue, 24 Apr 2018 11:45:02 -0400 Subject: [PATCH] Add Proto BigDiffy example (#85) * Add Proto BigDiffy example * Update READMEs * Clean up some irrelevant changes * Change to Example Proto record, fix SBT settings * Scalastyle fixes * Add protoBufSettings to other projects * Add proto settings to CLI project --- README.md | 2 +- build.sbt | 13 +++-- .../com/spotify/ratatool/diffy/BigDiffy.scala | 31 ++++++------ ratatool-examples/README.md | 10 +++- .../src/main/protobuf/schemas.proto | 9 ++++ .../diffy/ProtobufBigDiffyExample.scala | 50 +++++++++++++++++++ .../GenExample.scala} | 2 +- .../examples/ExampleAvroGenTest.scala | 3 +- .../examples/ExampleTableRowGenTest.scala | 3 +- 9 files changed, 100 insertions(+), 23 deletions(-) create mode 100644 ratatool-examples/src/main/protobuf/schemas.proto create mode 100644 ratatool-examples/src/main/scala/com/spotify/ratatool/examples/diffy/ProtobufBigDiffyExample.scala rename ratatool-examples/src/main/scala/com/spotify/ratatool/examples/{Examples.scala => scalacheck/GenExample.scala} (99%) diff --git a/README.md b/README.md index ddfe6e55..dd6785bf 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ A tool for random data sampling and generation - [Samplers](https://github.com/spotify/ratatool/tree/master/ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers) - random data samplers for Avro, BigQuery and Parquet. True random sampling is supported for Avro only while head mode (sampling from the start) is supported for all sources. - [Diffy](https://github.com/spotify/ratatool/tree/master/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy) - field-level record diff tool for Avro, Protobuf and BigQuery TableRow. 
- [BigDiffy](https://github.com/spotify/ratatool/blob/master/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala) - [Scio](https://github.com/spotify/scio) library for pairwise field-level statistical diff of data sets. See [slides](http://www.lyh.me/slides/bigdiffy.html) for more. -- [Command line tool](https://github.com/spotify/ratatool/tree/master/ratatool-cli/src/main/scala/com/spotify/ratatool/tool) - command line tool for sampling from various sources. +- [Command line tool](https://github.com/spotify/ratatool/tree/master/ratatool-cli/src/main/scala/com/spotify/ratatool/tool) - command line tool for local sampler, or executing BigDiffy and BigSampler. # Usage diff --git a/build.sbt b/build.sbt index e2dbbb84..73bb80cc 100644 --- a/build.sbt +++ b/build.sbt @@ -128,6 +128,7 @@ lazy val ratatoolSampling = project ratatoolCommon % "compile->compile;test->test", ratatoolScalacheck % "test" ) + .settings(protoBufSettings) lazy val ratatoolDiffy = project .in(file("ratatool-diffy")) @@ -152,6 +153,7 @@ lazy val ratatoolDiffy = project ratatoolSampling, ratatoolScalacheck % "test" ) + .settings(protoBufSettings) lazy val ratatoolCli = project .in(file("ratatool-cli")) @@ -171,6 +173,7 @@ lazy val ratatoolCli = project ratatoolSampling, ratatoolDiffy ) + .settings(protoBufSettings) lazy val ratatoolScalacheck = project .in(file("ratatool-scalacheck")) @@ -186,6 +189,7 @@ lazy val ratatoolScalacheck = project ) .enablePlugins(ProtobufPlugin) .dependsOn(ratatoolCommon % "compile->compile;test->test") + .settings(protoBufSettings) lazy val ratatoolExamples = project .in(file("ratatool-examples")) @@ -196,11 +200,13 @@ lazy val ratatoolExamples = project "com.google.apis" % "google-api-services-bigquery" % bigqueryVersion ) ) - .enablePlugins(ProtobufPlugin, PackPlugin) + .enablePlugins(ProtobufPlugin) .dependsOn( ratatoolCommon, - ratatoolScalacheck + ratatoolScalacheck, + ratatoolDiffy ) + .settings(protoBufSettings) val root = 
project.in(file(".")) .settings(commonSettings ++ noPublishSettings) @@ -209,5 +215,6 @@ val root = project.in(file(".")) ratatoolScalacheck, ratatoolDiffy, ratatoolSampling, - ratatoolCli + ratatoolCli, + ratatoolExamples ) diff --git a/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala b/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala index ad9c822d..8669d79e 100644 --- a/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala +++ b/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala @@ -282,6 +282,22 @@ object BigDiffy extends Command { r.setFields(mergeFields(x.getFields.asScala, y.getFields.asScala).asJava) } + def saveStats[T](bigDiffy: BigDiffy[T], output: String, withHeader: Boolean = false): Unit = { + if (withHeader) { /* header variant: stats are stringified, column names passed along */ + bigDiffy.keyStats.map(_.toString).saveAsTextFileWithHeader(s"$output/keys", "key\tdifftype") + bigDiffy.fieldStats.map(_.toString).saveAsTextFileWithHeader(s"$output/fields", + "field\tcount\tfraction\tdeltaType\tmin" + + "\tmax\tcount\tmean\tvariance\tstddev\tskewness\tkurtosis") + bigDiffy.globalStats.map(_.toString).saveAsTextFileWithHeader(s"$output/global", + "numTotal\tnumSame\tnumDiff\tnumMissingLhs\tnumMissingRhs") + } + else { /* no header: saveAsTextFile is called directly on each stats collection */ + bigDiffy.keyStats.saveAsTextFile(s"$output/keys") + bigDiffy.fieldStats.saveAsTextFile(s"$output/fields") + bigDiffy.globalStats.saveAsTextFile(s"$output/global") + } + } /* saveStats targets three paths under output: keys, fields and global */ + private def mergeFields(x: Seq[TableFieldSchema], y: Seq[TableFieldSchema]): Seq[TableFieldSchema] = { val xMap = x.map(f => (f.getName, f)).toMap @@ -399,20 +415,7 @@ object BigDiffy extends Command { case m => throw new IllegalArgumentException(s"mode $m not supported") } - - if (header) { - result.keyStats.map(_.toString).saveAsTextFileWithHeader(s"$output/keys", "key\tdifftype") - result.fieldStats.map(_.toString).saveAsTextFileWithHeader(s"$output/fields", - "field\tcount\tfraction\tdeltaType\tmin" +
"\tmax\tcount\tmean\tvariance\tstddev\tskewness\tkurtosis") - result.globalStats.map(_.toString).saveAsTextFileWithHeader(s"$output/global", - "numTotal\tnumSame\tnumDiff\tnumMissingLhs\tnumMissingRhs") - } - else { - result.keyStats.saveAsTextFile(s"$output/keys") - result.fieldStats.saveAsTextFile(s"$output/fields") - result.globalStats.saveAsTextFile(s"$output/global") - } + saveStats(result, output, header) sc.close().waitUntilDone() } diff --git a/ratatool-examples/README.md b/ratatool-examples/README.md index 0a295f35..72c00fe4 100644 --- a/ratatool-examples/README.md +++ b/ratatool-examples/README.md @@ -1,9 +1,15 @@ Examples ======= -These example cover different use cases for generating Avro or TableRow data with Ratatool and Scalacheck. +## Scalacheck +These examples cover different use cases for generating Avro or TableRow data with Ratatool and Scalacheck. The constraints are based on arbitrary criteria defined for [Avro](https://github.com/spotify/ratatool/blob/master/ratatool-examples/src/main/avro/schema.avsc) and [BigQuery](https://github.com/spotify/ratatool/blob/master/ratatool-examples/src/main/resources/schema.json) which should mirror some real life use cases of generating data where some fields have expected values or behaviour. It is recommended to do some reading on ScalaCheck and how Generators work before digging into - these examples. Some resources are provided [here](https://github.com/spotify/ratatool/wiki/Generators). \ No newline at end of file + these examples. Some resources are provided [here](https://github.com/spotify/ratatool/wiki/Generators). + +## Diffy +Contains an example of using BigDiffy with Protobuf programmatically, as this is not currently supported + in the CLI. This should serve as a reasonable workaround for users to build their own specific pipelines + until a more generic version can be made. 
\ No newline at end of file diff --git a/ratatool-examples/src/main/protobuf/schemas.proto b/ratatool-examples/src/main/protobuf/schemas.proto new file mode 100644 index 00000000..bdf0cc6e --- /dev/null +++ b/ratatool-examples/src/main/protobuf/schemas.proto @@ -0,0 +1,9 @@ +syntax = "proto2"; + +option java_package = "com.spotify.ratatool.examples.proto"; +option optimize_for = SPEED; + +message ExampleRecord { + required string string_field = 1; + required int64 int64_field = 2; +} diff --git a/ratatool-examples/src/main/scala/com/spotify/ratatool/examples/diffy/ProtobufBigDiffyExample.scala b/ratatool-examples/src/main/scala/com/spotify/ratatool/examples/diffy/ProtobufBigDiffyExample.scala new file mode 100644 index 00000000..1ec1019e --- /dev/null +++ b/ratatool-examples/src/main/scala/com/spotify/ratatool/examples/diffy/ProtobufBigDiffyExample.scala @@ -0,0 +1,50 @@ +/* + * Copyright 2018 Spotify AB. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package com.spotify.ratatool.examples.diffy + +import java.net.URI + +import com.spotify.ratatool.GcsConfiguration +import com.spotify.ratatool.diffy.{BigDiffy, ProtoBufDiffy} +import com.spotify.ratatool.examples.proto.Schemas.ExampleRecord +import org.apache.hadoop.fs.{FileSystem, Path} +import com.spotify.scio._ + +object ProtobufBigDiffyExample { /* BigDiffy run from code over two ExampleRecord inputs */ + def recordKeyFn(t: ExampleRecord): String = { /* key fn handed to diffProtoBuf */ + t.getStringField + } + + def main(cmdlineArgs: Array[String]): Unit = { /* parse args, diff lhs vs rhs, save the stats */ + val (sc, args) = ContextAndArgs(cmdlineArgs) + + val (lhs, rhs, output, header, ignore, unordered) = + (args("lhs"), args("rhs"), args("output"), + args.boolean("with-header", false), args.list("ignore").toSet, + args.list("unordered").toSet) + + /* rhs goes to diffProtoBuf unchanged and nothing below needs a resolved file path, + * so no FileSystem glob is performed before building the diff. */ + val diffy = new ProtoBufDiffy[ExampleRecord](ignore, unordered) + val result = BigDiffy.diffProtoBuf[ExampleRecord](sc, lhs, rhs, recordKeyFn, diffy) + + BigDiffy.saveStats(result, output, header) + + sc.close().waitUntilDone() + } +} diff --git a/ratatool-examples/src/main/scala/com/spotify/ratatool/examples/Examples.scala b/ratatool-examples/src/main/scala/com/spotify/ratatool/examples/scalacheck/GenExample.scala similarity index 99% rename from ratatool-examples/src/main/scala/com/spotify/ratatool/examples/Examples.scala rename to ratatool-examples/src/main/scala/com/spotify/ratatool/examples/scalacheck/GenExample.scala index 64df4af7..d5e11c5e 100644 --- a/ratatool-examples/src/main/scala/com/spotify/ratatool/examples/Examples.scala +++ b/ratatool-examples/src/main/scala/com/spotify/ratatool/examples/scalacheck/GenExample.scala @@ -15,7 +15,7 @@ * under the License.
*/ -package com.spotify.ratatool.examples +package com.spotify.ratatool.examples.scalacheck import java.util diff --git a/ratatool-examples/src/test/scala/com/spotify/ratatool/examples/ExampleAvroGenTest.scala b/ratatool-examples/src/test/scala/com/spotify/ratatool/examples/ExampleAvroGenTest.scala index 5909c109..6479c82f 100644 --- a/ratatool-examples/src/test/scala/com/spotify/ratatool/examples/ExampleAvroGenTest.scala +++ b/ratatool-examples/src/test/scala/com/spotify/ratatool/examples/ExampleAvroGenTest.scala @@ -20,6 +20,7 @@ package com.spotify.ratatool.examples import java.util.UUID import com.spotify.ratatool.avro.specific.{EnumField, ExampleRecord} +import com.spotify.ratatool.examples.scalacheck.ExampleAvroGen import org.scalacheck.{Gen, Prop, Properties} import org.scalacheck.Prop.{AnyOperators, BooleanOperators, forAll} @@ -54,4 +55,4 @@ object ExampleAvroGenTest extends Properties("ExampleAvroGenerator") { val size = m.getNestedRecordField.getMapField.asScala.size size <= 5 && size >= 0 } -} \ No newline at end of file +} diff --git a/ratatool-examples/src/test/scala/com/spotify/ratatool/examples/ExampleTableRowGenTest.scala b/ratatool-examples/src/test/scala/com/spotify/ratatool/examples/ExampleTableRowGenTest.scala index 991c3161..1d8017de 100644 --- a/ratatool-examples/src/test/scala/com/spotify/ratatool/examples/ExampleTableRowGenTest.scala +++ b/ratatool-examples/src/test/scala/com/spotify/ratatool/examples/ExampleTableRowGenTest.scala @@ -18,6 +18,7 @@ package com.spotify.ratatool.examples import com.google.api.services.bigquery.model.TableRow +import com.spotify.ratatool.examples.scalacheck.ExampleTableRowGen import com.spotify.ratatool.scalacheck._ import org.scalacheck.{Gen, Properties} import org.scalacheck.Prop.{AnyOperators, forAll} @@ -43,4 +44,4 @@ object ExampleTableRowGenTest extends Properties("ExampleTableRowGenerator") { new String(b) ?= s } -} \ No newline at end of file +}