Skip to content

Commit

Permalink
Add Proto BigDiffy example (#85)
Browse files Browse the repository at this point in the history
* Add Proto BigDiffy example

* Update READMEs

* Clean up some irrelevant changes

* Change to Example Proto record, fix SBT settings

* Scalastyle fixes

* Add protoBufSettings to other projects

* Add proto settings to CLI project
  • Loading branch information
idreeskhan authored Apr 24, 2018
1 parent d75eda4 commit 91bb160
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 23 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ A tool for random data sampling and generation
- [Samplers](https://github.com/spotify/ratatool/tree/master/ratatool-sampling/src/main/scala/com/spotify/ratatool/samplers) - random data samplers for Avro, BigQuery and Parquet. True random sampling is supported for Avro only while head mode (sampling from the start) is supported for all sources.
- [Diffy](https://github.com/spotify/ratatool/tree/master/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy) - field-level record diff tool for Avro, Protobuf and BigQuery TableRow.
- [BigDiffy](https://github.com/spotify/ratatool/blob/master/ratatool-diffy/src/main/scala/com/spotify/ratatool/diffy/BigDiffy.scala) - [Scio](https://github.com/spotify/scio) library for pairwise field-level statistical diff of data sets. See [slides](http://www.lyh.me/slides/bigdiffy.html) for more.
- [Command line tool](https://github.com/spotify/ratatool/tree/master/ratatool-cli/src/main/scala/com/spotify/ratatool/tool) - command line tool for sampling from various sources.
- [Command line tool](https://github.com/spotify/ratatool/tree/master/ratatool-cli/src/main/scala/com/spotify/ratatool/tool) - command line tool for local sampling, or for executing BigDiffy and BigSampler.

# Usage

Expand Down
13 changes: 10 additions & 3 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ lazy val ratatoolSampling = project
ratatoolCommon % "compile->compile;test->test",
ratatoolScalacheck % "test"
)
.settings(protoBufSettings)

lazy val ratatoolDiffy = project
.in(file("ratatool-diffy"))
Expand All @@ -152,6 +153,7 @@ lazy val ratatoolDiffy = project
ratatoolSampling,
ratatoolScalacheck % "test"
)
.settings(protoBufSettings)

lazy val ratatoolCli = project
.in(file("ratatool-cli"))
Expand All @@ -171,6 +173,7 @@ lazy val ratatoolCli = project
ratatoolSampling,
ratatoolDiffy
)
.settings(protoBufSettings)

lazy val ratatoolScalacheck = project
.in(file("ratatool-scalacheck"))
Expand All @@ -186,6 +189,7 @@ lazy val ratatoolScalacheck = project
)
.enablePlugins(ProtobufPlugin)
.dependsOn(ratatoolCommon % "compile->compile;test->test")
.settings(protoBufSettings)

lazy val ratatoolExamples = project
.in(file("ratatool-examples"))
Expand All @@ -196,11 +200,13 @@ lazy val ratatoolExamples = project
"com.google.apis" % "google-api-services-bigquery" % bigqueryVersion
)
)
.enablePlugins(ProtobufPlugin, PackPlugin)
.enablePlugins(ProtobufPlugin)
.dependsOn(
ratatoolCommon,
ratatoolScalacheck
ratatoolScalacheck,
ratatoolDiffy
)
.settings(protoBufSettings)

val root = project.in(file("."))
.settings(commonSettings ++ noPublishSettings)
Expand All @@ -209,5 +215,6 @@ val root = project.in(file("."))
ratatoolScalacheck,
ratatoolDiffy,
ratatoolSampling,
ratatoolCli
ratatoolCli,
ratatoolExamples
)
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,22 @@ object BigDiffy extends Command {
r.setFields(mergeFields(x.getFields.asScala, y.getFields.asScala).asJava)
}

/**
 * Writes the three statistics collections of a [[BigDiffy]] result as text files.
 *
 * Output goes to the `keys`, `fields` and `global` sub-paths of `output`.
 *
 * @param bigDiffy   diff result whose `keyStats`, `fieldStats` and `globalStats` are saved
 * @param output     base output path; sub-paths are appended with a `/` separator
 * @param withHeader when true, each collection is converted to strings and written with a
 *                   tab-separated header line; when false (the default) the collections are
 *                   written as-is without a header
 */
def saveStats[T](bigDiffy: BigDiffy[T], output: String, withHeader: Boolean = false): Unit = {
  val keysPath = s"$output/keys"
  val fieldsPath = s"$output/fields"
  val globalPath = s"$output/global"

  if (!withHeader) {
    bigDiffy.keyStats.saveAsTextFile(keysPath)
    bigDiffy.fieldStats.saveAsTextFile(fieldsPath)
    bigDiffy.globalStats.saveAsTextFile(globalPath)
  } else {
    // Column names for each output, tab-separated.
    val keysHeader = "key\tdifftype"
    val fieldsHeader = "field\tcount\tfraction\tdeltaType\tmin" +
      "\tmax\tcount\tmean\tvariance\tstddev\tskewness\tkurtosis"
    val globalHeader = "numTotal\tnumSame\tnumDiff\tnumMissingLhs\tnumMissingRhs"

    bigDiffy.keyStats.map(_.toString).saveAsTextFileWithHeader(keysPath, keysHeader)
    bigDiffy.fieldStats.map(_.toString).saveAsTextFileWithHeader(fieldsPath, fieldsHeader)
    bigDiffy.globalStats.map(_.toString).saveAsTextFileWithHeader(globalPath, globalHeader)
  }
}

private def mergeFields(x: Seq[TableFieldSchema],
y: Seq[TableFieldSchema]): Seq[TableFieldSchema] = {
val xMap = x.map(f => (f.getName, f)).toMap
Expand Down Expand Up @@ -399,20 +415,7 @@ object BigDiffy extends Command {
case m =>
throw new IllegalArgumentException(s"mode $m not supported")
}

if (header) {
result.keyStats.map(_.toString).saveAsTextFileWithHeader(s"$output/keys", "key\tdifftype")
result.fieldStats.map(_.toString).saveAsTextFileWithHeader(s"$output/fields",
"field\tcount\tfraction\tdeltaType\tmin" +
"\tmax\tcount\tmean\tvariance\tstddev\tskewness\tkurtosis")
result.globalStats.map(_.toString).saveAsTextFileWithHeader(s"$output/global",
"numTotal\tnumSame\tnumDiff\tnumMissingLhs\tnumMissingRhs")
}
else {
result.keyStats.saveAsTextFile(s"$output/keys")
result.fieldStats.saveAsTextFile(s"$output/fields")
result.globalStats.saveAsTextFile(s"$output/global")
}
saveStats(result, output, header)

sc.close().waitUntilDone()
}
Expand Down
10 changes: 8 additions & 2 deletions ratatool-examples/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
Examples
=======

These example cover different use cases for generating Avro or TableRow data with Ratatool and Scalacheck.
## Scalacheck
These examples cover different use cases for generating Avro or TableRow data with Ratatool and Scalacheck.
The constraints are based on arbitrary criteria defined for [Avro](https://github.com/spotify/ratatool/blob/master/ratatool-examples/src/main/avro/schema.avsc)
and [BigQuery](https://github.com/spotify/ratatool/blob/master/ratatool-examples/src/main/resources/schema.json)
which should mirror some real life use cases of generating data where some fields have expected values
or behaviour. It is recommended to do some reading on ScalaCheck and how Generators work before digging into
these examples. Some resources are provided [here](https://github.com/spotify/ratatool/wiki/Generators).
these examples. Some resources are provided [here](https://github.com/spotify/ratatool/wiki/Generators).

## Diffy
This section contains an example of using BigDiffy with Protobuf programmatically, because Protobuf
input is not currently supported in the CLI. The example should serve as a reasonable workaround for
users who need to build their own specific pipelines until a more generic version can be made.
9 changes: 9 additions & 0 deletions ratatool-examples/src/main/protobuf/schemas.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Protobuf (proto2) schema for the Ratatool examples.
syntax = "proto2";

// Generated Java classes are placed in this package.
option java_package = "com.spotify.ratatool.examples.proto";
option optimize_for = SPEED;

// Example record with one string field and one 64-bit integer field; both are required.
message ExampleRecord {
// String field, tag 1.
required string string_field = 1;
// 64-bit integer field, tag 2.
required int64 int64_field = 2;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright 2018 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package com.spotify.ratatool.examples.diffy

import java.net.URI

import com.spotify.ratatool.GcsConfiguration
import com.spotify.ratatool.diffy.{BigDiffy, ProtoBufDiffy}
import com.spotify.ratatool.examples.proto.Schemas.ExampleRecord
import org.apache.hadoop.fs.{FileSystem, Path}
import com.spotify.scio._

/**
 * Example pipeline that runs [[BigDiffy]] programmatically on Protobuf `ExampleRecord` data.
 *
 * Command line arguments (parsed by `ContextAndArgs`):
 *  - `--lhs`, `--rhs`: the two inputs handed to `BigDiffy.diffProtoBuf`
 *  - `--output`: base path handed to `BigDiffy.saveStats`
 *  - `--with-header`: boolean, defaults to `false`; forwarded to `BigDiffy.saveStats`
 *  - `--ignore`, `--unordered`: lists, converted to sets and forwarded to `ProtoBufDiffy`
 */
object ProtobufBigDiffyExample {
  /**
   * Key function handed to `BigDiffy.diffProtoBuf`.
   *
   * @param t record to extract the key from
   * @return the record's `string_field` value
   */
  def recordKeyFn(t: ExampleRecord): String = t.getStringField

  /**
   * Builds the diff pipeline, saves its statistics and blocks until the pipeline is done.
   *
   * @param cmdlineArgs raw command line arguments, see the object documentation
   */
  def main(cmdlineArgs: Array[String]): Unit = {
    val (sc, args) = ContextAndArgs(cmdlineArgs)

    val (lhs, rhs, output, header, ignore, unordered) =
      (args("lhs"), args("rhs"), args("output"),
        args.boolean("with-header", false), args.list("ignore").toSet,
        args.list("unordered").toSet)

    val diffy = new ProtoBufDiffy[ExampleRecord](ignore, unordered)
    val result = BigDiffy.diffProtoBuf[ExampleRecord](sc, lhs, rhs, recordKeyFn, diffy)

    BigDiffy.saveStats(result, output, header)

    sc.close().waitUntilDone()
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* under the License.
*/

package com.spotify.ratatool.examples
package com.spotify.ratatool.examples.scalacheck

import java.util

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package com.spotify.ratatool.examples
import java.util.UUID

import com.spotify.ratatool.avro.specific.{EnumField, ExampleRecord}
import com.spotify.ratatool.examples.scalacheck.ExampleAvroGen
import org.scalacheck.{Gen, Prop, Properties}
import org.scalacheck.Prop.{AnyOperators, BooleanOperators, forAll}

Expand Down Expand Up @@ -54,4 +55,4 @@ object ExampleAvroGenTest extends Properties("ExampleAvroGenerator") {
val size = m.getNestedRecordField.getMapField.asScala.size
size <= 5 && size >= 0
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package com.spotify.ratatool.examples

import com.google.api.services.bigquery.model.TableRow
import com.spotify.ratatool.examples.scalacheck.ExampleTableRowGen
import com.spotify.ratatool.scalacheck._
import org.scalacheck.{Gen, Properties}
import org.scalacheck.Prop.{AnyOperators, forAll}
Expand All @@ -43,4 +44,4 @@ object ExampleTableRowGenTest extends Properties("ExampleTableRowGenerator") {
new String(b) ?= s
}

}
}

0 comments on commit 91bb160

Please sign in to comment.