From 0e8e645dd3ccd1dfc465b1f99e6eaf48514be3aa Mon Sep 17 00:00:00 2001
From: Anton Parkhomenko
Date: Mon, 20 Jul 2015 22:20:40 +0300
Subject: [PATCH 01/12] Move existing functionality into `derive` command
 (close #83)

---
 project/BuildSettings.scala            |   2 +-
 .../schemaguru/Main.scala              | 163 +++-------------
 .../schemaguru/cli/DeriveCommand.scala | 174 ++++++++++++++++++
 3 files changed, 201 insertions(+), 138 deletions(-)
 create mode 100644 src/main/scala/com.snowplowanalytics/schemaguru/cli/DeriveCommand.scala

diff --git a/project/BuildSettings.scala b/project/BuildSettings.scala
index be04aa9..6d556b1 100644
--- a/project/BuildSettings.scala
+++ b/project/BuildSettings.scala
@@ -20,7 +20,7 @@ object BuildSettings {
   // Common settings for all our projects
   lazy val commonSettings = Seq[Setting[_]](
     organization := "com.snowplowanalytics",
-    version := "0.2.0",
+    version := "0.3.0-M1",
     scalaVersion := "2.10.5",
     crossScalaVersions := Seq("2.10.5", "2.11.6"),
     scalacOptions := Seq("-deprecation", "-encoding", "utf8",
diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala b/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala
index d04c05d..3460341 100644
--- a/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala
+++ b/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala
@@ -12,24 +12,22 @@
  */
 package com.snowplowanalytics.schemaguru
 
-// Java
-import java.io.File
-import java.nio.file.{Files, Paths}
-
-// json4s
-import org.json4s._
-import org.json4s.jackson.JsonMethods._
-
 // Argot
 import org.clapper.argot._
 import org.clapper.argot.ArgotConverters._
 
-// This library
-import utils._
+import cli._
 
-object Main extends App with FileSystemJsonGetters {
-  val parser = new ArgotParser(
-    programName = "generated.ProjectSettings.name",
+object Main extends App {
+  private val commands =
+    """
+      |Currently supported subcommands are:
+      |db - use JSON Schema to generate DDL file for specific DB
+      |derive - use set of JSON instances to derive JSON Schema
+    """.stripMargin
+
+  private val parser = new ArgotParser(
+    programName = generated.ProjectSettings.name,
    compactUsage = true,
    preUsage = Some("%s: Version %s. 
Copyright (c) 2015, %s.".format( generated.ProjectSettings.name, @@ -38,134 +36,25 @@ object Main extends App with FileSystemJsonGetters { ) ) - val directoryArgument = parser.option[String](List("dir"), "directory", "Directory which contains JSONs to be converted") - val fileArgument = parser.option[String](List("file"), "file", "Single JSON instance to be converted") - val outputFileArgument = parser.option[String]("output", "file", "Output file") - val cardinalityArgument = parser.option[Int](List("enum"), "n", "Cardinality to evaluate enum property") - val ndjsonFlag = parser.flag[Boolean](List("ndjson"), "Expect ndjson format") - val schemaByArgument = parser.option[String](List("schema-by"), "JSON Path", "Path of Schema title") - val outputDirArgument = parser.option[String](List("output-dir"), "directory", "Directory path for multiple Schemas") - - // self-describing schema arguments - val vendorArgument = parser.option[String](List("vendor"), "name", "Vendor name for self-describing schema") - val nameArgument = parser.option[String](List("name"), "name", "Schema name for self-describing schema") - val versionArgument = parser.option[String](List("schemaver"), "version", "Schema version (in SchemaVer format) for self-describing schema") + val subcommand = parser.parameter[String]("subcommand", "Action to perform", false) + val helpFlag = parser.flag[Boolean](List("help"), "Output help and exit") // dummy flag, to get around https://github.com/bmc/argot/issues/7 + // Simulate subcommands with argot + val primaryArgs = args.take(1) // take only --help or subcommand + val subcommandArgs = args.drop(1) // hide another options from argot parser - parser.parse(args) - - // Get arguments for JSON Path segmentation and validate them - val segmentSchema = (schemaByArgument.value, outputDirArgument.value) match { - case (Some(jsonPath), Some(dirPath)) => Some((jsonPath, dirPath)) - case (None, None) => None - case _ => parser.usage("--schema-by and --output-dir arguments need to be used in conjunction.") - } - - // Get arguments for self-describing schema and validate them - val selfDescribing = (vendorArgument.value, nameArgument.value, versionArgument.value) match { - case (Some(vendor), name, version) => { - name match { - case None if (!segmentSchema.isDefined) => parser.usage("You need to specify --name OR segment schema.") - case Some(_) if (segmentSchema.isDefined) => parser.usage("You need to specify --name OR segment schema.") - case _ => () // we can omit name, but it must be - } - if (!vendor.matches("([A-Za-z0-9\\-\\_\\.]+)")) { - parser.usage("--vendor argument must consist of only letters, numbers, hyphens, underscores and dots") - } else if (name.isDefined && !name.get.matches("([A-Za-z0-9\\-\\_]+)")) { - parser.usage("--name argument must consist of only letters, numbers, hyphens and underscores") - } else if (version.isDefined && !version.get.matches("\\d+\\-\\d+\\-\\d+")) { - parser.usage("--schemaver argument must be in SchemaVer format (example: 1-1-0)") - } - Some(SelfDescribingSchema(vendor, name, version)) + try { + parser.parse(primaryArgs) + } catch { + case _: ArgotUsageException if helpFlag.value.getOrElse(false) => { + println(parser.usageString() + commands) + sys.exit(0) } - case (None, None, None) => None - case _ => parser.usage("--vendor, --name and --schemaver arguments need to be used in conjunction.") } - val enumCardinality = cardinalityArgument.value.getOrElse(0) - - // Check whether provided path exists - List(directoryArgument.value, 
fileArgument.value).flatten.headOption match { - case None => parser.usage("Either --dir or --file argument must be provided.") - case Some(path) => { - if (Files.exists(Paths.get(path))) () // everything is OK - else parser.usage(s"Path $path does exists") - } - } - - // Decide where and which files should be parsed - val jsonList: ValidJsonList = directoryArgument.value match { - case Some(dir) => ndjsonFlag.value match { - case Some(true) => getJsonsFromFolderWithNDFiles(dir) - case _ => getJsonsFromFolder(dir) - } - case None => fileArgument.value match { - case None => parser.usage("Either --dir or --file argument must be provided.") - case Some(file) => ndjsonFlag.value match { - case Some(true) => getJsonFromNDFile(file) - case _ => List(getJsonFromFile(file)) - } - } - } - - jsonList match { - case Nil => parser.usage("Directory does not contain any JSON files.") - case someJsons => { - segmentSchema match { - case None => { - val result = SchemaGuru.convertsJsonsToSchema(someJsons, enumCardinality) - outputResult(result, outputFileArgument.value, selfDescribing) - } - case Some((path, dir)) => { - val nameToJsonsMapping = JsonPathExtractor.mapByPath(path, jsonList) - nameToJsonsMapping map { - case (key, jsons) => { - val result = SchemaGuru.convertsJsonsToSchema(jsons, enumCardinality) - val fileName = key + ".json" - val file = - if (key == "$SchemaGuruFailed") None - else Some(new File(dir, fileName).getAbsolutePath) - outputResult(result, file, selfDescribing.map(_.copy(name = Some(key)))) - } - } - } - } - } - } - - /** - * Print Schema, warnings and errors - * - * @param result Schema Guru result containing all information - * @param outputFile optional path to file for schema output - * @param selfDescribingInfo optional info to make shema self-describing - */ - def outputResult(result: SchemaGuruResult, outputFile: Option[String], selfDescribingInfo: Option[SelfDescribingSchema]): Unit = { - // Make schema self-describing if necessary - val schema: JValue = selfDescribingInfo match { - case None => result.schema - case Some(description) => description.descriptSchema(result.schema) - } - - // Print JsonSchema to file or stdout - outputFile match { - case Some(file) => { - val output = new java.io.PrintWriter(file) - output.write(pretty(render(schema))) - output.close() - } - case None => println(pretty(render(schema))) - } - - // Print errors - if (!result.errors.isEmpty) { - println("\nErrors:\n " + result.errors.mkString("\n")) - } - - // Print warnings - result.warning match { - case Some(warning) => println(warning.consoleMessage) - case _ => - } + subcommand.value match { + case Some("derive") => DeriveCommand(subcommandArgs) + case Some("db") => DBCommand(subcommandArgs) + case _ => parser.usage("You need to specify subcommand.\n" + commands) } } diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DeriveCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DeriveCommand.scala new file mode 100644 index 0000000..0befc79 --- /dev/null +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DeriveCommand.scala @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.schemaguru +package cli + +// Java +import java.io.File +import java.nio.file.{ Files, Paths } + +// json4s +import org.json4s._ +import org.json4s.jackson.JsonMethods._ + +// Argot +import org.clapper.argot._ +import org.clapper.argot.ArgotConverters._ + +// This library +import utils._ + +/** + * Holds all information passed with CLI and decides how to produce + * JSON Schema + * + * @param args array of arguments passed via CLI + */ +class DeriveCommand(args: Array[String]) extends FileSystemJsonGetters { + val parser = new ArgotParser(programName = generated.ProjectSettings.name + " derive", compactUsage = true) + + // primary subcommand's options and arguments + val directoryArgument = parser.option[String](List("dir"), "directory", "Directory which contains JSONs to be converted") + val fileArgument = parser.option[String](List("file"), "file", "Single JSON instance to be converted") + val outputFileArgument = parser.option[String]("output", "file", "Output file") + val cardinalityArgument = parser.option[Int](List("enum"), "n", "Cardinality to evaluate enum property") + val ndjsonFlag = parser.flag[Boolean](List("ndjson"), "Expect ndjson format") + val schemaByArgument = parser.option[String](List("schema-by"), "JSON Path", "Path of Schema title") + val outputDirArgument = parser.option[String](List("output-dir"), "directory", "Directory path for multiple Schemas") + + // self-describing schema arguments + val vendorArgument = parser.option[String](List("vendor"), "name", "Vendor name for self-describing schema") + val nameArgument = parser.option[String](List("name"), "name", "Schema name for self-describing schema") + val versionArgument = parser.option[String](List("schemaver"), "version", "Schema version (in SchemaVer format) for self-describing schema") + + parser.parse(args) + + // Get arguments for JSON Path segmentation and validate them + val segmentSchema = (schemaByArgument.value, outputDirArgument.value) match { + case (Some(jsonPath), Some(dirPath)) => Some((jsonPath, dirPath)) + case (None, None) => None + case _ => parser.usage("--schema-by and --output-dir arguments need to be used in conjunction.") + } + + // Get arguments for self-describing schema and validate them + val selfDescribing = (vendorArgument.value, nameArgument.value, versionArgument.value) match { + case (Some(vendor), name, version) => { + name match { + case None if (!segmentSchema.isDefined) => parser.usage("You need to specify --name OR segment schema.") + case Some(_) if (segmentSchema.isDefined) => parser.usage("You need to specify --name OR segment schema.") + case _ => () // we can omit name, but it must be + } + if (!vendor.matches("([A-Za-z0-9\\-\\_\\.]+)")) { + parser.usage("--vendor argument must consist of only letters, numbers, hyphens, underscores and dots") + } else if (name.isDefined && !name.get.matches("([A-Za-z0-9\\-\\_]+)")) { + parser.usage("--name argument must consist of only letters, numbers, hyphens and underscores") + } else if (version.isDefined && !version.get.matches("\\d+\\-\\d+\\-\\d+")) { + parser.usage("--schemaver argument must be in SchemaVer format (example: 1-1-0)") + } + 
Some(SelfDescribingSchema(vendor, name, version)) + } + case (None, None, None) => None + case _ => parser.usage("--vendor, --name and --schemaver arguments need to be used in conjunction.") + } + + val enumCardinality = cardinalityArgument.value.getOrElse(0) + + // Check whether provided path exists + List(directoryArgument.value, fileArgument.value).flatten.headOption match { + case None => parser.usage("Either --dir or --file argument must be provided.") + case Some(path) => { + if (Files.exists(Paths.get(path))) () // everything is OK + else parser.usage(s"Path $path does exists") + } + } + + // Decide where and which files should be parsed + val jsonList: ValidJsonList = directoryArgument.value match { + case Some(dir) => ndjsonFlag.value match { + case Some(true) => getJsonsFromFolderWithNDFiles(dir) + case _ => getJsonsFromFolder(dir) + } + case None => fileArgument.value match { + case None => parser.usage("Either --dir or --file argument must be provided.") + case Some(file) => ndjsonFlag.value match { + case Some(true) => getJsonFromNDFile(file) + case _ => List(getJsonFromFile(file)) + } + } + } + + jsonList match { + case Nil => parser.usage("Directory does not contain any JSON files.") + case someJsons => { + segmentSchema match { + case None => { + val result = SchemaGuru.convertsJsonsToSchema(someJsons, enumCardinality) + outputResult(result, outputFileArgument.value, selfDescribing) + } + case Some((path, dir)) => { + val nameToJsonsMapping = JsonPathExtractor.mapByPath(path, jsonList) + nameToJsonsMapping map { + case (key, jsons) => { + val result = SchemaGuru.convertsJsonsToSchema(jsons, enumCardinality) + val fileName = key + ".json" + val file = + if (key == "$SchemaGuruFailed") None + else Some(new File(dir, fileName).getAbsolutePath) + outputResult(result, file, selfDescribing.map(_.copy(name = Some(key)))) + } + } + } + } + } + + /** + * Print Schema, warnings and errors + * + * @param result Schema Guru result containing all information + * @param outputFile optional path to file for schema output + * @param selfDescribingInfo optional info to make shema self-describing + */ + def outputResult(result: SchemaGuruResult, outputFile: Option[String], selfDescribingInfo: Option[SelfDescribingSchema]): Unit = { + // Make schema self-describing if necessary + val schema: JValue = selfDescribingInfo match { + case None => result.schema + case Some(description) => description.descriptSchema(result.schema) + } + + // Print JsonSchema to file or stdout + outputFile match { + case Some(file) => { + val output = new java.io.PrintWriter(file) + output.write(pretty(render(schema))) + output.close() + } + case None => println(pretty(render(schema))) + } + + // Print errors + if (!result.errors.isEmpty) { + println("\nErrors:\n " + result.errors.mkString("\n")) + } + + // Print warnings + result.warning match { + case Some(warning) => println(warning.consoleMessage) + case _ => + } + } + } +} + +object DeriveCommand { + def apply(args: Array[String]) = new DeriveCommand(args) +} From e59899fe16dd95afad3fb5989c4d191da7855107 Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Mon, 20 Jul 2015 22:23:54 +0300 Subject: [PATCH 02/12] Add `ddl` command which generates JSON Paths files and Redshift DDL (close #84) --- project/Dependencies.scala | 2 + project/SchemaGuruBuild.scala | 1 + .../schemaguru/Main.scala | 5 +- .../schemaguru/cli/DdlCommand.scala | 188 ++++++++++++++++++ 4 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 
src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 25feb93..04d44a3 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -39,6 +39,7 @@ object Dependencies { val specs2 = "2.3.13" val scalazSpecs2 = "0.2" val scalaCheck = "1.12.2" + val igluutils = "0.1.0-M1" } object Libraries { @@ -55,6 +56,7 @@ object Dependencies { val json4sJackson = "org.json4s" %% "json4s-jackson" % V.json4s val json4sScalaz = "org.json4s" %% "json4s-scalaz" % V.json4s val jsonpath = "io.gatling" %% "jsonpath" % V.jsonpath + val igluutils = "com.snowplowanalytics" %% "iglu-utils" % V.igluutils // Spray val akka = "com.typesafe.akka" %% "akka-actor" % V.akka val sprayCan = "io.spray" %% "spray-can" % V.spray diff --git a/project/SchemaGuruBuild.scala b/project/SchemaGuruBuild.scala index f23e445..6494508 100644 --- a/project/SchemaGuruBuild.scala +++ b/project/SchemaGuruBuild.scala @@ -45,6 +45,7 @@ object SchemaGuruBuild extends Build { Libraries.json4sJackson, Libraries.json4sScalaz, Libraries.jsonpath, + Libraries.igluutils, // Scala (test only) Libraries.specs2, Libraries.scalazSpecs2, diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala b/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala index 3460341..fad762e 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala @@ -16,13 +16,14 @@ package com.snowplowanalytics.schemaguru import org.clapper.argot._ import org.clapper.argot.ArgotConverters._ +// This library import cli._ object Main extends App { private val commands = """ |Currently supported subcommands are: - |db - use JSON Schema to generate DDL file for specific DB + |ddl - use JSON Schema to generate DDL file for specific DB |derive - use set of JSON instances to derive JSON Schema """.stripMargin @@ -54,7 +55,7 @@ object Main extends App { subcommand.value match { case Some("derive") => DeriveCommand(subcommandArgs) - case Some("db") => DBCommand(subcommandArgs) + case Some("ddl") => DdlCommand(subcommandArgs) case _ => parser.usage("You need to specify subcommand.\n" + commands) } } diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala new file mode 100644 index 0000000..b8b39fb --- /dev/null +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.schemaguru +package cli + +// Scalaz +import scalaz._ +import Scalaz._ + +// Java +import java.io.File + +// Argot +import org.clapper.argot._ +import ArgotConverters._ + +// Igluutils +import com.snowplowanalytics.igluutils._ +import com.snowplowanalytics.igluutils.generators.{ + JsonPathGenerator => JPG, + SchemaFlattener => SF +} +import com.snowplowanalytics.igluutils.generators.redshift.{ RedshiftDdlGenerator => RDG } +import com.snowplowanalytics.igluutils.utils.{ FileUtils => FU } + +/** + * Holds all information passed with CLI and decides how to produce + * DDL and JSON Paths + * + * @param args array of arguments passed via CLI + */ +class DdlCommand(args: Array[String]) { + import DdlCommand._ + + val parser = new ArgotParser( + programName = generated.ProjectSettings.name + " db", + compactUsage = true + ) + + // Set all arguments + val pathOption = parser.option[File](List("path"), "schema", "Path to schema or directory with schemas") + val destinationOption = parser.option[File](List("dest"), "path", "Destination path") + val dbOption = parser.option[String](List("db"), "name", "For which DB we need to produce DDL (default: redshift)") + val withJsonPathsFlag = parser.flag("with-json-paths", false, "Produce JSON Paths files with DDL") + val rawModeFlag = parser.flag("raw", false, "Produce raw DDL without Snowplow-specific data") + val schemaOption = parser.option[String](List("schema"), "name", "Redshift schema name") + val sizeOption = parser.option[Int](List("size"), "n", "Default size for varchar data type") + val splitProductFlag = parser.flag("splitProduct", false, "Split product types into different keys") + + parser.parse(args) + + // Get all arguments + val file = pathOption.value.getOrElse { parser.usage("--path is required option") } + val destination = destinationOption.value.getOrElse(new File("./testing")) + val db = dbOption.value.getOrElse("redshift") + val withJsonPaths = withJsonPathsFlag.value.getOrElse(false) + val rawMode = rawModeFlag.value.getOrElse(false) + val schemaName = schemaOption.value + val size = sizeOption.value.getOrElse(255) + val splitProduct = splitProductFlag.value.getOrElse(false) + + // Check how to handle path + if (file.isDirectory) { + fetchAndParseFromDirectory(file) + } else { + fetchAndParseFromFile(file) + } + + /** + * Get all files from specified ``dir`` and tries to fetch, process and + * output JSON Path and DDL from all found files + * + * @param dir directory with JSON Schemas + */ + private def fetchAndParseFromDirectory(dir: File): Unit = { + val schemas = FU.listSchemas(dir) + schemas.map(fetchAndParseFromFile(_)) + } + + /** + * Fetch JSON Schema from specified ``file``, process and output JSON Path + * and DDL + * + * @param file file with JSON Schema + */ + private def fetchAndParseFromFile(file: File): Unit = { + processFile(file) match { + case Success((jsonPathLines, redshiftLines, warningLines, combined)) => + output(jsonPathLines, redshiftLines, warningLines, combined) + case Failure(str) => { + println(s"Error in [${file.getAbsolutePath}]") + println(str) + sys.exit(1) + } + } + } + + /** + * Core function producing JSON Paths file, DDL, warnings and path + * + * @param file JSON Schema file + * @return all validated information as tuple + */ + def processFile(file: File): Validation[String, (List[String], List[String], List[String], (String, String))] = { + for { + json <- FU.getJsonFromFile(file) + flatSchema <- SF.flattenJsonSchema(json, splitProduct) + } yield { + val 
combined = getFileName(flatSchema.self) + + val ddl = db match { + case "redshift" => RDG.getRedshiftDdl(flatSchema, schemaName, size, rawMode) + case otherDb => parser.usage(s"Error: DDL generation for $otherDb is not supported yet") + } + val jsonPathLines = JPG.getJsonPathsFile(flatSchema) + + (jsonPathLines, ddl.content.split("\n").toList, ddl.warnings, combined) + } + } + + /** + * Outputs JSON Path file and DDL file to files in ``destination`` + * or prints errors + * + * @param jpf list of JSON Paths + * @param rdf Validated list of DDL lines + * @param combined vendor and filename + */ + private def output(jpf: List[String], rdf: List[String], warnings: List[String], combined: (String, String)): Unit = { + val (vendor, file) = combined + + val ddlDir = new File(destination, "sql/" + vendor).getAbsolutePath + FU.writeListToFile(file + ".sql", ddlDir, rdf).map(println) + + if (withJsonPaths) { + val jsonPathDir = new File(destination, "jsonpaths/" + vendor).getAbsolutePath + FU.writeListToFile(file + ".json", jsonPathDir, jpf).map(println) + } + if (!warnings.isEmpty) { + for { warning <- warnings } println("WARNING: " + warning) + } + } + + /** + * Function to implicitly convert string with path argument to File + * + * @param path valid path to file + * @param opt command-line argument + * @return Java's File if it exists + */ + private implicit def convertFilePath(path: String, opt: CommandLineArgument[File]): File = { + val file = new File(path) + if (!file.exists) { + parser.usage(s"Input file [$path] does not exist.") + } + file + } +} + +object DdlCommand { + def apply(args: Array[String]) = new DdlCommand(args) + + /** + * Get the file path and name from self-describing info + * Like com.mailchimp/subscribe_1 + * + * @param flatSelfElems all information from Self-describing schema + * @return relative filepath + */ + private def getFileName(flatSelfElems: SelfDescInfo): (String, String) = { + // Make the file name + val file = flatSelfElems.name.replaceAll("([^A-Z_])([A-Z])", "$1_$2").toLowerCase.concat("_1") + + // Return the vendor and the file name together + (flatSelfElems.vendor, file) + } + +} From 0e37a4d7453a9751b79f3ddd8d18d8c119bcfbca Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Tue, 21 Jul 2015 20:08:32 +0300 Subject: [PATCH 03/12] Unify CLI options (close #90) --- .../schemaguru/Main.scala | 29 +++---- .../schemaguru/cli/DdlCommand.scala | 51 +++++-------- .../schemaguru/cli/GuruCommand.scala | 50 +++++++++++++ ...eriveCommand.scala => SchemaCommand.scala} | 75 +++++++++---------- 4 files changed, 120 insertions(+), 85 deletions(-) create mode 100644 src/main/scala/com.snowplowanalytics/schemaguru/cli/GuruCommand.scala rename src/main/scala/com.snowplowanalytics/schemaguru/cli/{DeriveCommand.scala => SchemaCommand.scala} (65%) diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala b/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala index fad762e..1d8ffa3 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/Main.scala @@ -20,12 +20,14 @@ import org.clapper.argot.ArgotConverters._ import cli._ object Main extends App { - private val commands = - """ - |Currently supported subcommands are: - |ddl - use JSON Schema to generate DDL file for specific DB - |derive - use set of JSON instances to derive JSON Schema - """.stripMargin + // List of all supported commands + private val commandsList = List(DdlCommand, SchemaCommand) // companion objects with static info 
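+  // A sketch of what the next line builds (editorial illustration, not part
+  // of the original commit): with DdlCommand.title == "ddl" and
+  // SchemaCommand.title == "schema", commandsMap works out to
+  //   Map("ddl" -> DdlCommand, "schema" -> SchemaCommand),
+  // so Main can look up the first CLI token and hand the remaining
+  // arguments to that command's own ArgotParser.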
+ private val commandsMap: Map[String, GuruCommand] = (for { c <- commandsList } yield (c.title, c)).toMap + + // Help message + private val subcommandsHelp = "Subcommands are:\n" + commandsMap.map { + case (title, command) => title + " - " + command.description + }.mkString("\n") private val parser = new ArgotParser( programName = generated.ProjectSettings.name, @@ -42,20 +44,19 @@ object Main extends App { // Simulate subcommands with argot val primaryArgs = args.take(1) // take only --help or subcommand - val subcommandArgs = args.drop(1) // hide another options from argot parser - + val subcommandArgs = args.drop(1) // subcommand arguments try { parser.parse(primaryArgs) } catch { case _: ArgotUsageException if helpFlag.value.getOrElse(false) => { - println(parser.usageString() + commands) + println(parser.usageString() + "\n" + subcommandsHelp) sys.exit(0) } } - subcommand.value match { - case Some("derive") => DeriveCommand(subcommandArgs) - case Some("ddl") => DdlCommand(subcommandArgs) - case _ => parser.usage("You need to specify subcommand.\n" + commands) + // Find command in commandsMap and execute it with args + subcommand.value.flatMap(commandsMap.get(_)) match { + case Some(command) => command(subcommandArgs) + case _ => parser.usage("You need to specify subcommand.\n" + subcommandsHelp) } -} +} \ No newline at end of file diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala index b8b39fb..f2dab9f 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala @@ -39,29 +39,26 @@ import com.snowplowanalytics.igluutils.utils.{ FileUtils => FU } * * @param args array of arguments passed via CLI */ -class DdlCommand(args: Array[String]) { +class DdlCommand(val args: Array[String]) { import DdlCommand._ - val parser = new ArgotParser( - programName = generated.ProjectSettings.name + " db", - compactUsage = true - ) + // Required + val inputArgument = parser.parameter[File]("input", "Path to schema or directory with schemas", false) // Set all arguments - val pathOption = parser.option[File](List("path"), "schema", "Path to schema or directory with schemas") - val destinationOption = parser.option[File](List("dest"), "path", "Destination path") + val outputOption = parser.option[File](List("output"), "path", "Destination directory") val dbOption = parser.option[String](List("db"), "name", "For which DB we need to produce DDL (default: redshift)") val withJsonPathsFlag = parser.flag("with-json-paths", false, "Produce JSON Paths files with DDL") val rawModeFlag = parser.flag("raw", false, "Produce raw DDL without Snowplow-specific data") val schemaOption = parser.option[String](List("schema"), "name", "Redshift schema name") val sizeOption = parser.option[Int](List("size"), "n", "Default size for varchar data type") - val splitProductFlag = parser.flag("splitProduct", false, "Split product types into different keys") + val splitProductFlag = parser.flag("split-product", false, "Split product types into different keys") parser.parse(args) // Get all arguments - val file = pathOption.value.getOrElse { parser.usage("--path is required option") } - val destination = destinationOption.value.getOrElse(new File("./testing")) + val input = inputArgument.value.get // isn't optional + val outputPath = outputOption.value.getOrElse(new File(".")) val db = dbOption.value.getOrElse("redshift") val 
withJsonPaths = withJsonPathsFlag.value.getOrElse(false) val rawMode = rawModeFlag.value.getOrElse(false) @@ -70,10 +67,10 @@ class DdlCommand(args: Array[String]) { val splitProduct = splitProductFlag.value.getOrElse(false) // Check how to handle path - if (file.isDirectory) { - fetchAndParseFromDirectory(file) + if (input.isDirectory) { + fetchAndParseFromDirectory(input) } else { - fetchAndParseFromFile(file) + fetchAndParseFromFile(input) } /** @@ -139,35 +136,27 @@ class DdlCommand(args: Array[String]) { private def output(jpf: List[String], rdf: List[String], warnings: List[String], combined: (String, String)): Unit = { val (vendor, file) = combined - val ddlDir = new File(destination, "sql/" + vendor).getAbsolutePath + val ddlDir = new File(outputPath, "sql/" + vendor).getAbsolutePath FU.writeListToFile(file + ".sql", ddlDir, rdf).map(println) if (withJsonPaths) { - val jsonPathDir = new File(destination, "jsonpaths/" + vendor).getAbsolutePath + val jsonPathDir = new File(outputPath, "jsonpaths/" + vendor).getAbsolutePath FU.writeListToFile(file + ".json", jsonPathDir, jpf).map(println) } if (!warnings.isEmpty) { for { warning <- warnings } println("WARNING: " + warning) } } - - /** - * Function to implicitly convert string with path argument to File - * - * @param path valid path to file - * @param opt command-line argument - * @return Java's File if it exists - */ - private implicit def convertFilePath(path: String, opt: CommandLineArgument[File]): File = { - val file = new File(path) - if (!file.exists) { - parser.usage(s"Input file [$path] does not exist.") - } - file - } } -object DdlCommand { +/** + * Companion object holding all static information about command + */ +object DdlCommand extends GuruCommand { + val title = "ddl" + val description = "Derive DDL using JSON Schema" + val parser = new ArgotParser(programName = generated.ProjectSettings.name + " " + title, compactUsage = true) + def apply(args: Array[String]) = new DdlCommand(args) /** diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/GuruCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/GuruCommand.scala new file mode 100644 index 0000000..ac94b14 --- /dev/null +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/GuruCommand.scala @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.schemaguru +package cli + +// Java +import java.io.File + +// Argot +import org.clapper.argot._ + +trait GuruCommand { + // Helper method + def apply(args: Array[String]) + + // Subcommand itself + val title: String + + // Description for --help + val description: String + + // Every subcommand has it's own parser + val parser: ArgotParser + + /** + * Function to implicitly convert string with path argument to File + * + * @param path valid path to file + * @param opt command-line argument + * @return Java's File if it exists + */ + implicit def convertFilePath(path: String, opt: CommandLineArgument[File]): File = { + val file = new File(path) + if (!file.exists) { + parser.usage(s"Input file [$path] does not exist.") + } + file + } + +} diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DeriveCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/SchemaCommand.scala similarity index 65% rename from src/main/scala/com.snowplowanalytics/schemaguru/cli/DeriveCommand.scala rename to src/main/scala/com.snowplowanalytics/schemaguru/cli/SchemaCommand.scala index 0befc79..bfa40fd 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DeriveCommand.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/SchemaCommand.scala @@ -15,7 +15,6 @@ package cli // Java import java.io.File -import java.nio.file.{ Files, Paths } // json4s import org.json4s._ @@ -34,34 +33,36 @@ import utils._ * * @param args array of arguments passed via CLI */ -class DeriveCommand(args: Array[String]) extends FileSystemJsonGetters { - val parser = new ArgotParser(programName = generated.ProjectSettings.name + " derive", compactUsage = true) +class SchemaCommand(val args: Array[String]) extends FileSystemJsonGetters { + import SchemaCommand._ + + // Required + val inputArgument = parser.parameter[File]("input", "Path to schema or directory with schemas", false) // primary subcommand's options and arguments - val directoryArgument = parser.option[String](List("dir"), "directory", "Directory which contains JSONs to be converted") - val fileArgument = parser.option[String](List("file"), "file", "Single JSON instance to be converted") - val outputFileArgument = parser.option[String]("output", "file", "Output file") - val cardinalityArgument = parser.option[Int](List("enum"), "n", "Cardinality to evaluate enum property") + val outputOption = parser.option[String]("output", "path", "Output file (print to stdout otherwise)") + val cardinalityOption = parser.option[Int](List("enum"), "n", "Cardinality to evaluate enum property") val ndjsonFlag = parser.flag[Boolean](List("ndjson"), "Expect ndjson format") - val schemaByArgument = parser.option[String](List("schema-by"), "JSON Path", "Path of Schema title") - val outputDirArgument = parser.option[String](List("output-dir"), "directory", "Directory path for multiple Schemas") + val schemaByOption = parser.option[String](List("schema-by"), "JSON Path", "Path of Schema title") // self-describing schema arguments - val vendorArgument = parser.option[String](List("vendor"), "name", "Vendor name for self-describing schema") - val nameArgument = parser.option[String](List("name"), "name", "Schema name for self-describing schema") - val versionArgument = parser.option[String](List("schemaver"), "version", "Schema version (in SchemaVer format) for self-describing schema") + val vendorOption = parser.option[String](List("vendor"), "name", "Vendor name for self-describing schema") + val nameOption = 
parser.option[String](List("name"), "name", "Schema name for self-describing schema") + val versionOption = parser.option[String](List("schemaver"), "version", "Schema version (in SchemaVer format) for self-describing schema") parser.parse(args) + val input = inputArgument.value.get // isn't optional + // Get arguments for JSON Path segmentation and validate them - val segmentSchema = (schemaByArgument.value, outputDirArgument.value) match { + val segmentSchema = (schemaByOption.value, outputOption.value) match { case (Some(jsonPath), Some(dirPath)) => Some((jsonPath, dirPath)) - case (None, None) => None - case _ => parser.usage("--schema-by and --output-dir arguments need to be used in conjunction.") + case (Some(jsonPath), None) => Some((jsonPath, ".")) + case _ => None } // Get arguments for self-describing schema and validate them - val selfDescribing = (vendorArgument.value, nameArgument.value, versionArgument.value) match { + val selfDescribing = (vendorOption.value, nameOption.value, versionOption.value) match { case (Some(vendor), name, version) => { name match { case None if (!segmentSchema.isDefined) => parser.usage("You need to specify --name OR segment schema.") @@ -81,31 +82,18 @@ class DeriveCommand(args: Array[String]) extends FileSystemJsonGetters { case _ => parser.usage("--vendor, --name and --schemaver arguments need to be used in conjunction.") } - val enumCardinality = cardinalityArgument.value.getOrElse(0) - - // Check whether provided path exists - List(directoryArgument.value, fileArgument.value).flatten.headOption match { - case None => parser.usage("Either --dir or --file argument must be provided.") - case Some(path) => { - if (Files.exists(Paths.get(path))) () // everything is OK - else parser.usage(s"Path $path does exists") - } - } + val enumCardinality = cardinalityOption.value.getOrElse(0) // Decide where and which files should be parsed - val jsonList: ValidJsonList = directoryArgument.value match { - case Some(dir) => ndjsonFlag.value match { - case Some(true) => getJsonsFromFolderWithNDFiles(dir) - case _ => getJsonsFromFolder(dir) + val jsonList: ValidJsonList = + if (input.isDirectory) ndjsonFlag.value match { + case Some(true) => getJsonsFromFolderWithNDFiles(input.getAbsolutePath) + case _ => getJsonsFromFolder(input.getAbsolutePath) } - case None => fileArgument.value match { - case None => parser.usage("Either --dir or --file argument must be provided.") - case Some(file) => ndjsonFlag.value match { - case Some(true) => getJsonFromNDFile(file) - case _ => List(getJsonFromFile(file)) - } + else ndjsonFlag.value match { + case Some(true) => getJsonFromNDFile(input.getAbsolutePath) + case _ => List(getJsonFromFile(input.getAbsolutePath)) } - } jsonList match { case Nil => parser.usage("Directory does not contain any JSON files.") @@ -113,7 +101,7 @@ class DeriveCommand(args: Array[String]) extends FileSystemJsonGetters { segmentSchema match { case None => { val result = SchemaGuru.convertsJsonsToSchema(someJsons, enumCardinality) - outputResult(result, outputFileArgument.value, selfDescribing) + outputResult(result, outputOption.value, selfDescribing) } case Some((path, dir)) => { val nameToJsonsMapping = JsonPathExtractor.mapByPath(path, jsonList) @@ -169,6 +157,13 @@ class DeriveCommand(args: Array[String]) extends FileSystemJsonGetters { } } -object DeriveCommand { - def apply(args: Array[String]) = new DeriveCommand(args) +/** + * Companion object holding all static information about command + */ +object SchemaCommand extends GuruCommand { + val 
title = "schema"
+  val description = "Derive JSON Schema from set of JSON instances"
+  val parser = new ArgotParser(programName = generated.ProjectSettings.name + " " + title, compactUsage = true)
+
+  def apply(args: Array[String]) = new SchemaCommand(args)
 }

From c87df24d504d915c27ed044b47656e63cfba394c Mon Sep 17 00:00:00 2001
From: Anton Parkhomenko
Date: Tue, 21 Jul 2015 20:13:28 +0300
Subject: [PATCH 04/12] Change default SchemaVer to 1-0-0 (close #80)

---
 README.md                                       | 2 +-
 .../schemaguru/utils/SelfDescribingSchema.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index cb5a864..77e33bb 100644
--- a/README.md
+++ b/README.md
@@ -114,7 +114,7 @@ Now just create a new Docker app in the **[Elastic Beanstalk Console] [beanstalk
 ### Self-describing JSON
 Schema Guru allows you to produce **[Self-describing JSON Schema] [self-describing]**.
-To produce it you need to specify vendor, name (if segmentation isn't using, see below), and version (optional, default value is 0-1-0).
+To produce it you need to specify vendor, name (if segmentation isn't used, see below), and version (optional, default value is 1-0-0).
 
 ```bash
 $ ./schema-guru-0.2.0 --dir {{jsons_directory}} --vendor {{your_company}} --name {{schema_name}} --schemaver {{version}}
diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/utils/SelfDescribingSchema.scala b/src/main/scala/com.snowplowanalytics/schemaguru/utils/SelfDescribingSchema.scala
index 0e9c6df..39fb968 100644
--- a/src/main/scala/com.snowplowanalytics/schemaguru/utils/SelfDescribingSchema.scala
+++ b/src/main/scala/com.snowplowanalytics/schemaguru/utils/SelfDescribingSchema.scala
@@ -33,7 +33,7 @@ case class SelfDescribingSchema(vendor: String, name: Option[String], version: O
   val selfObject: JObject =
     ("self",
       (("vendor", vendor): JObject) ~
       (("name", name.getOrElse("unspecified")): JObject) ~
-      (("version", version.getOrElse("0-1-0")): JObject) ~
+      (("version", version.getOrElse("1-0-0")): JObject) ~
       ("format", "jsonschema"))
 
   uri.merge(selfObject).merge(schema)

From 38199af2d04d6ca7f300e7eda6ee2a6124ea5be0 Mon Sep 17 00:00:00 2001
From: Anton Parkhomenko
Date: Tue, 21 Jul 2015 21:38:42 +0300
Subject: [PATCH 05/12] Don't check for .ndjson extension when --ndjson set
 (close #74)

---
 .../schemaguru/utils/FileSystemJsonGetters.scala | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala b/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala
index 34bd15e..a5ad4b3 100644
--- a/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala
+++ b/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala
@@ -124,12 +124,11 @@ trait FileSystemJsonGetters {
    * delimited JSONS it was pointed at.
    *
    * @param dir The directory we are going to get JSONs from
-   * @param ext The extension of the file we are going to be attempting to grab
    * @return a List with validated JSONs nested inside
    */
-  def getJsonsFromFolderWithNDFiles(dir: String, ext: String = "ndjson"): ValidJsonList = {
+  def getJsonsFromFolderWithNDFiles(dir: String): ValidJsonList = {
     val proccessed = for {
-      filePath <- new java.io.File(dir).listFiles.filter(_.getName.endsWith("." 
+ ext)) + filePath <- new java.io.File(dir).listFiles.filterNot(_.getName.startsWith(".")) } yield { getJsonFromNDFile(filePath.getAbsolutePath) } From 157aa901b10e71eb2f7d5498dcc4ab37646db488 Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Thu, 23 Jul 2015 15:40:56 +0300 Subject: [PATCH 06/12] Correctly handle dir of JSONs (close #91) --- .../schemaguru/cli/DdlCommand.scala | 50 +++++++------- .../schemaguru/cli/SchemaCommand.scala | 8 +-- .../utils/FileSystemJsonGetters.scala | 66 +++++++++++-------- 3 files changed, 68 insertions(+), 56 deletions(-) diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala index f2dab9f..6a90a37 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala @@ -20,12 +20,15 @@ import Scalaz._ // Java import java.io.File +// json4s +import org.json4s.JValue + // Argot import org.clapper.argot._ import ArgotConverters._ // Igluutils -import com.snowplowanalytics.igluutils._ +import com.snowplowanalytics.igluutils.SelfDescInfo import com.snowplowanalytics.igluutils.generators.{ JsonPathGenerator => JPG, SchemaFlattener => SF @@ -33,13 +36,16 @@ import com.snowplowanalytics.igluutils.generators.{ import com.snowplowanalytics.igluutils.generators.redshift.{ RedshiftDdlGenerator => RDG } import com.snowplowanalytics.igluutils.utils.{ FileUtils => FU } +// This library +import utils.FileSystemJsonGetters + /** * Holds all information passed with CLI and decides how to produce * DDL and JSON Paths * * @param args array of arguments passed via CLI */ -class DdlCommand(val args: Array[String]) { +class DdlCommand(val args: Array[String]) extends FileSystemJsonGetters { import DdlCommand._ // Required @@ -66,36 +72,28 @@ class DdlCommand(val args: Array[String]) { val size = sizeOption.value.getOrElse(255) val splitProduct = splitProductFlag.value.getOrElse(false) - // Check how to handle path - if (input.isDirectory) { - fetchAndParseFromDirectory(input) - } else { - fetchAndParseFromFile(input) - } + val schemaList: ValidJsonList = + if (input.isDirectory) { + getJsonsFromFolder(input) + } else { + List(getJsonFromFile(input)) + } - /** - * Get all files from specified ``dir`` and tries to fetch, process and - * output JSON Path and DDL from all found files - * - * @param dir directory with JSON Schemas - */ - private def fetchAndParseFromDirectory(dir: File): Unit = { - val schemas = FU.listSchemas(dir) - schemas.map(fetchAndParseFromFile(_)) + schemaList match { + case Nil => parser.usage(s"Directory ${input.getAbsolutePath} does not contain any JSON files") + case someJsons => someJsons.map(processAndOutput) } /** - * Fetch JSON Schema from specified ``file``, process and output JSON Path - * and DDL + * Process schema and output JSON Path and DDL * * @param file file with JSON Schema */ - private def fetchAndParseFromFile(file: File): Unit = { - processFile(file) match { + private def processAndOutput(file: Validation[String, JValue]): Unit = { + processSchema(file) match { case Success((jsonPathLines, redshiftLines, warningLines, combined)) => output(jsonPathLines, redshiftLines, warningLines, combined) case Failure(str) => { - println(s"Error in [${file.getAbsolutePath}]") println(str) sys.exit(1) } @@ -105,13 +103,13 @@ class DdlCommand(val args: Array[String]) { /** * Core function producing JSON Paths file, DDL, warnings and path * - * @param file JSON Schema 
file + * @param json content of JSON file (JSON Schema) * @return all validated information as tuple */ - def processFile(file: File): Validation[String, (List[String], List[String], List[String], (String, String))] = { + def processSchema(json: Validation[String, JValue]): Validation[String, (List[String], List[String], List[String], (String, String))] = { for { - json <- FU.getJsonFromFile(file) - flatSchema <- SF.flattenJsonSchema(json, splitProduct) + validJson <- json + flatSchema <- SF.flattenJsonSchema(validJson, splitProduct) } yield { val combined = getFileName(flatSchema.self) diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/SchemaCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/SchemaCommand.scala index bfa40fd..0fa0554 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/cli/SchemaCommand.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/SchemaCommand.scala @@ -87,12 +87,12 @@ class SchemaCommand(val args: Array[String]) extends FileSystemJsonGetters { // Decide where and which files should be parsed val jsonList: ValidJsonList = if (input.isDirectory) ndjsonFlag.value match { - case Some(true) => getJsonsFromFolderWithNDFiles(input.getAbsolutePath) - case _ => getJsonsFromFolder(input.getAbsolutePath) + case Some(true) => getJsonsFromFolderWithNDFiles(input) + case _ => getJsonsFromFolder(input) } else ndjsonFlag.value match { - case Some(true) => getJsonFromNDFile(input.getAbsolutePath) - case _ => List(getJsonFromFile(input.getAbsolutePath)) + case Some(true) => getJsonFromNDFile(input) + case _ => List(getJsonFromFile(input)) } jsonList match { diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala b/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala index a5ad4b3..35ed4cc 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala @@ -14,11 +14,13 @@ package com.snowplowanalytics.schemaguru package utils // Scalaz -import scalaz.Scalaz._ import scalaz._ +import Scalaz._ // Scala -import scala.io.{BufferedSource, Source} +import scala.io.{ BufferedSource, Source } + +import java.io.File // Jackson import com.fasterxml.jackson.core.JsonParseException @@ -33,28 +35,40 @@ import org.json4s.jackson.JsonMethods._ */ trait FileSystemJsonGetters { /** - * Returns a validated List of JSONs from the folder it was pointed at. + * Recursively get all files in ``dir`` except hidden + * + * @param dir directory to scan + * @return list of found files + */ + def listAllFiles(dir: File): List[File] = { + def scanSubdir(subDir: File): Array[File] = { + val these = subDir.listFiles.filterNot(_.getName.startsWith(".")) + these ++ these.filter(_.isDirectory).flatMap(scanSubdir) + } + scanSubdir(dir).filter(_.isFile).toList + } + + /** + * Returns a validated List of JSONs from the folder it was pointed at * * @param dir The directory we are going to get JSONs from - * @param ext The extension of the file we are going to be attempting to grab * @return a List with validated JSONs nested inside */ - def getJsonsFromFolder(dir: String, ext: String = "json"): ValidJsonList = { + def getJsonsFromFolder(dir: File): ValidJsonList = { val proccessed = for { - filePath <- new java.io.File(dir).listFiles.filter(_.getName.endsWith("." 
+ ext)) + file <- listAllFiles(dir) } yield { try { - val file = Source.fromFile(filePath) - val content = file.mkString + val content = Source.fromFile(file).mkString parse(content).success } catch { case e: JsonParseException => { val exception = e.getMessage - s"File [$filePath] contents failed to parse into JSON: [$exception]".failure + s"File [${file.getAbsolutePath}}] contents failed to parse into JSON: [$exception]".failure } case e: Exception => { val exception = e.getMessage - s"File [$filePath] fetching and parsing failed: [$exception]".failure + s"File [${file.getAbsolutePath}] fetching and parsing failed: [$exception]".failure } } } @@ -64,21 +78,21 @@ trait FileSystemJsonGetters { /** * Returns a validated JSON from the specified path * - * @param filePath path to JSON + * @param file file object with JSON * @return a validation either be correct JValue or error as String */ - def getJsonFromFile(filePath: String): Validation[String, JValue] = { + def getJsonFromFile(file: File): Validation[String, JValue] = { try { - val content = Source.fromFile(filePath).mkString + val content = Source.fromFile(file).mkString parse(content).success } catch { case e: JsonParseException => { val exception = e.getMessage - s"File [$filePath] contents failed to parse into JSON: [$exception]".failure + s"File [${file.getAbsolutePath}] contents failed to parse into JSON: [$exception]".failure } case e: Exception => { val exception = e.getMessage - s"File [$filePath] fetching and parsing failed: [$exception]".failure + s"File [${file.getAbsolutePath}] fetching and parsing failed: [$exception]".failure } } } @@ -86,20 +100,20 @@ trait FileSystemJsonGetters { /** * Returns a validated List of JSONs from newline-delimited JSON file * - * @param filePath path to NDJSON + * @param file newline-delimited JSON * @return a List with validated JSONs nested inside */ - def getJsonFromNDFile(filePath: String): ValidJsonList = { - val file: Validation[String, BufferedSource] = try { - Source.fromFile(filePath).success + def getJsonFromNDFile(file: File): ValidJsonList = { + val validatedFile: Validation[String, BufferedSource] = try { + Source.fromFile(file).success } catch { case e: Exception => { val exception = e.getMessage - s"File [$filePath] fetching and parsing failed: [$exception]".failure + s"File [${file.getAbsolutePath}] fetching and parsing failed: [$exception]".failure } } - file match { + validatedFile match { case Success(content) => { val lines = content.mkString.split("\n").zipWithIndex val processed = @@ -109,7 +123,7 @@ trait FileSystemJsonGetters { catch { case e: Exception => { val exception = e.getMessage - s"File [$filePath] failed to parse line $line into JSON: [$exception]".failure + s"File [${file.getAbsolutePath}] failed to parse line $line into JSON: [$exception]".failure } } } @@ -126,13 +140,13 @@ trait FileSystemJsonGetters { * @param dir The directory we are going to get JSONs from * @return a List with validated JSONs nested inside */ - def getJsonsFromFolderWithNDFiles(dir: String): ValidJsonList = { + def getJsonsFromFolderWithNDFiles(dir: File): ValidJsonList = { val proccessed = for { - filePath <- new java.io.File(dir).listFiles.filterNot(_.getName.startsWith(".")) + file <- listAllFiles(dir) } yield { - getJsonFromNDFile(filePath.getAbsolutePath) + getJsonFromNDFile(file) } - proccessed.flatten.toList + proccessed.flatten } } From d5b9c276977d5732d73714c79b064ad5c5e1a863 Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Fri, 24 Jul 2015 13:11:14 +0300 Subject: 
[PATCH 07/12] Optional self-desc JSON with --raw (close #92) --- .../scala/com.snowplowanalytics/package.scala | 10 ++++ .../schemaguru/cli/DdlCommand.scala | 56 +++++++++++++------ .../utils/FileSystemJsonGetters.scala | 29 +++++----- 3 files changed, 65 insertions(+), 30 deletions(-) diff --git a/src/main/scala/com.snowplowanalytics/package.scala b/src/main/scala/com.snowplowanalytics/package.scala index 18ea4ea..cfa3a4d 100644 --- a/src/main/scala/com.snowplowanalytics/package.scala +++ b/src/main/scala/com.snowplowanalytics/package.scala @@ -30,4 +30,14 @@ package object schemaguru { * Type Alias for a Valid list of JSONs */ type ValidJsonList = List[Validation[String, JValue]] + + /** + * Class holding JSON with file name + */ + case class JsonFile(fileName: String, content: JValue) + + /** + * Type Alias for a Valid list of JSON files + */ + type ValidJsonFileList = List[Validation[String, JsonFile]] } diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala index 6a90a37..2ff6906 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala @@ -28,13 +28,19 @@ import org.clapper.argot._ import ArgotConverters._ // Igluutils -import com.snowplowanalytics.igluutils.SelfDescInfo +import com.snowplowanalytics.igluutils.{ SelfDescInfo, GenerationResult } import com.snowplowanalytics.igluutils.generators.{ JsonPathGenerator => JPG, SchemaFlattener => SF } -import com.snowplowanalytics.igluutils.generators.redshift.{ RedshiftDdlGenerator => RDG } -import com.snowplowanalytics.igluutils.utils.{ FileUtils => FU } +import com.snowplowanalytics.igluutils.generators.redshift.{ + RedshiftDdlGenerator => RDG +} +import com.snowplowanalytics.igluutils.utils.{ + FileUtils => FU, + StringUtils => SU + +} // This library import utils.FileSystemJsonGetters @@ -72,11 +78,11 @@ class DdlCommand(val args: Array[String]) extends FileSystemJsonGetters { val size = sizeOption.value.getOrElse(255) val splitProduct = splitProductFlag.value.getOrElse(false) - val schemaList: ValidJsonList = + val schemaList: ValidJsonFileList = if (input.isDirectory) { - getJsonsFromFolder(input) + getJsonFilesFromFolder(input) } else { - List(getJsonFromFile(input)) + List(getJsonFileFromFile(input)) } schemaList match { @@ -89,7 +95,7 @@ class DdlCommand(val args: Array[String]) extends FileSystemJsonGetters { * * @param file file with JSON Schema */ - private def processAndOutput(file: Validation[String, JValue]): Unit = { + private def processAndOutput(file: Validation[String, JsonFile]): Unit = { processSchema(file) match { case Success((jsonPathLines, redshiftLines, warningLines, combined)) => output(jsonPathLines, redshiftLines, warningLines, combined) @@ -106,20 +112,36 @@ class DdlCommand(val args: Array[String]) extends FileSystemJsonGetters { * @param json content of JSON file (JSON Schema) * @return all validated information as tuple */ - def processSchema(json: Validation[String, JValue]): Validation[String, (List[String], List[String], List[String], (String, String))] = { - for { + def processSchema(json: Validation[String, JsonFile]): Validation[String, (List[String], List[String], List[String], (String, String))] = { + val processed = for { validJson <- json - flatSchema <- SF.flattenJsonSchema(validJson, splitProduct) + flatSchema <- SF.flattenJsonSchema(validJson.content, splitProduct) } yield { - val combined = 
getFileName(flatSchema.self) - - val ddl = db match { - case "redshift" => RDG.getRedshiftDdl(flatSchema, schemaName, size, rawMode) - case otherDb => parser.usage(s"Error: DDL generation for $otherDb is not supported yet") + val jsonPathLines = JPG.getJsonPathsFile(flatSchema, rawMode) + + db match { + case "redshift" if rawMode => { // process without self describing info + val ddl = RDG.getRawRedshiftDdl(flatSchema, validJson.fileName, schemaName, size) + val fileNameWithoutExtension = + if (validJson.fileName.endsWith(".json")) validJson.fileName.dropRight(5) + else validJson.fileName + val combined = (".", fileNameWithoutExtension) + (jsonPathLines, ddl.content.split("\n").toList, ddl.warnings, combined).success + } + case "redshift" => { // procrss with self describing info + SF.getSelfDescElems(validJson.content).map { self => + val ddl = RDG.getRedshiftDdl(flatSchema, self, schemaName, size) + val combined = getFileName(self) + (jsonPathLines, ddl.content.split("\n").toList, ddl.warnings, combined) + } + } + case otherDb => parser.usage(s"Error: DDL generation for $otherDb is not supported yet") + } } - val jsonPathLines = JPG.getJsonPathsFile(flatSchema) - (jsonPathLines, ddl.content.split("\n").toList, ddl.warnings, combined) + processed match { + case Success(succ) => succ + case Failure(str) => str.fail } } diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala b/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala index 35ed4cc..69443ac 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileSystemJsonGetters.scala @@ -58,19 +58,16 @@ trait FileSystemJsonGetters { val proccessed = for { file <- listAllFiles(dir) } yield { - try { - val content = Source.fromFile(file).mkString - parse(content).success - } catch { - case e: JsonParseException => { - val exception = e.getMessage - s"File [${file.getAbsolutePath}}] contents failed to parse into JSON: [$exception]".failure - } - case e: Exception => { - val exception = e.getMessage - s"File [${file.getAbsolutePath}] fetching and parsing failed: [$exception]".failure - } - } + getJsonFromFile(file) + } + proccessed.toList + } + + def getJsonFilesFromFolder(dir: File): ValidJsonFileList = { + val proccessed = for { + file <- listAllFiles(dir) + } yield { + getJsonFileFromFile(file) } proccessed.toList } @@ -97,6 +94,12 @@ trait FileSystemJsonGetters { } } + def getJsonFileFromFile(file: File): Validation[String, JsonFile] = + getJsonFromFile(file) match { + case Success(json) => JsonFile(file.getName, json).success + case Failure(str) => str.fail + } + /** * Returns a validated List of JSONs from newline-delimited JSON file * From 9307d791c93a13d9f0cb6b81e905f2bf77323204 Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Mon, 27 Jul 2015 14:13:11 +0700 Subject: [PATCH 08/12] Update README to reflect new 0.3.0 (close #93) --- README.md | 121 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 97 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 77e33bb..87d3bfa 100644 --- a/README.md +++ b/README.md @@ -2,53 +2,105 @@ [ ![Build Status] [travis-image] ] [travis] [ ![Release] [release-image] ] [releases] [ ![License] [license-image] ] [license] -Schema Guru is a tool (CLI and web) allowing you to derive **[JSON Schemas] [json-schema]** from a set of JSON instances. 
+Schema Guru is a tool (CLI and web) allowing you to derive **[JSON Schemas] [json-schema]** from a set of JSON instances, and to process and transform them into different data definition formats.
+
+Current primary features include:
+
+- derivation of a JSON Schema from a set of JSON instances (``schema`` command)
+- generation of **[Redshift] [redshift]** table DDL and a JSONPaths file (``ddl`` command)
 
 Unlike other tools for deriving JSON Schemas, Schema Guru allows you to derive schema from an unlimited set of instances (making schemas much more precise), and supports many more JSON Schema validation properties.
 
-Schema Guru is used heavily in association with Snowplow's own **[Snowplow] [snowplow]** and **[Iglu] [iglu]** projects.
+Schema Guru is used heavily in association with Snowplow's own **[Snowplow] [snowplow]**, **[Iglu] [iglu]** and **[Iglu Utils] [iglu-utils]** projects.
 
 ## User Quickstart
 
-### CLI
-
 Download the latest Schema Guru from Bintray:
 
 ```bash
-$ wget http://dl.bintray.com/snowplow/snowplow-generic/schema_guru_0.2.0.zip
-$ unzip schema_guru_0.2.0.zip
+$ wget http://dl.bintray.com/snowplow/snowplow-generic/schema_guru_0.3.0.zip
+$ unzip schema_guru_0.3.0.zip
 ```
 
-Assuming you have a recent JVM installed:
+The commands below assume you have a recent JVM installed.
+
+### CLI
+
+#### Schema derivation
+
+As input you can use either a single JSON file or a directory of JSON instances (directories are processed recursively).
+
+The following command will print the derived JSON Schema to stdout:
 
 ```bash
-$ ./schema-guru-0.2.0 --dir {{jsons_directory}}
+$ ./schema-guru-0.3.0 schema {{input}}
 ```
 
 You can also specify an output file for your schema:
 
 ```bash
-$ ./schema-guru-0.2.0 --dir {{jsons_directory}} --output {{json_schema_file}}
+$ ./schema-guru-0.3.0 schema --output {{json_schema_file}} {{input}}
 ```
 
-Or you can analyze a single JSON instance:
+You can also switch Schema Guru into **[NDJSON] [ndjson]** mode, where it will look for newline-delimited JSONs:
 
 ```bash
-$ ./schema-guru-0.2.0 --file {{json_instance}}
+$ ./schema-guru-0.3.0 schema --ndjson {{input}}
 ```
 
-You can also switch Schema Guru into ndjson mode, where it will look for newline delimited JSONs.
+You can specify the enum cardinality tolerance for your fields: *all* fields found to have less than the specified cardinality will be expressed in the JSON Schema using the `enum` property.
+
+```bash
+$ ./schema-guru-0.3.0 schema --enum 5 {{input}}
+```
+
+#### DDL derivation
+
+As with schema derivation, the DDL input may be either a single file containing a JSON Schema or a directory of JSON Schemas.
+
+Currently we support DDL only for **[Amazon Redshift] [redshift]**, but in future releases you'll be able to select other databases with the ``--db`` option.
+
+The following command will save Redshift (the default ``--db`` value) DDL to the current directory:
+
+```bash
+$ ./schema-guru-0.3.0 ddl {{input}}
+```
 
-In this case all your files need to have `.ndjson` extension (as the **[specifications][ndjson-spec]** says); all `.json` files will be skipped.
+You can also specify a directory for output:
 
 ```bash
-$ ./schema-guru-0.2.0 --ndjson --dir {{ndjsons_directory}}
+$ ./schema-guru-0.3.0 ddl --output {{ddl_dir}} {{input}}
 ```
 
-You can specify the enum cardinality tolerance for for your fields. It means that *all* fields which are found to have less than the specified cardinality will be specified in the JSON Schema using the `enum` property.
+If you're not a Snowplow Platform user, don't use **[Self-describing Schema] [self-describing]**, or just don't want anything specific to it, you can produce a raw schema:
 
 ```bash
-$ ./schema-guru-0.2.0 --enum 5 --dir {{jsons_directory}}
+$ ./schema-guru-0.3.0 ddl --raw {{input}}
+```
+
+You may also want to get a JSONPaths file for Redshift's **[COPY] [redshift-copy]** command. This will place a ``jsonpaths`` dir alongside the ``sql`` one:
+
+```bash
+$ ./schema-guru-0.3.0 ddl --with-json-paths {{input}}
+```
+
+The most awkward part of shifting from the dynamically typed world to the statically typed one is product types (or union types), such as this in JSON Schema: ``["integer", "string"]``.
+How should they be represented in SQL DDL? It's a tough question and we think there's no ideal solution.
+Thus we provide you with two options. By default, product types will be transformed into the most general ``VARCHAR(4096)``.
+But there's another way: you can split a column with product types into separate columns, with its types as a postfix; for example, a property ``model`` with type ``["string", "integer"]`` will be transformed into the two columns ``model_string`` and ``model_integer``.
+This behaviour can be enabled with ``--split-product-types``.
+
+Another thing everyone needs to consider is the default VARCHAR size. If there are no clues about it (like ``maxLength``), 255 will be used.
+You can also specify this default value:
+
+```bash
+$ ./schema-guru-0.3.0 ddl --size 32 {{input}}
+```
+
+You can also specify the Redshift schema for your table. In non-raw mode, ``atomic`` is used as the default.
+
+```bash
+$ ./schema-guru-0.3.0 ddl --raw --schema business {{input}}
 ```
 
 ### Web UI
 
 You can access our hosted demo of the Schema Guru web UI at [schemaguru.snowplowanalytics.com] [webui-hosted]. To run it locally:
 
 ```bash
-$ wget http://dl.bintray.com/snowplow/snowplow-generic/schema_guru_webui_0.2.0.zip
-$ unzip schema_guru_webui_0.2.0.zip
-$ ./schema-guru-webui-0.2.0
+$ wget http://dl.bintray.com/snowplow/snowplow-generic/schema_guru_webui_0.3.0.zip
+$ unzip schema_guru_webui_0.3.0.zip
+$ ./schema-guru-webui-0.3.0
 ```
 
 The above will run a Spray web server containing Schema Guru on [0.0.0.0:8000] [webui-local]. Interface and port can be specified with `--interface` and `--port` respectively.
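For instance, to bind the web UI to localhost on a custom port (the flags come from the sentence above; the address and port values here are illustrative placeholders, not defaults taken from the codebase):

```bash
$ ./schema-guru-webui-0.3.0 --interface 127.0.0.1 --port 9090
```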
@@ -88,6 +140,8 @@ Now just create a new Docker app in the **[Elastic Beanstalk Console] [beanstalk
 
 ### Functionality
 
+#### Schema derivation
+
 * Takes a directory as an argument and will print out the resulting JsonSchema:
   - Processes each JSON sequentially
   - Merges all results into one master Json Schema
@@ -104,20 +158,35 @@ Now just create a new Docker app in the **[Elastic Beanstalk Console] [beanstalk
 * Allows producing JSON Schemas with different names based on a given JSON Path
 * Supports **[Newline Delimited JSON] [ndjson]**
 
+#### DDL derivation
+
+* Correctly transforms some string formats:
+  - uuid becomes ``CHAR(36)``
+  - ipv4 becomes ``VARCHAR(14)``
+  - ipv6 becomes ``VARCHAR(39)``
+  - date-time becomes ``TIMESTAMP``
+* Handles properties with only enums
+* A property with both ``maxLength(n)`` and ``minLength(n)`` becomes ``CHAR(n)``
+* Can output a JSONPaths file
+* Can split product types
+* A number with ``multipleOf`` 0.01 becomes ``DECIMAL``
+* Handles Self-describing JSON and can produce raw DDL
+* Recognizes integer size by ``minimum`` and ``maximum`` values
+
 ### Assumptions
 
 * All JSONs in the directory are assumed to be of the same event type and will be merged together
 * All JSONs are assumed to start with either `{ ... }` or `[ ... ]`
   - If they do not they are discarded
 * Schema should be as strict as possible - e.g. no `additionalProperties` are allowed currently
-* When using Schema Guru to derive schema from newline delimited JSONs they need to have .ndjson extension
 
 ### Self-describing JSON
 
-Schema Guru allows you to produce **[Self-describing JSON Schema] [self-describing]**.
+The ``schema`` command allows you to produce a **[Self-describing JSON Schema] [self-describing]**.
 To produce it you need to specify a vendor, a name (unless segmentation is used, see below), and a version (optional; the default value is 1-0-0).
 
 ```bash
-$ ./schema-guru-0.2.0 --dir {{jsons_directory}} --vendor {{your_company}} --name {{schema_name}} --schemaver {{version}}
+$ ./schema-guru-0.3.0 schema --vendor {{your_company}} --name {{schema_name}} --schemaver {{version}} {{input}}
 ```
 
 ### Schema Segmentation
 
@@ -150,7 +219,7 @@ and
 You can run it as follows:
 
 ```bash
-$ ./schema-guru-0.2.0 --dir {{mixed_jsons_directory}} --output-dir {{output_dir}} --schema-by $.event
+$ ./schema-guru-0.3.0 schema --output {{output_dir}} --schema-by $.event {{mixed_jsons_directory}}
 ```
 
 It will put two (or maybe more) JSON Schemas into the output dir: Purchased_an_Item.json and Posted_a_comment.json.
 
@@ -253,7 +322,7 @@ limitations under the License.
 
 [license-image]: http://img.shields.io/badge/license-Apache--2-blue.svg?style=flat
 [license]: http://www.apache.org/licenses/LICENSE-2.0
 
-[release-image]: http://img.shields.io/badge/release-0.2.0-blue.svg?style=flat
+[release-image]: http://img.shields.io/badge/release-0.3.0-blue.svg?style=flat
 [releases]: https://github.com/snowplow/schema-guru/releases
 
 [json-schema]: http://json-schema.org/
[snowplow]: https://github.com/snowplow/snowplow [iglu]: https://github.com/snowplow/iglu +[iglu-utils]: https://github.com/snowplow/iglu-utils [self-describing]: http://snowplowanalytics.com/blog/2014/05/15/introducing-self-describing-jsons/ +[redshift]: http://aws.amazon.com/redshift/ +[redshift-copy]: http://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html + [vagrant-install]: http://docs.vagrantup.com/v2/installation/index.html [virtualbox-install]: https://www.virtualbox.org/wiki/Downloads From bf22eedc48239abf7fb92b3127f0098333ea28e9 Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Mon, 27 Jul 2015 21:05:14 +0700 Subject: [PATCH 09/12] Fix ordering for JSONPaths file (close #96) --- .../schemaguru/cli/DdlCommand.scala | 131 +++++++++++------- .../schemaguru/utils/FileUtils.scala | 90 ++++++++++++ 2 files changed, 173 insertions(+), 48 deletions(-) create mode 100644 src/main/scala/com.snowplowanalytics/schemaguru/utils/FileUtils.scala diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala index 2ff6906..fa9b96c 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala @@ -20,30 +20,27 @@ import Scalaz._ // Java import java.io.File -// json4s -import org.json4s.JValue - // Argot import org.clapper.argot._ import ArgotConverters._ // Igluutils -import com.snowplowanalytics.igluutils.{ SelfDescInfo, GenerationResult } +import com.snowplowanalytics.igluutils.{ FlatSchema, SelfDescInfo } import com.snowplowanalytics.igluutils.generators.{ - JsonPathGenerator => JPG, SchemaFlattener => SF } import com.snowplowanalytics.igluutils.generators.redshift.{ - RedshiftDdlGenerator => RDG -} -import com.snowplowanalytics.igluutils.utils.{ - FileUtils => FU, - StringUtils => SU - + JsonPathGenerator => JPG, + RedshiftDdlGenerator => RDG, + Ddl } +import com.snowplowanalytics.igluutils.utils.{ StringUtils => SU } // This library -import utils.FileSystemJsonGetters +import utils.{ + FileUtils => FU, + FileSystemJsonGetters +} /** * Holds all information passed with CLI and decides how to produce @@ -97,8 +94,7 @@ class DdlCommand(val args: Array[String]) extends FileSystemJsonGetters { */ private def processAndOutput(file: Validation[String, JsonFile]): Unit = { processSchema(file) match { - case Success((jsonPathLines, redshiftLines, warningLines, combined)) => - output(jsonPathLines, redshiftLines, warningLines, combined) + case Success(ddlOutput) => output(ddlOutput) case Failure(str) => { println(str) sys.exit(1) @@ -106,42 +102,68 @@ class DdlCommand(val args: Array[String]) extends FileSystemJsonGetters { } } + /** + * Produces all data required for DDL file, including it's path, filename, + * header and DDL object + * + * @param flatSchema fields mapped to it's properties + * @param validJson JSON file, containing filename and content + * @param rawMode produce Snowplow-specific info + * @return tuple of values filepath, filename, header and DDL (as object) + */ + private def getRedshiftDdlFile(flatSchema: FlatSchema, validJson: JsonFile, rawMode: Boolean): Validation[String, DdlFile] = { + val schemaCreate = schemaName match { + case Some(s) => Ddl.Schema(s).toDdl + "\n\n" + case None if !rawMode => Ddl.Schema("atomic").toDdl + "\n\n" + case None => "" + } + if (rawMode) { + val fileNameWithoutExtension = + if (validJson.fileName.endsWith(".json")) validJson.fileName.dropRight(5) + else 
validJson.fileName + val table = RDG.getTableDdl(flatSchema, fileNameWithoutExtension, schemaName, size, true) + val header = RDG.getHeader(validJson.fileName) + DdlFile(".", fileNameWithoutExtension, header, schemaCreate, table).success + } else { + SF.getSelfDescElems(validJson.content).map { self => + val tableName = SU.getTableName(self) + val table = RDG.getTableDdl(flatSchema, tableName, schemaName, size, false) + val combined = getFileName(self) + val header = RDG.getHeader(self) + DdlFile(combined._1, combined._2, header, schemaCreate, table) + } + } + } + /** * Core function producing JSON Paths file, DDL, warnings and path * * @param json content of JSON file (JSON Schema) * @return all validated information as tuple */ - def processSchema(json: Validation[String, JsonFile]): Validation[String, (List[String], List[String], List[String], (String, String))] = { - val processed = for { + private def processSchema(json: Validation[String, JsonFile]): Validation[String, DdlOutput] = { + for { validJson <- json flatSchema <- SF.flattenJsonSchema(validJson.content, splitProduct) + ddlFile <- getRedshiftDdlFile(flatSchema, validJson, rawMode) } yield { - val jsonPathLines = JPG.getJsonPathsFile(flatSchema, rawMode) - - db match { - case "redshift" if rawMode => { // process without self describing info - val ddl = RDG.getRawRedshiftDdl(flatSchema, validJson.fileName, schemaName, size) - val fileNameWithoutExtension = - if (validJson.fileName.endsWith(".json")) validJson.fileName.dropRight(5) - else validJson.fileName - val combined = (".", fileNameWithoutExtension) - (jsonPathLines, ddl.content.split("\n").toList, ddl.warnings, combined).success - } - case "redshift" => { // procrss with self describing info - SF.getSelfDescElems(validJson.content).map { self => - val ddl = RDG.getRedshiftDdl(flatSchema, self, schemaName, size) - val combined = getFileName(self) - (jsonPathLines, ddl.content.split("\n").toList, ddl.warnings, combined) - } - } - case otherDb => parser.usage(s"Error: DDL generation for $otherDb is not supported yet") + db match { + case "redshift" => { + val jsonPathsLines = if (withJsonPaths) { + JPG.getJsonPathsFile(ddlFile.table.columns, rawMode) + } else { "" } + + // Snakify columnNames only after JSONPaths was created + // TODO: refactor it + val tableWithSnakedColumns = ddlFile.table.copy(columns = ddlFile.table.columns.map(c => c.copy(columnName = SU.snakify(c.columnName)))) + + DdlOutput(jsonPathsLines, + ddlFile.header ++ ddlFile.schemaCreate ++ tableWithSnakedColumns.toDdl, + ddlFile.table.warnings, + (ddlFile.path, ddlFile.fileName)) } + case otherDb => parser.usage(s"Error: DDL generation for $otherDb is not supported yet") } - - processed match { - case Success(succ) => succ - case Failure(str) => str.fail } } @@ -149,22 +171,20 @@ class DdlCommand(val args: Array[String]) extends FileSystemJsonGetters { * Outputs JSON Path file and DDL file to files in ``destination`` * or prints errors * - * @param jpf list of JSON Paths - * @param rdf Validated list of DDL lines - * @param combined vendor and filename + * @param ddlOutput everything we need to output */ - private def output(jpf: List[String], rdf: List[String], warnings: List[String], combined: (String, String)): Unit = { - val (vendor, file) = combined + private def output(ddlOutput: DdlOutput): Unit = { + val (vendor, file) = ddlOutput.filePath val ddlDir = new File(outputPath, "sql/" + vendor).getAbsolutePath - FU.writeListToFile(file + ".sql", ddlDir, rdf).map(println) + FU.writeToFile(file + ".sql", 
ddlDir, ddlOutput.redshiftDdl).map(println) if (withJsonPaths) { val jsonPathDir = new File(outputPath, "jsonpaths/" + vendor).getAbsolutePath - FU.writeListToFile(file + ".json", jsonPathDir, jpf).map(println) + FU.writeToFile(file + ".json", jsonPathDir, ddlOutput.jsonPaths).map(println) } - if (!warnings.isEmpty) { - for { warning <- warnings } println("WARNING: " + warning) + if (!ddlOutput.warnings.isEmpty) { + for { warning <- ddlOutput.warnings } println("WARNING: " + warning) } } } @@ -179,6 +199,21 @@ object DdlCommand extends GuruCommand { def apply(args: Array[String]) = new DdlCommand(args) + /** + * Class holding all information for file with DDL + */ + private case class DdlFile(path: String, fileName: String, header: String, schemaCreate: String, table: Ddl.Table) + + /** + * Class holding all information to output + * + * @param jsonPaths JSONPaths file content + * @param redshiftDdl Redshift Table DDL content + * @param warnings accumulated list of warnings + * @param filePath tuple of dir and file name + */ + private case class DdlOutput(jsonPaths: String, redshiftDdl: String, warnings: List[String], filePath: (String, String)) + /** * Get the file path and name from self-describing info * Like com.mailchimp/subscribe_1 diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileUtils.scala b/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileUtils.scala new file mode 100644 index 0000000..393268d --- /dev/null +++ b/src/main/scala/com.snowplowanalytics/schemaguru/utils/FileUtils.scala @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2014-2015 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.schemaguru +package utils + +// Java +import java.io.{ PrintWriter, File } + +// Scalaz +import scalaz._ +import Scalaz._ + +/** + * Utilities for printing and reading to/from files + */ +object FileUtils { + /** + * Creates a new file with the contents of the list inside. + * + * @param fileName The name of the new file + * @param fileDir The directory we want the file to live in, w/o trailing slash + * @param content Content of file + * @return a success or failure string about the process + */ + def writeToFile(fileName: String, fileDir: String, content: String): Validation[String, String] = { + val path = fileDir + "/" + fileName + try { + makeDir(fileDir) match { + case true => { + // Attempt to open the file... + val file = new File(path) + + // Print the contents of the list to the new file... 
+          printToFile(file) { _.println(content) }
+
+          // Output a success message
+          s"File [${path}] was written successfully!".success
+        }
+        case false => s"Could not make new directory to store files in - check write permissions".fail
+      }
+    } catch {
+      case e: Exception => {
+        val exception = e.toString
+        s"File [${path}] failed to write: [$exception]".fail
+      }
+    }
+  }
+
+  /**
+   * Prints the given content to a file
+   *
+   * @param f The File we are going to print to
+   */
+  private def printToFile(f: File)(op: PrintWriter => Unit) {
+    val p = new PrintWriter(f)
+    try {
+      op(p)
+    } finally {
+      p.close()
+    }
+  }
+
+  /**
+   * Creates a new directory at the path
+   * specified and returns a boolean indicating
+   * whether it was successful
+   *
+   * @param dir The path that needs to be
+   *            created
+   * @return a boolean of directory creation
+   *         success
+   */
+  def makeDir(dir: String): Boolean = {
+    val file = new File(dir)
+    if (!file.exists()) file.mkdirs() else true
+  }
+}

From 4d51c568d46f76b3c8f8c9e03b6120f488bd290a Mon Sep 17 00:00:00 2001
From: Anton Parkhomenko
Date: Tue, 28 Jul 2015 20:02:05 +0700
Subject: [PATCH 10/12] Swap all occurrences of "igluutils" with "schemaddl" to reflect renaming (close #97)

---
 README.md                                 |  4 ++--
 project/Dependencies.scala                |  4 ++--
 project/SchemaGuruBuild.scala             |  2 +-
 .../schemaguru/cli/DdlCommand.scala       | 10 +++++-----
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 87d3bfa..d1f69b2 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Current primary features include:
 
 Unlike other tools for deriving JSON Schemas, Schema Guru allows you to derive schema from an unlimited set of instances (making schemas much more precise), and supports many more JSON Schema validation properties.
 
-Schema Guru is used heavily in association with Snowplow's own **[Snowplow] [snowplow]**, **[Iglu] [iglu]** and **[Iglu Utils] [iglu-utils]** projects.
+Schema Guru is used heavily in association with Snowplow's own **[Snowplow] [snowplow]**, **[Iglu] [iglu]** and **[Schema DDL] [schema-ddl]** projects.
 
 ## User Quickstart
 
@@ -335,7 +335,7 @@ limitations under the License.
[snowplow]: https://github.com/snowplow/snowplow [iglu]: https://github.com/snowplow/iglu -[iglu-utils]: https://github.com/snowplow/iglu-utils +[schema-ddl]: https://github.com/snowplow/schema-ddl [self-describing]: http://snowplowanalytics.com/blog/2014/05/15/introducing-self-describing-jsons/ [redshift]: http://aws.amazon.com/redshift/ diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 04d44a3..e4d8d1d 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -39,7 +39,7 @@ object Dependencies { val specs2 = "2.3.13" val scalazSpecs2 = "0.2" val scalaCheck = "1.12.2" - val igluutils = "0.1.0-M1" + val schemaddl = "0.1.0-M1" } object Libraries { @@ -56,7 +56,7 @@ object Dependencies { val json4sJackson = "org.json4s" %% "json4s-jackson" % V.json4s val json4sScalaz = "org.json4s" %% "json4s-scalaz" % V.json4s val jsonpath = "io.gatling" %% "jsonpath" % V.jsonpath - val igluutils = "com.snowplowanalytics" %% "iglu-utils" % V.igluutils + val schemaddl = "com.snowplowanalytics" %% "schema-ddl" % V.schemaddl // Spray val akka = "com.typesafe.akka" %% "akka-actor" % V.akka val sprayCan = "io.spray" %% "spray-can" % V.spray diff --git a/project/SchemaGuruBuild.scala b/project/SchemaGuruBuild.scala index 6494508..6b0f65a 100644 --- a/project/SchemaGuruBuild.scala +++ b/project/SchemaGuruBuild.scala @@ -45,7 +45,7 @@ object SchemaGuruBuild extends Build { Libraries.json4sJackson, Libraries.json4sScalaz, Libraries.jsonpath, - Libraries.igluutils, + Libraries.schemaddl, // Scala (test only) Libraries.specs2, Libraries.scalazSpecs2, diff --git a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala index fa9b96c..a80f6fb 100644 --- a/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala +++ b/src/main/scala/com.snowplowanalytics/schemaguru/cli/DdlCommand.scala @@ -24,17 +24,17 @@ import java.io.File import org.clapper.argot._ import ArgotConverters._ -// Igluutils -import com.snowplowanalytics.igluutils.{ FlatSchema, SelfDescInfo } -import com.snowplowanalytics.igluutils.generators.{ +// Schema DDL +import com.snowplowanalytics.schemaddl.{ FlatSchema, SelfDescInfo } +import com.snowplowanalytics.schemaddl.generators.{ SchemaFlattener => SF } -import com.snowplowanalytics.igluutils.generators.redshift.{ +import com.snowplowanalytics.schemaddl.generators.redshift.{ JsonPathGenerator => JPG, RedshiftDdlGenerator => RDG, Ddl } -import com.snowplowanalytics.igluutils.utils.{ StringUtils => SU } +import com.snowplowanalytics.schemaddl.utils.{ StringUtils => SU } // This library import utils.{ From 70d1c391f9fb0f4c89bb033a2d4d23459b1c2830 Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Tue, 28 Jul 2015 21:58:43 +0700 Subject: [PATCH 11/12] Prepare for release --- CHANGELOG | 13 +++++++++++++ project/BuildSettings.scala | 2 +- project/Dependencies.scala | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 796e55f..900d5a3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,16 @@ +0.3.0 (2015-07-28) +------------------ +Swap all occurrences of "igluutils" with "schemaddl" to reflect renaming (#97) +Fix ordering for JSONPaths file (#96) +Update README to reflect new 0.3.0 (#93) +Optional self-desc JSON with --raw (#92) +Correctly handle dir of JSONs (#91) +Don't check for .ndjson extension when --ndjson set (#74) +Change default SchemaVer to 1-0-0 (#80) +Unify CLI options (#90) +Add `ddl` command which generates JSON 
Paths files and Redshift DDL (#84) +Move existing functionality into `derive` command (#83) + 0.2.0 (2015-07-01) ------------------ Updated vagrant push to also build and publish webui artifact (#72) diff --git a/project/BuildSettings.scala b/project/BuildSettings.scala index 6d556b1..7df0213 100644 --- a/project/BuildSettings.scala +++ b/project/BuildSettings.scala @@ -20,7 +20,7 @@ object BuildSettings { // Common settings for all our projects lazy val commonSettings = Seq[Setting[_]]( organization := "com.snowplowanalytics", - version := "0.3.0-M1", + version := "0.3.0", scalaVersion := "2.10.5", crossScalaVersions := Seq("2.10.5", "2.11.6"), scalacOptions := Seq("-deprecation", "-encoding", "utf8", diff --git a/project/Dependencies.scala b/project/Dependencies.scala index e4d8d1d..21c4883 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -39,7 +39,7 @@ object Dependencies { val specs2 = "2.3.13" val scalazSpecs2 = "0.2" val scalaCheck = "1.12.2" - val schemaddl = "0.1.0-M1" + val schemaddl = "0.1.0" } object Libraries { From bd4efd8766f4fb1c842321d897a7b3c3934c6af4 Mon Sep 17 00:00:00 2001 From: Alex Dean Date: Wed, 29 Jul 2015 19:10:54 +0100 Subject: [PATCH 12/12] Finalized CHANGELOG for release --- CHANGELOG | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 900d5a3..4213584 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,15 +1,15 @@ -0.3.0 (2015-07-28) +0.3.0 (2015-07-29) ------------------ -Swap all occurrences of "igluutils" with "schemaddl" to reflect renaming (#97) -Fix ordering for JSONPaths file (#96) -Update README to reflect new 0.3.0 (#93) +Swapped all occurrences of "igluutils" with "schemaddl" to reflect renaming (#97) +Fixed ordering for JSONPaths file (#96) +Updated README to reflect new 0.3.0 (#93) Optional self-desc JSON with --raw (#92) -Correctly handle dir of JSONs (#91) -Don't check for .ndjson extension when --ndjson set (#74) -Change default SchemaVer to 1-0-0 (#80) -Unify CLI options (#90) -Add `ddl` command which generates JSON Paths files and Redshift DDL (#84) -Move existing functionality into `derive` command (#83) +Now correctly handling dir of JSONs (#91) +No longer checking for .ndjson extension when --ndjson set (#74) +Changed default SchemaVer to 1-0-0 (#80) +Unified CLI options (#90) +Added `ddl` command which generates JSON Paths files and Redshift DDL (#84) +Moved existing functionality into `derive` command (#83) 0.2.0 (2015-07-01) ------------------
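Putting the whole 0.3.0 release together, a typical session chains the two subcommands described in the README above: first derive a schema, then generate Redshift DDL and a JSONPaths file from it. All file and directory names below are illustrative placeholders:

```bash
# Derive a JSON Schema from a directory of instances
$ ./schema-guru-0.3.0 schema --output derived-schema.json {{instances_dir}}

# Generate Redshift DDL plus a JSONPaths file from the derived schema
$ ./schema-guru-0.3.0 ddl --with-json-paths derived-schema.json
```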