diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/Metadata.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/Metadata.scala index bf5250e2c7341..0b91cd01f7711 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/Metadata.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/Metadata.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.sql.catalyst.util import scala.collection.mutable @@ -5,61 +22,99 @@ import scala.collection.mutable import org.json4s._ import org.json4s.jackson.JsonMethods._ -import scala.reflect.ClassTag - -sealed class Metadata private[util] (val map: Map[String, Any]) extends Serializable { - - def getInt(key: String): Int = get(key) - +/** + * Metadata is a wrapper over Map[String, Any] that limits the value type to simple ones: Boolean, + * Long, Double, String, Metadata, Array[Boolean], Array[Long], Array[Double], Array[String], and + * Array[Metadata]. JSON is used for serialization. + * + * The default constructor is private. Users should use either [[MetadataBuilder]] or + * [[Metadata$#fromJson]] to create Metadata instances. 
+ * + * @param map an immutable map that stores the data + */ +sealed class Metadata private[util] (private[util] val map: Map[String, Any]) extends Serializable { + + /** Gets a Long. */ + def getLong(key: String): Long = get(key) + + /** Gets a Double. */ def getDouble(key: String): Double = get(key) + /** Gets a Boolean. */ def getBoolean(key: String): Boolean = get(key) + /** Gets a String. */ def getString(key: String): String = get(key) + /** Gets a Metadata. */ def getMetadata(key: String): Metadata = get(key) - def getIntArray(key: String): Array[Int] = getArray(key) + /** Gets a Long array. */ + def getLongArray(key: String): Array[Long] = get(key) - def getDoubleArray(key: String): Array[Double] = getArray(key) + /** Gets a Double array. */ + def getDoubleArray(key: String): Array[Double] = get(key) - def getBooleanArray(key: String): Array[Boolean] = getArray(key) + /** Gets a Boolean array. */ + def getBooleanArray(key: String): Array[Boolean] = get(key) - def getStringArray(key: String): Array[String] = getArray(key) + /** Gets a String array. */ + def getStringArray(key: String): Array[String] = get(key) - def getMetadataArray(key: String): Array[Metadata] = getArray(key) + /** Gets a Metadata array. */ + def getMetadataArray(key: String): Array[Metadata] = get(key) + /** Converts to its JSON representation. 
*/ def toJson: String = { compact(render(Metadata.toJValue(this))) } - private def get[T](key: String): T = { - map(key).asInstanceOf[T] - } + override def toString: String = toJson - private def getArray[T: ClassTag](key: String): Array[T] = { - map(key).asInstanceOf[Seq[T]].toArray + override def equals(obj: Any): Boolean = { + obj match { + case that: Metadata => + if (map.keySet == that.map.keySet) { + map.keys.forall { k => + (map(k), that.map(k)) match { + case (v0: Array[_], v1: Array[_]) => + v0.view == v1.view + case (v0, v1) => + v0 == v1 + } + } + } else { + false + } + case other => + false + } } - override def toString: String = toJson + override def hashCode: Int = Metadata.hash(this) + + private def get[T](key: String): T = { + map(key).asInstanceOf[T] + } } object Metadata { + /** Returns an empty Metadata. */ def empty: Metadata = new Metadata(Map.empty) + /** Creates a Metadata instance from JSON. */ def fromJson(json: String): Metadata = { val map = parse(json).values.asInstanceOf[Map[String, Any]] fromMap(map.toMap) } + /** Creates a Metadata instance from Map[String, Any]. */ private def fromMap(map: Map[String, Any]): Metadata = { val builder = new MetadataBuilder map.foreach { - case (key, value: Int) => - builder.putInt(key, value) case (key, value: BigInt) => - builder.putInt(key, value.toInt) + builder.putLong(key, value.toLong) case (key, value: Double) => builder.putDouble(key, value) case (key, value: Boolean) => @@ -70,22 +125,21 @@ object Metadata { builder.putMetadata(key, fromMap(value.asInstanceOf[Map[String, Any]])) case (key, value: Seq[_]) => if (value.isEmpty) { - builder.putIntArray(key, Seq.empty) + // If it is an empty array, we cannot infer its element type. We put an empty Array[Long]. 
+ builder.putLongArray(key, Array.empty) } else { value.head match { - case _: Int => - builder.putIntArray(key, value.asInstanceOf[Seq[Int]].toSeq) case _: BigInt => - builder.putIntArray(key, value.asInstanceOf[Seq[BigInt]].map(_.toInt).toSeq) + builder.putLongArray(key, value.asInstanceOf[Seq[BigInt]].map(_.toLong).toArray) case _: Double => - builder.putDoubleArray(key, value.asInstanceOf[Seq[Double]].toSeq) + builder.putDoubleArray(key, value.asInstanceOf[Seq[Double]].toArray) case _: Boolean => - builder.putBooleanArray(key, value.asInstanceOf[Seq[Boolean]].toSeq) + builder.putBooleanArray(key, value.asInstanceOf[Seq[Boolean]].toArray) case _: String => - builder.putStringArray(key, value.asInstanceOf[Seq[String]].toSeq) - case _: Map[String, Any] => + builder.putStringArray(key, value.asInstanceOf[Seq[String]].toSeq.toArray) + case _: Map[_, _] => builder.putMetadataArray( - key, value.asInstanceOf[Seq[Map[String, Any]]].map(fromMap).toSeq) + key, value.asInstanceOf[Seq[Map[String, Any]]].map(fromMap).toArray) case other => throw new RuntimeException(s"Do not support array of type ${other.getClass}.") } @@ -96,15 +150,16 @@ object Metadata { builder.build() } + /** Converts to JSON AST. */ private def toJValue(obj: Any): JValue = { obj match { case map: Map[_, _] => - val fields = map.toList.map { case (k: String, v) => (k, toJValue(v)) } + val fields = map.toList.map { case (k: String, v) => (k, toJValue(v))} JObject(fields) - case arr: Seq[_] => + case arr: Array[_] => val values = arr.toList.map(toJValue) JArray(values) - case x: Int => + case x: Long => JInt(x) case x: Double => JDouble(x) @@ -118,37 +173,75 @@ object Metadata { throw new RuntimeException(s"Do not support type ${other.getClass}.") } } + + /** Computes the hash code for the types we support. */ + private def hash(obj: Any): Int = { + obj match { + case map: Map[_, _] => + map.mapValues(hash).## + case arr: Array[_] => + // Seq.empty[T] has the same hashCode regardless of T. 
+ arr.toSeq.map(hash).## + case x: Long => + x.## + case x: Double => + x.## + case x: Boolean => + x.## + case x: String => + x.## + case x: Metadata => + hash(x.map) + case other => + throw new RuntimeException(s"Do not support type ${other.getClass}.") + } + } } +/** + * Builder for [[Metadata]]. If there is a key collision, the latter will overwrite the former. + */ class MetadataBuilder { private val map: mutable.Map[String, Any] = mutable.Map.empty + /** Includes the content of an existing [[Metadata]] instance. */ def withMetadata(metadata: Metadata): this.type = { map ++= metadata.map this } - def putInt(key: String, value: Int): this.type = put(key, value) + /** Puts a Long. */ + def putLong(key: String, value: Long): this.type = put(key, value) + /** Puts a Double. */ def putDouble(key: String, value: Double): this.type = put(key, value) + /** Puts a Boolean. */ def putBoolean(key: String, value: Boolean): this.type = put(key, value) + /** Puts a String. */ def putString(key: String, value: String): this.type = put(key, value) + /** Puts a [[Metadata]]. */ def putMetadata(key: String, value: Metadata): this.type = put(key, value) - def putIntArray(key: String, value: Seq[Int]): this.type = put(key, value) + /** Puts a Long array. */ + def putLongArray(key: String, value: Array[Long]): this.type = put(key, value) - def putDoubleArray(key: String, value: Seq[Double]): this.type = put(key, value) + /** Puts a Double array. */ + def putDoubleArray(key: String, value: Array[Double]): this.type = put(key, value) - def putBooleanArray(key: String, value: Seq[Boolean]): this.type = put(key, value) + /** Puts a Boolean array. */ + def putBooleanArray(key: String, value: Array[Boolean]): this.type = put(key, value) - def putStringArray(key: String, value: Seq[String]): this.type = put(key, value) + /** Puts a String array. 
*/ + def putStringArray(key: String, value: Array[String]): this.type = put(key, value) - def putMetadataArray(key: String, value: Seq[Metadata]): this.type = put(key, value) + /** Puts a [[Metadata]] array. */ + def putMetadataArray(key: String, value: Array[Metadata]): this.type = put(key, value) + /** Builds the [[Metadata]] instance. */ def build(): Metadata = { new Metadata(map.toMap) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala index 1df7e22da212c..55fb0f8ec2785 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/MetadataSuite.scala @@ -1,49 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.sql.catalyst.util -import org.json4s.jackson.JsonMethods._ +import org.json4s.jackson.JsonMethods.parse import org.scalatest.FunSuite class MetadataSuite extends FunSuite { val baseMetadata = new MetadataBuilder() - .putString("purpose", "ml") - .build() + .putString("purpose", "ml") + .putBoolean("isBase", true) + .build() val summary = new MetadataBuilder() - .putInt("numFeatures", 10) - .build() + .putLong("numFeatures", 10L) + .build() val age = new MetadataBuilder() - .putString("name", "age") - .putInt("index", 1) - .putBoolean("categorical", false) - .putDouble("average", 45.0) - .build() + .putString("name", "age") + .putLong("index", 1L) + .putBoolean("categorical", false) + .putDouble("average", 45.0) + .build() val gender = new MetadataBuilder() - .putString("name", "gender") - .putInt("index", 5) - .putBoolean("categorical", true) - .putStringArray("categories", Seq("male", "female")) - .build() + .putString("name", "gender") + .putLong("index", 5) + .putBoolean("categorical", true) + .putStringArray("categories", Array("male", "female")) + .build() val metadata = new MetadataBuilder() - .withMetadata(baseMetadata) - .putMetadata("summary", summary) - .putIntArray("int[]", Seq(0, 1)) - .putDoubleArray("double[]", Seq(3.0, 4.0)) - .putBooleanArray("boolean[]", Seq(true, false)) - .putMetadataArray("features", Seq(age, gender)) - .build() + .withMetadata(baseMetadata) + .putBoolean("isBase", false) // overwrite an existing key + .putMetadata("summary", summary) + .putLongArray("long[]", Array(0L, 1L)) + .putDoubleArray("double[]", Array(3.0, 4.0)) + .putBooleanArray("boolean[]", Array(true, false)) + .putMetadataArray("features", Array(age, gender)) + .build() test("metadata builder and getters") { - assert(age.getInt("index") === 1) + assert(age.getLong("index") === 1L) assert(age.getDouble("average") === 45.0) assert(age.getBoolean("categorical") === false) assert(age.getString("name") === "age") 
assert(metadata.getString("purpose") === "ml") + assert(metadata.getBoolean("isBase") === false) assert(metadata.getMetadata("summary") === summary) - assert(metadata.getIntArray("int[]").toSeq === Seq(0, 1)) + assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L)) assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0)) assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false)) assert(gender.getStringArray("categories").toSeq === Seq("male", "female")) @@ -55,6 +75,8 @@ class MetadataSuite extends FunSuite { withClue("toJson must produce a valid JSON string") { parse(json) } - assert(Metadata.fromJson(json) === metadata) + val parsed = Metadata.fromJson(json) + assert(parsed === metadata) + assert(parsed.## === metadata.##) } }