From db890ea8f7899022d9c02dcc4a32c2fce9f13c47 Mon Sep 17 00:00:00 2001
From: mcheah
Date: Thu, 12 Mar 2015 19:40:33 -0700
Subject: [PATCH] Removing CastedArray and just using ScalaRunTime.

---
 .../org/apache/spark/util/CastedArray.scala   | 147 ------------------
 .../apache/spark/util/PrimitiveSizes.scala    |  32 ----
 .../org/apache/spark/util/SizeEstimator.scala |  55 ++++---
 3 files changed, 32 insertions(+), 202 deletions(-)
 delete mode 100644 core/src/main/scala/org/apache/spark/util/CastedArray.scala
 delete mode 100644 core/src/main/scala/org/apache/spark/util/PrimitiveSizes.scala

diff --git a/core/src/main/scala/org/apache/spark/util/CastedArray.scala b/core/src/main/scala/org/apache/spark/util/CastedArray.scala
deleted file mode 100644
index a1378dcd88839..0000000000000
--- a/core/src/main/scala/org/apache/spark/util/CastedArray.scala
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements. See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package org.apache.spark.util
-
-/**
- * Provides a wrapper around an object that is known to be an array, but the specific
- * type for the array is unknown.
- *
- * Normally, in situations when such an array is to be accessed reflectively, one would use
- * {@link java.lang.reflect.Array} using getLength() and get() methods. However, it turns
- * out that such methods are ill-performant.
- *
- * It turns out it is better to just use instanceOf and lots of casting over calling through
- * to the native C implementation. There is some discussion and a sample code snippet in
- * an open JDK ticket. In this
- * class, that approach is implemented in an alternative way: creating a wrapper object to
- * wrap the array allows the cast to be done once, so the overhead of casting multiple times
- * is also avoided. It turns out we invoke the get() method to get the value of the array
- * numerous times, so doing the cast just once is worth the cost of constructing the wrapper
- * object for larger arrays.
- *
- * In general, these classes were designed to avoid the need to cast as much as possible. As
- * soon as the type of the array is known, it is casted immediately once and all of its metadata
- * (primitive type size, length, and whether or not it is a primitive array) is available
- * immediately without any further reflection or introspecting on class objects.
- */
-sealed trait CastedArray extends Any {
-  def get(i: Int): AnyRef
-  def getLength(): Int
-  def isPrimitiveArray(): Boolean
-  def getElementSize(): Int
-}
-
-object CastedArray {
-  // Sizes of primitive types
-
-  def castAndWrap(obj: AnyRef): CastedArray = {
-    obj match {
-      case arr: Array[Boolean] => new BooleanCastedArray(arr)
-      case arr: Array[Byte] => new ByteCastedArray(arr)
-      case arr: Array[Char] => new CharCastedArray(arr)
-      case arr: Array[Double] => new DoubleCastedArray(arr)
-      case arr: Array[Float] => new FloatCastedArray(arr)
-      case arr: Array[Int] => new IntCastedArray(arr)
-      case arr: Array[Long] => new LongCastedArray(arr)
-      case arr: Array[Object] => new ObjectCastedArray(arr)
-      case arr: Array[Short] => new ShortCastedArray(arr)
-      case default => throw createBadArrayException(obj)
-    }
-  }
-
-  // Boxing is not ideal, but we want to return AnyRef here. An alternative implementation
-  // that used Java wouldn't force explicitly boxing... but returning Object there would
-  // make the boxing happen implicitly anyways. In practice this tends to be okay
-  // in terms of performance.
-  private class BooleanCastedArray(val arr: Array[Boolean]) extends AnyVal with CastedArray {
-    override def get(i: Int): AnyRef = Boolean.box(arr(i))
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = true
-    override def getElementSize(): Int = PrimitiveSizes.BOOLEAN_SIZE
-  }
-
-  private class ByteCastedArray(val arr: Array[Byte]) extends AnyVal with CastedArray {
-    override def get(i: Int): AnyRef = Byte.box(arr(i))
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = true
-    override def getElementSize(): Int = PrimitiveSizes.BYTE_SIZE
-  }
-
-  private class CharCastedArray(val arr: Array[Char]) extends AnyVal with CastedArray {
-    override def get(i: Int): AnyRef = Char.box(arr(i))
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = true
-    override def getElementSize(): Int = PrimitiveSizes.CHAR_SIZE
-  }
-
-  private class DoubleCastedArray(val arr: Array[Double]) extends AnyVal with CastedArray {
-    override def get(i: Int): AnyRef = Double.box(arr(i))
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = true
-    override def getElementSize(): Int = PrimitiveSizes.DOUBLE_SIZE
-  }
-
-  private class FloatCastedArray(val arr: Array[Float]) extends AnyVal with CastedArray {
-    override def get(i: Int): AnyRef = Float.box(arr(i))
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = true
-    override def getElementSize(): Int = PrimitiveSizes.FLOAT_SIZE
-  }
-
-  private class IntCastedArray(val arr: Array[Int]) extends AnyVal with CastedArray {
-    override def get(i: Int): AnyRef = Int.box(arr(i))
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = true
-    override def getElementSize(): Int = PrimitiveSizes.INT_SIZE
-  }
-
-  private class LongCastedArray(val arr: Array[Long]) extends AnyVal with CastedArray {
-    override def get(i: Int): AnyRef = Long.box(arr(i))
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = true
-    override def getElementSize(): Int = PrimitiveSizes.LONG_SIZE
-  }
-
-  private class ObjectCastedArray(val arr: Array[Object]) extends AnyVal with CastedArray {
-    override def get(i: Int): Object = arr(i)
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = false
-    override def getElementSize(): Int = {
-      throw new UnsupportedOperationException("Cannot introspect " +
-        " the size of an element in an object array.")
-    }
-  }
-
-  private class ShortCastedArray(val arr: Array[Short]) extends AnyVal with CastedArray {
-    override def get(i: Int): AnyRef = Short.box(arr(i))
-    override def getLength(): Int = arr.length
-    override def isPrimitiveArray(): Boolean = true
-    override def getElementSize(): Int = PrimitiveSizes.SHORT_SIZE
-  }
-
-  private def createBadArrayException(badArray : Object): RuntimeException = {
-    if (badArray == null) {
-      return new NullPointerException("Array argument is null");
-    } else if (!badArray.getClass().isArray()) {
-      return new IllegalArgumentException("Argument is not an array");
-    } else {
-      return new IllegalArgumentException("Array is of incompatible type");
-    }
-  }
-}
-
diff --git a/core/src/main/scala/org/apache/spark/util/PrimitiveSizes.scala b/core/src/main/scala/org/apache/spark/util/PrimitiveSizes.scala
deleted file mode 100644
index 7d335af07c090..0000000000000
--- a/core/src/main/scala/org/apache/spark/util/PrimitiveSizes.scala
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.util
-
-/**
- * Constants for the sizes of primitive types in bytes.
- */
-object PrimitiveSizes {
-  val BYTE_SIZE = 1
-  val BOOLEAN_SIZE = 1
-  val CHAR_SIZE = 2
-  val SHORT_SIZE = 2
-  val INT_SIZE = 4
-  val LONG_SIZE = 8
-  val FLOAT_SIZE = 4
-  val DOUBLE_SIZE = 8
-}
diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
index d2726da01dd17..f00a8fb024ea9 100644
--- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
+++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
@@ -18,17 +18,16 @@
 package org.apache.spark.util
 
 import java.lang.management.ManagementFactory
-import java.lang.reflect.Field
-import java.lang.reflect.Modifier
-import java.util.IdentityHashMap
-import java.util.Random
+import java.lang.reflect.{Field, Modifier}
+import java.util.{IdentityHashMap, Random}
 import java.util.concurrent.ConcurrentHashMap
 
-import scala.collection.mutable.ArrayBuffer
-
 import org.apache.spark.Logging
 import org.apache.spark.util.collection.OpenHashSet
 
+import scala.collection.mutable.ArrayBuffer
+import scala.runtime.ScalaRunTime
+
 /**
  * Estimates the sizes of Java objects (number of bytes of memory they occupy), for use in
  * memory-aware caches.
@@ -38,6 +37,16 @@ import org.apache.spark.util.collection.OpenHashSet
  */
 private[spark] object SizeEstimator extends Logging {
 
+  // Sizes of primitive types
+  private val BYTE_SIZE = 1
+  private val BOOLEAN_SIZE = 1
+  private val CHAR_SIZE = 2
+  private val SHORT_SIZE = 2
+  private val INT_SIZE = 4
+  private val LONG_SIZE = 8
+  private val FLOAT_SIZE = 4
+  private val DOUBLE_SIZE = 8
+
   // Alignment boundary for objects
   // TODO: Is this arch dependent ?
   private val ALIGN_SIZE = 8
@@ -155,7 +164,7 @@ private[spark] object SizeEstimator extends Logging {
   private def visitSingleObject(obj: AnyRef, state: SearchState) {
     val cls = obj.getClass
     if (cls.isArray) {
-      visitArray(obj, state)
+      visitArray(obj, cls, state)
     } else if (obj.isInstanceOf[ClassLoader] || obj.isInstanceOf[Class[_]]) {
       // Hadoop JobConfs created in the interpreter have a ClassLoader, which greatly confuses
       // the size estimator since it references the whole REPL. Do nothing in this case. In
@@ -173,15 +182,15 @@
   private val ARRAY_SIZE_FOR_SAMPLING = 200
   private val ARRAY_SAMPLE_SIZE = 100 // should be lower than ARRAY_SIZE_FOR_SAMPLING
 
-  private def visitArray(array: AnyRef, state: SearchState) {
-    val castedArray: CastedArray = CastedArray.castAndWrap(array)
-    val length = castedArray.getLength
+  private def visitArray(array: AnyRef, arrayClass: Class[_], state: SearchState) {
+    val length = ScalaRunTime.array_length(array)
+    val elementClass = arrayClass.getComponentType()
 
     // Arrays have object header and length field which is an integer
-    var arrSize: Long = alignSize(objectSize + PrimitiveSizes.INT_SIZE)
+    var arrSize: Long = alignSize(objectSize + INT_SIZE)
 
-    if (castedArray.isPrimitiveArray()) {
-      arrSize += alignSize(length * castedArray.getElementSize())
+    if (elementClass.isPrimitive()) {
+      arrSize += alignSize(length * primitiveSize(elementClass))
       state.size += arrSize
     } else {
       arrSize += alignSize(length * pointerSize)
@@ -189,7 +198,7 @@
 
     if (length <= ARRAY_SIZE_FOR_SAMPLING) {
       for (i <- 0 until length) {
-        state.enqueue(castedArray.get(i))
+        state.enqueue(ScalaRunTime.array_apply(array, i).asInstanceOf[AnyRef])
      }
     } else {
       // Estimate the size of a large array by sampling elements without replacement.
@@ -202,7 +211,7 @@
         index = rand.nextInt(length)
       } while (drawn.contains(index))
       drawn.add(index)
-      val elem = castedArray.get(index)
+      val elem = ScalaRunTime.array_apply(array, index).asInstanceOf[AnyRef]
       size += SizeEstimator.estimate(elem, state.visited)
     }
     state.size += ((length / (ARRAY_SAMPLE_SIZE * 1.0)) * size).toLong
@@ -212,21 +221,21 @@
 
   private def primitiveSize(cls: Class[_]): Long = {
     if (cls == classOf[Byte]) {
-      PrimitiveSizes.BYTE_SIZE
+      BYTE_SIZE
     } else if (cls == classOf[Boolean]) {
-      PrimitiveSizes.BOOLEAN_SIZE
+      BOOLEAN_SIZE
     } else if (cls == classOf[Char]) {
-      PrimitiveSizes.CHAR_SIZE
+      CHAR_SIZE
     } else if (cls == classOf[Short]) {
-      PrimitiveSizes.SHORT_SIZE
+      SHORT_SIZE
     } else if (cls == classOf[Int]) {
-      PrimitiveSizes.INT_SIZE
+      INT_SIZE
    } else if (cls == classOf[Long]) {
-      PrimitiveSizes.LONG_SIZE
+      LONG_SIZE
     } else if (cls == classOf[Float]) {
-      PrimitiveSizes.FLOAT_SIZE
+      FLOAT_SIZE
     } else if (cls == classOf[Double]) {
-      PrimitiveSizes.DOUBLE_SIZE
+      DOUBLE_SIZE
     } else {
       throw new IllegalArgumentException(
         "Non-primitive class " + cls + " passed to primitiveSize()")
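
Note on the replacement technique: ScalaRunTime.array_length and ScalaRunTime.array_apply can read the length and elements of an array whose element type is unknown at the call site, and Class#getComponentType plus Class#isPrimitive recover the metadata that CastedArray used to carry. Below is a minimal standalone sketch of that approach, not code from this patch: the name ArraySizeSketch, the shallowSize method, and the header/pointer constants are illustrative assumptions (real header and reference sizes vary by JVM, which is why SizeEstimator probes the running JVM instead of hard-coding them).

    import scala.runtime.ScalaRunTime

    object ArraySizeSketch {
      // Assumed layout constants for illustration only.
      private val OBJECT_HEADER_SIZE = 16L // assumed 64-bit JVM object header
      private val POINTER_SIZE = 8L        // assumed uncompressed reference size
      private val INT_SIZE = 4L            // the array's length field

      // Same shape as the patched primitiveSize: classOf[Byte] etc. denote the
      // primitive classes, so == comparison identifies the element type.
      private def primitiveSize(cls: Class[_]): Long =
        if (cls == classOf[Byte] || cls == classOf[Boolean]) 1L
        else if (cls == classOf[Char] || cls == classOf[Short]) 2L
        else if (cls == classOf[Int] || cls == classOf[Float]) 4L
        else if (cls == classOf[Long] || cls == classOf[Double]) 8L
        else throw new IllegalArgumentException("Non-primitive class " + cls)

      // Shallow size of an array of statically unknown type, mirroring the
      // two branches of the new visitArray.
      def shallowSize(array: AnyRef): Long = {
        val length = ScalaRunTime.array_length(array) // reflective length read
        val elementClass = array.getClass.getComponentType
        val headerAndLength = OBJECT_HEADER_SIZE + INT_SIZE
        if (elementClass.isPrimitive) {
          headerAndLength + length.toLong * primitiveSize(elementClass)
        } else {
          // Object arrays store references; referents are walked separately
          // by reading each element, which boxes primitives but needs no
          // wrapper: ScalaRunTime.array_apply(array, i).asInstanceOf[AnyRef]
          headerAndLength + length.toLong * POINTER_SIZE
        }
      }
    }

Under these assumed constants, shallowSize(Array[Long](1, 2, 3)) is 16 + 4 + 3 * 8 = 44 bytes before alignment, while the same call on an Array[String] counts only the three references. The boxing performed by array_apply is the trade-off the deleted CastedArray scaladoc discusses; the patch accepts it in exchange for dropping the wrapper hierarchy and the extra allocation per visited array.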