Made some in-memory columnar storage interfaces row-based
liancheng committed Sep 10, 2014
1 parent 25b5b86 commit b70d519
Showing 8 changed files with 73 additions and 56 deletions.
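The theme of the diff below: interfaces that previously took an already-extracted JVM value now take a `Row` plus a column ordinal, so each `ColumnType` reads the field itself and primitive columns can avoid boxing. A minimal standalone Scala sketch of the before/after shape follows; the `Row` and column-type stand-ins are simplified assumptions for illustration, not the actual Spark SQL classes.

// Simplified stand-in for org.apache.spark.sql.Row -- illustration only.
trait Row {
  def getString(ordinal: Int): String
}

// Before: the caller extracted the value first, boxing primitives along the way.
abstract class ValueBasedColumnType[JvmType](val defaultSize: Int) {
  def actualSize(v: JvmType): Int = defaultSize
}

// After: the column type reads row(ordinal) itself, given the row and the ordinal.
abstract class RowBasedColumnType[JvmType](val defaultSize: Int) {
  def actualSize(row: Row, ordinal: Int): Int = defaultSize
}

// Mirrors the STRING column type in the diff: 4-byte length prefix plus UTF-8 payload.
object StringColumnSketch extends RowBasedColumnType[String](defaultSize = 8) {
  override def actualSize(row: Row, ordinal: Int): Int =
    row.getString(ordinal).getBytes("utf-8").length + 4
}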
@@ -69,9 +69,8 @@ private[sql] class BasicColumnBuilder[T <: DataType, JvmType](
}

override def appendFrom(row: Row, ordinal: Int) {
val field = columnType.getField(row, ordinal)
buffer = ensureFreeSpace(buffer, columnType.actualSize(field))
columnType.append(field, buffer)
buffer = ensureFreeSpace(buffer, columnType.actualSize(row, ordinal))
columnType.append(columnType.getField(row, ordinal), buffer)
}

override def build() = {
@@ -18,13 +18,12 @@
package org.apache.spark.sql.columnar

import java.nio.ByteBuffer
import java.sql.Timestamp

import scala.reflect.runtime.universe.TypeTag

import java.sql.Timestamp

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.MutableRow
import org.apache.spark.sql.catalyst.expressions.{MutableAny, MutableRow, MutableValue}
import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.execution.SparkSqlSerializer

@@ -41,6 +40,8 @@ private[sql] sealed abstract class ColumnType[T <: DataType, JvmType](
val typeId: Int,
val defaultSize: Int) {

val mutable: MutableValue = new MutableAny

/**
* Extracts a value out of the buffer at the buffer's current position.
*/
@@ -52,10 +53,10 @@ private[sql] sealed abstract class ColumnType[T <: DataType, JvmType](
def append(v: JvmType, buffer: ByteBuffer)

/**
* Returns the size of the value. This is used to calculate the size of variable length types
* such as byte arrays and strings.
* Returns the size of the value `row(ordinal)`. This is used to calculate the size of variable
* length types such as byte arrays and strings.
*/
def actualSize(v: JvmType): Int = defaultSize
def actualSize(row: Row, ordinal: Int): Int = defaultSize

/**
* Returns `row(ordinal)`. Subclasses should override this method to avoid boxing/unboxing costs
@@ -200,7 +201,9 @@ private[sql] object SHORT extends NativeColumnType(ShortType, 6, 2) {
}

private[sql] object STRING extends NativeColumnType(StringType, 7, 8) {
override def actualSize(v: String): Int = v.getBytes("utf-8").length + 4
override def actualSize(row: Row, ordinal: Int): Int = {
row.getString(ordinal).getBytes("utf-8").length + 4
}

override def append(v: String, buffer: ByteBuffer) {
val stringBytes = v.getBytes("utf-8")
@@ -246,7 +249,9 @@ private[sql] sealed abstract class ByteArrayColumnType[T <: DataType](
defaultSize: Int)
extends ColumnType[T, Array[Byte]](typeId, defaultSize) {

override def actualSize(v: Array[Byte]) = v.length + 4
override def actualSize(row: Row, ordinal: Int) = {
getField(row, ordinal).length + 4
}

override def append(v: Array[Byte], buffer: ByteBuffer) {
buffer.putInt(v.length).put(v, 0, v.length)
@@ -51,9 +51,9 @@ private[sql] trait CompressibleColumnBuilder[T <: NativeType]
abstract override def initialize(initialSize: Int, columnName: String, useCompression: Boolean) {
compressionEncoders =
if (useCompression) {
schemes.filter(_.supports(columnType)).map(_.encoder[T])
schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType))
} else {
Seq(PassThrough.encoder)
Seq(PassThrough.encoder(columnType))
}
super.initialize(initialSize, columnName, useCompression)
}
@@ -63,11 +63,9 @@ private[sql] trait CompressibleColumnBuilder[T <: NativeType]
}

private def gatherCompressibilityStats(row: Row, ordinal: Int) {
val field = columnType.getField(row, ordinal)

var i = 0
while (i < compressionEncoders.length) {
compressionEncoders(i).gatherCompressibilityStats(field, columnType)
compressionEncoders(i).gatherCompressibilityStats(row, ordinal)
i += 1
}
}
@@ -84,7 +82,7 @@ private[sql] trait CompressibleColumnBuilder[T <: NativeType]
val typeId = nonNullBuffer.getInt()
val encoder: Encoder[T] = {
val candidate = compressionEncoders.minBy(_.compressionRatio)
if (isWorthCompressing(candidate)) candidate else PassThrough.encoder
if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType)
}

// Header = column type ID + null count + null positions
@@ -105,6 +103,6 @@ private[sql] trait CompressibleColumnBuilder[T <: NativeType]
.put(nulls)

logInfo(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}")
encoder.compress(nonNullBuffer, compressedBuffer, columnType)
encoder.compress(nonNullBuffer, compressedBuffer)
}
}
@@ -17,13 +17,14 @@

package org.apache.spark.sql.columnar.compression

import java.nio.{ByteOrder, ByteBuffer}
import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.types.NativeType
import org.apache.spark.sql.columnar.{ColumnType, NativeColumnType}

private[sql] trait Encoder[T <: NativeType] {
def gatherCompressibilityStats(value: T#JvmType, columnType: NativeColumnType[T]) {}
def gatherCompressibilityStats(row: Row, ordinal: Int) {}

def compressedSize: Int

@@ -33,7 +34,7 @@ private[sql] trait Encoder[T <: NativeType] {
if (uncompressedSize > 0) compressedSize.toDouble / uncompressedSize else 1.0
}

def compress(from: ByteBuffer, to: ByteBuffer, columnType: NativeColumnType[T]): ByteBuffer
def compress(from: ByteBuffer, to: ByteBuffer): ByteBuffer
}

private[sql] trait Decoder[T <: NativeType] extends Iterator[T#JvmType]
@@ -43,7 +44,7 @@ private[sql] trait CompressionScheme {

def supports(columnType: ColumnType[_, _]): Boolean

def encoder[T <: NativeType]: Encoder[T]
def encoder[T <: NativeType](columnType: NativeColumnType[T]): Encoder[T]

def decoder[T <: NativeType](buffer: ByteBuffer, columnType: NativeColumnType[T]): Decoder[T]
}
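With these signatures, a compressible column builder asks a `CompressionScheme` for an encoder once, passing the column type, and then feeds the encoder raw `(row, ordinal)` pairs; `compress` no longer needs the column type because the encoder already holds it. A rough self-contained sketch of that flow, using simplified stand-in types rather than the real Spark classes:

import java.nio.ByteBuffer

// Simplified stand-ins -- illustration only.
trait Row
trait NativeColumnType[T]

trait Encoder[T] {
  def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {}
  def compressionRatio: Double = 1.0
  def compress(from: ByteBuffer, to: ByteBuffer): ByteBuffer
}

trait CompressionScheme {
  // The column type is supplied once, when the encoder is created.
  def encoder[T](columnType: NativeColumnType[T]): Encoder[T]
}

object CompressionFlowSketch {
  // Gather stats row by row, then compress the raw buffer with the chosen encoder.
  def buildCompressed[T](
      scheme: CompressionScheme,
      columnType: NativeColumnType[T],
      rows: Seq[Row],
      ordinal: Int,
      raw: ByteBuffer,
      out: ByteBuffer): ByteBuffer = {
    val encoder = scheme.encoder(columnType)
    rows.foreach(encoder.gatherCompressibilityStats(_, ordinal))
    encoder.compress(raw, out)
  }
}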
@@ -23,7 +23,8 @@ import scala.collection.mutable
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.runtimeMirror

import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow
import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.columnar._
import org.apache.spark.util.Utils
@@ -33,18 +34,20 @@ private[sql] case object PassThrough extends CompressionScheme {

override def supports(columnType: ColumnType[_, _]) = true

override def encoder[T <: NativeType] = new this.Encoder[T]
override def encoder[T <: NativeType](columnType: NativeColumnType[T]) = {
new this.Encoder[T](columnType)
}

override def decoder[T <: NativeType](buffer: ByteBuffer, columnType: NativeColumnType[T]) = {
new this.Decoder(buffer, columnType)
}

class Encoder[T <: NativeType] extends compression.Encoder[T] {
class Encoder[T <: NativeType](columnType: NativeColumnType[T]) extends compression.Encoder[T] {
override def uncompressedSize = 0

override def compressedSize = 0

override def compress(from: ByteBuffer, to: ByteBuffer, columnType: NativeColumnType[T]) = {
override def compress(from: ByteBuffer, to: ByteBuffer) = {
// Writes compression type ID and copies raw contents
to.putInt(PassThrough.typeId).put(from).rewind()
to
@@ -63,7 +66,9 @@ private[sql] case object PassThrough extends CompressionScheme {
private[sql] case object RunLengthEncoding extends CompressionScheme {
override val typeId = 1

override def encoder[T <: NativeType] = new this.Encoder[T]
override def encoder[T <: NativeType](columnType: NativeColumnType[T]) = {
new this.Encoder[T](columnType)
}

override def decoder[T <: NativeType](buffer: ByteBuffer, columnType: NativeColumnType[T]) = {
new this.Decoder(buffer, columnType)
@@ -74,20 +79,21 @@ private[sql] case object RunLengthEncoding extends CompressionScheme {
case _ => false
}

class Encoder[T <: NativeType] extends compression.Encoder[T] {
class Encoder[T <: NativeType](columnType: NativeColumnType[T]) extends compression.Encoder[T] {
private var _uncompressedSize = 0
private var _compressedSize = 0

// Using `MutableRow` to store the last value to avoid boxing/unboxing cost.
private val lastValue = new GenericMutableRow(1)
private val lastValue = new SpecificMutableRow(Seq(columnType.dataType))
private var lastRun = 0

override def uncompressedSize = _uncompressedSize

override def compressedSize = _compressedSize

override def gatherCompressibilityStats(value: T#JvmType, columnType: NativeColumnType[T]) {
val actualSize = columnType.actualSize(value)
override def gatherCompressibilityStats(row: Row, ordinal: Int) {
val value = columnType.getField(row, ordinal)
val actualSize = columnType.actualSize(row, ordinal)
_uncompressedSize += actualSize

if (lastValue.isNullAt(0)) {
@@ -105,7 +111,7 @@ private[sql] case object RunLengthEncoding extends CompressionScheme {
}
}

override def compress(from: ByteBuffer, to: ByteBuffer, columnType: NativeColumnType[T]) = {
override def compress(from: ByteBuffer, to: ByteBuffer) = {
to.putInt(RunLengthEncoding.typeId)

if (from.hasRemaining) {
@@ -171,14 +177,16 @@ private[sql] case object DictionaryEncoding extends CompressionScheme {
new this.Decoder(buffer, columnType)
}

override def encoder[T <: NativeType] = new this.Encoder[T]
override def encoder[T <: NativeType](columnType: NativeColumnType[T]) = {
new this.Encoder[T](columnType)
}

override def supports(columnType: ColumnType[_, _]) = columnType match {
case INT | LONG | STRING => true
case _ => false
}

class Encoder[T <: NativeType] extends compression.Encoder[T] {
class Encoder[T <: NativeType](columnType: NativeColumnType[T]) extends compression.Encoder[T] {
// Size of the input, uncompressed, in bytes. Note that we only count until the dictionary
// overflows.
private var _uncompressedSize = 0
@@ -200,9 +208,11 @@ private[sql] case object DictionaryEncoding extends CompressionScheme {
// to store dictionary element count.
private var dictionarySize = 4

override def gatherCompressibilityStats(value: T#JvmType, columnType: NativeColumnType[T]) {
override def gatherCompressibilityStats(row: Row, ordinal: Int) {
val value = columnType.getField(row, ordinal)

if (!overflow) {
val actualSize = columnType.actualSize(value)
val actualSize = columnType.actualSize(row, ordinal)
count += 1
_uncompressedSize += actualSize

@@ -221,7 +231,7 @@ private[sql] case object DictionaryEncoding extends CompressionScheme {
}
}

override def compress(from: ByteBuffer, to: ByteBuffer, columnType: NativeColumnType[T]) = {
override def compress(from: ByteBuffer, to: ByteBuffer) = {
if (overflow) {
throw new IllegalStateException(
"Dictionary encoding should not be used because of dictionary overflow.")
@@ -279,25 +289,20 @@ private[sql] case object BooleanBitSet extends CompressionScheme {
new this.Decoder(buffer).asInstanceOf[compression.Decoder[T]]
}

override def encoder[T <: NativeType] = (new this.Encoder).asInstanceOf[compression.Encoder[T]]
override def encoder[T <: NativeType](columnType: NativeColumnType[T]) = {
(new this.Encoder).asInstanceOf[compression.Encoder[T]]
}

override def supports(columnType: ColumnType[_, _]) = columnType == BOOLEAN

class Encoder extends compression.Encoder[BooleanType.type] {
private var _uncompressedSize = 0

override def gatherCompressibilityStats(
value: Boolean,
columnType: NativeColumnType[BooleanType.type]) {

override def gatherCompressibilityStats(row: Row, ordinal: Int) {
_uncompressedSize += BOOLEAN.defaultSize
}

override def compress(
from: ByteBuffer,
to: ByteBuffer,
columnType: NativeColumnType[BooleanType.type]) = {

override def compress(from: ByteBuffer, to: ByteBuffer) = {
to.putInt(BooleanBitSet.typeId)
// Total element count (1 byte per Boolean value)
.putInt(from.remaining)
@@ -364,13 +369,18 @@
}
}

private[sql] sealed abstract class IntegralDelta[I <: IntegralType] extends CompressionScheme {
private[sql] sealed abstract class IntegralDelta[I <: IntegralType](
columnType: NativeColumnType[I])
extends CompressionScheme {

override def decoder[T <: NativeType](buffer: ByteBuffer, columnType: NativeColumnType[T]) = {
new this.Decoder(buffer, columnType.asInstanceOf[NativeColumnType[I]])
.asInstanceOf[compression.Decoder[T]]
}

override def encoder[T <: NativeType] = (new this.Encoder).asInstanceOf[compression.Encoder[T]]
override def encoder[T <: NativeType](columnType: NativeColumnType[T]) = {
(new this.Encoder).asInstanceOf[compression.Encoder[T]]
}

/**
* Computes `delta = x - y`, returns `(true, delta)` if `delta` can fit into a single byte, or
@@ -392,7 +402,8 @@ private[sql] sealed abstract class IntegralDelta[I <: IntegralType] extends Comp

private var initial = true

override def gatherCompressibilityStats(value: I#JvmType, columnType: NativeColumnType[I]) {
override def gatherCompressibilityStats(row: Row, ordinal: Int) {
val value = columnType.getField(row, ordinal)
_uncompressedSize += columnType.defaultSize

if (initial) {
@@ -406,7 +417,7 @@ private[sql] sealed abstract class IntegralDelta[I <: IntegralType] extends Comp
prev = value
}

override def compress(from: ByteBuffer, to: ByteBuffer, columnType: NativeColumnType[I]) = {
override def compress(from: ByteBuffer, to: ByteBuffer) = {
to.putInt(typeId)

if (from.hasRemaining) {
@@ -452,7 +463,7 @@ private[sql] sealed abstract class IntegralDelta[I <: IntegralType] extends Comp
}
}

private[sql] case object IntDelta extends IntegralDelta[IntegerType.type] {
private[sql] case object IntDelta extends IntegralDelta[IntegerType.type](INT) {
override val typeId = 4

override def supports(columnType: ColumnType[_, _]) = columnType == INT
@@ -465,7 +476,7 @@ private[sql] case object IntDelta extends IntegralDelta[IntegerType.type] {
}
}

private[sql] case object LongDelta extends IntegralDelta[LongType.type] {
private[sql] case object LongDelta extends IntegralDelta[LongType.type](LONG) {
override val typeId = 5

override def supports(columnType: ColumnType[_, _]) = columnType == LONG
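The same constructor-injection idea shows up in `IntegralDelta`: since `IntDelta` always works on `INT` columns and `LongDelta` on `LONG` columns, the column type can be fixed when the scheme is defined instead of being threaded through every `gatherCompressibilityStats` or `compress` call. A toy sketch of that pattern, using illustrative names rather than the real Spark types:

// Illustrative stand-ins only.
trait Row {
  def getInt(ordinal: Int): Int
  def getLong(ordinal: Int): Long
}

abstract class ColumnTypeSketch[JvmType](val defaultSize: Int) {
  def getField(row: Row, ordinal: Int): JvmType
}

object IntColumn extends ColumnTypeSketch[Int](4) {
  def getField(row: Row, ordinal: Int): Int = row.getInt(ordinal)
}

object LongColumn extends ColumnTypeSketch[Long](8) {
  def getField(row: Row, ordinal: Int): Long = row.getLong(ordinal)
}

// The column type is baked in via the constructor, so per-row methods
// only need (row, ordinal).
abstract class IntegralDeltaSketch[JvmType](columnType: ColumnTypeSketch[JvmType]) {
  private var uncompressedSize = 0
  private var prev: Option[JvmType] = None

  def gatherCompressibilityStats(row: Row, ordinal: Int): Unit = {
    val value = columnType.getField(row, ordinal)
    uncompressedSize += columnType.defaultSize
    // The real encoder also checks whether the delta from prev fits in a byte.
    prev = Some(value)
  }
}

case object IntDeltaSketch extends IntegralDeltaSketch[Int](IntColumn)
case object LongDeltaSketch extends IntegralDeltaSketch[Long](LongColumn)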
@@ -23,6 +23,7 @@ import java.sql.Timestamp
import org.scalatest.FunSuite

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.expressions.{GenericMutableRow, Row}
import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.columnar.ColumnarTestUtils._
import org.apache.spark.sql.execution.SparkSqlSerializer
@@ -49,7 +50,9 @@ class ColumnTypeSuite extends FunSuite with Logging {
expected: Int) {

assertResult(expected, s"Wrong actualSize for $columnType") {
columnType.actualSize(value)
val row = new GenericMutableRow(1)
columnType.setField(row, 0, value)
columnType.actualSize(row, 0)
}
}

@@ -67,7 +67,7 @@ class DictionaryEncodingSuite extends FunSuite {
val buffer = builder.build()
val headerSize = CompressionScheme.columnHeaderSize(buffer)
// 4 extra bytes for dictionary size
val dictionarySize = 4 + values.map(columnType.actualSize).sum
val dictionarySize = 4 + rows.map(columnType.actualSize(_, 0)).sum
// 2 bytes for each `Short`
val compressedSize = 4 + dictionarySize + 2 * inputSeq.length
// 4 extra bytes for compression scheme type ID
@@ -57,7 +57,7 @@ class RunLengthEncodingSuite extends FunSuite {
// Compression scheme ID + compressed contents
val compressedSize = 4 + inputRuns.map { case (index, _) =>
// 4 extra bytes each run for run length
columnType.actualSize(values(index)) + 4
columnType.actualSize(rows(index), 0) + 4
}.sum

// 4 extra bytes for compression scheme type ID