-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-20960][SQL] make ColumnVector public #20116
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
|
||
import org.apache.spark.sql.internal.SQLConf; | ||
import org.apache.spark.sql.types.*; | ||
import org.apache.spark.sql.vectorized.ColumnVector; | ||
import org.apache.spark.unsafe.array.ByteArrayMethods; | ||
import org.apache.spark.unsafe.types.UTF8String; | ||
|
||
|
@@ -586,7 +587,7 @@ public final int appendStruct(boolean isNull) { | |
if (isNull) { | ||
appendNull(); | ||
for (ColumnVector c: childColumns) { | ||
if (c.type instanceof StructType) { | ||
if (c.dataType() instanceof StructType) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Which access type will we use for |
||
((WritableColumnVector) c).appendStruct(true); | ||
} else { | ||
((WritableColumnVector) c).appendNull(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,32 +14,38 @@ | |
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.spark.sql.execution.vectorized; | ||
package org.apache.spark.sql.vectorized; | ||
|
||
import org.apache.spark.sql.catalyst.util.MapData; | ||
import org.apache.spark.sql.types.DataType; | ||
import org.apache.spark.sql.types.Decimal; | ||
import org.apache.spark.unsafe.types.UTF8String; | ||
|
||
/** | ||
* This class represents in-memory values of a column and provides the main APIs to access the data. | ||
* It supports all the types and contains get APIs as well as their batched versions. The batched | ||
* versions are considered to be faster and preferable whenever possible. | ||
* An interface representing in-memory columnar data in Spark. This interface defines the main APIs | ||
* to access the data, as well as their batched versions. The batched versions are considered to be | ||
* faster and preferable whenever possible. | ||
* | ||
* To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these | ||
* columns have child columns. All of the data are stored in the child columns and the parent column | ||
* only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child | ||
* column and are encoded identically to INTs. | ||
* Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values | ||
* in this ColumnVector. | ||
* | ||
* Maps are just a special case of a two field struct. | ||
* ColumnVector supports all the data types including nested types. To handle nested types, | ||
* ColumnVector can have children and is a tree structure. For struct type, it stores the actual | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: child -> children There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's already There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry for my mistake. |
||
* data of each field in the corresponding child ColumnVector, and only stores null information in | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
* the parent ColumnVector. For array type, it stores the actual array elements in the child | ||
* ColumnVector, and stores null information, array offsets and lengths in the parent ColumnVector. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
* | ||
* Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values | ||
* in the current batch. | ||
* ColumnVector is expected to be reused during the entire data loading process, to avoid allocating | ||
* memory again and again. | ||
* | ||
* ColumnVector is meant to maximize CPU efficiency and not to minimize storage footprint; implementations | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: not -> not to minimize |
||
* should prefer computing efficiency over storage efficiency when designing the format. Since it is | ||
* expected to reuse the ColumnVector instance, the storage footprint is negligible. | ||
*/ | ||
public abstract class ColumnVector implements AutoCloseable { | ||
|
||
/** | ||
* Returns the data type of this column. | ||
* Returns the data type of this column vector. | ||
*/ | ||
public final DataType dataType() { return type; } | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,26 +14,18 @@ | |
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.spark.sql.execution.vectorized; | ||
package org.apache.spark.sql.vectorized; | ||
|
||
import java.util.*; | ||
|
||
import org.apache.spark.sql.catalyst.InternalRow; | ||
import org.apache.spark.sql.execution.vectorized.MutableColumnarRow; | ||
import org.apache.spark.sql.types.StructType; | ||
|
||
/** | ||
* This class is the in memory representation of rows as they are streamed through operators. It | ||
* is designed to maximize CPU efficiency and not storage footprint. Since it is expected that | ||
* each operator allocates one of these objects, the storage footprint on the task is negligible. | ||
* | ||
* The layout is a columnar with values encoded in their native format. Each RowBatch contains | ||
* a horizontal partitioning of the data, split into columns. | ||
* | ||
* The ColumnarBatch supports either on heap or offheap modes with (mostly) the identical API. | ||
* | ||
* TODO: | ||
* - There are many TODOs for the existing APIs. They should throw a not implemented exception. | ||
* - Compaction: The batch and columns should be able to compact based on a selection vector. | ||
* This class is a wrapper of multiple ColumnVectors and represents a logical table-like data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about?
|
||
* structure. It provides a row-view of this batch so that Spark can access the data row by row. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
* Instances of it are meant to be reused during the entire data loading process. | ||
*/ | ||
public final class ColumnarBatch { | ||
public static final int DEFAULT_BATCH_SIZE = 4 * 1024; | ||
|
@@ -57,7 +49,7 @@ public void close() { | |
} | ||
|
||
/** | ||
* Returns an iterator over the rows in this batch. This skips rows that are filtered out. | ||
* Returns an iterator over the rows in this batch. | ||
*/ | ||
public Iterator<InternalRow> rowIterator() { | ||
final int maxRows = numRows; | ||
|
@@ -87,19 +79,7 @@ public void remove() { | |
} | ||
|
||
/** | ||
* Resets the batch for writing. | ||
*/ | ||
public void reset() { | ||
for (int i = 0; i < numCols(); ++i) { | ||
if (columns[i] instanceof WritableColumnVector) { | ||
((WritableColumnVector) columns[i]).reset(); | ||
} | ||
} | ||
this.numRows = 0; | ||
} | ||
|
||
/** | ||
* Sets the number of rows that are valid. | ||
* Sets the number of rows that are valid in this batch. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about?
|
||
*/ | ||
public void setNumRows(int numRows) { | ||
assert(numRows <= this.capacity); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove the space before
:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the standard java foreach code style