Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-3966 [Java] JDBC Column Metadata in Arrow Field Metadata #3134

Closed
wants to merge 35 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
5af1b5b
Separating out the field-type creation from the field creation.
Dec 5, 2018
523387f
Updating the API to support an optional 'includeMetadata' field.
Dec 5, 2018
a78c770
Updating Javadocs.
Dec 5, 2018
da77cbe
Creating a configuration class for the JDBC-to-Arrow converter.
Dec 8, 2018
b270044
Updated validation & documentation, and unit tests for the new JdbcToA…
Dec 8, 2018
df632e3
Updating the SQL tests to include JdbcToArrowConfig versions.
Dec 8, 2018
fe097c8
Merge branch 'jdbc-to-arrow-config' into jdbc-column-metadata
Dec 8, 2018
e34a9e7
Fixing formatting.
Dec 8, 2018
4f1260c
Adding documentation for public static VectorSchemaRoot sqlToArrow(Re…
Dec 8, 2018
8d6cf00
Documentation for public static VectorSchemaRoot sqlToArrow(Connectio…
Dec 8, 2018
b5b0cb1
Merge branch 'jdbc-to-arrow-config' into jdbc-column-metadata
Dec 8, 2018
68c91e7
Modifying the jdbcToArrowSchema and jdbcToArrowVectors methods to rec…
Dec 8, 2018
5bfd6a2
Merge branch 'jdbc-to-arrow-config' into jdbc-column-metadata
Dec 8, 2018
a6fb1be
Fixing function call
Dec 8, 2018
bb3165b
Updating the function calls to use the JdbcToArrowConfig versions.
Dec 8, 2018
7e9ce37
Merge branch 'jdbc-to-arrow-config' into jdbc-column-metadata
Dec 8, 2018
7b4527c
Test for the include-metadata flag in the configuration.
Dec 8, 2018
72d64cc
Affirming the field metadata is empty when the configuration excludes…
Dec 8, 2018
03091a8
Unit tests for including result set metadata.
Dec 8, 2018
881c6c8
Merge pull request #1 from apache/master
mikepigott Dec 9, 2018
1ceac9e
Merge branch 'master' into jdbc-column-metadata
Dec 9, 2018
d847ebc
Fixing file location
Dec 9, 2018
3b17c29
Merge pull request #2 from apache/master
mikepigott Dec 15, 2018
e5b19ee
Merge pull request #3 from apache/master
mikepigott Dec 30, 2018
789c8c8
Merge pull request #4 from apache/master
mikepigott Jan 30, 2019
509a1cc
Merge pull request #5 from apache/master
mikepigott Feb 3, 2019
4a6de86
Merge branch 'master' into jdbc-column-metadata
Feb 3, 2019
69022c2
ARROW-3966: Fixing merge.
Feb 3, 2019
2928513
ARROW-3966: Moving the metadata flag assignment into the builder.
Feb 3, 2019
cfb2ba6
ARROW-3966: Using a helper method for building a UTC calendar with ro…
Feb 3, 2019
cc6cc88
ARROW-3966: Using a 1:N loop instead of a 0:N-1 loop for fewer index …
Feb 3, 2019
65741a9
ARROW-3966: Code review feedback
Feb 5, 2019
e9a9b2b
Merge pull request #6 from apache/master
mikepigott Feb 6, 2019
7049c36
Merge branch 'master' into jdbc-column-metadata
Feb 6, 2019
02f2f34
ARROW-3966: Picking up lost change to support null calendars.
Feb 6, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.adapter.jdbc;

/**
 * String constants shared by the JDBC-to-Arrow adapter.
 *
 * <p>The keys defined here are the names under which JDBC column metadata is
 * stored in an Arrow field's metadata map when metadata inclusion is enabled.
 *
 * <p>This class only holds constants and is not meant to be instantiated or extended.
 */
public final class Constants {

  /** Field metadata key holding the catalog name of the JDBC column. */
  public static final String SQL_CATALOG_NAME_KEY = "SQL_CATALOG_NAME";

  /** Field metadata key holding the table name of the JDBC column. */
  public static final String SQL_TABLE_NAME_KEY = "SQL_TABLE_NAME";

  /** Field metadata key holding the name of the JDBC column. */
  public static final String SQL_COLUMN_NAME_KEY = "SQL_COLUMN_NAME";

  /** Field metadata key holding the database type name of the JDBC column. */
  public static final String SQL_TYPE_KEY = "SQL_TYPE";

  /** Not instantiable: this class is a holder for constants only. */
  private Constants() {
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Calendar;
import java.util.Locale;
import java.util.TimeZone;

import org.apache.arrow.memory.BaseAllocator;
import org.apache.arrow.memory.RootAllocator;
Expand Down Expand Up @@ -90,7 +88,7 @@ public static VectorSchemaRoot sqlToArrow(Connection connection, String query, B
Preconditions.checkNotNull(allocator, "Memory allocator object can not be null");

JdbcToArrowConfig config =
new JdbcToArrowConfig(allocator, Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT));
new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar(), false);
return sqlToArrow(connection, query, config);
}

Expand All @@ -112,12 +110,13 @@ public static VectorSchemaRoot sqlToArrow(
String query,
BaseAllocator allocator,
Calendar calendar) throws SQLException, IOException {

Preconditions.checkNotNull(connection, "JDBC connection object can not be null");
Preconditions.checkArgument(query != null && query.length() > 0, "SQL query can not be null or empty");
Preconditions.checkNotNull(allocator, "Memory allocator object can not be null");
Preconditions.checkNotNull(calendar, "Calendar object can not be null");

return sqlToArrow(connection, query, new JdbcToArrowConfig(allocator, calendar));
return sqlToArrow(connection, query, new JdbcToArrowConfig(allocator, calendar, false));
}

/**
Expand Down Expand Up @@ -154,7 +153,7 @@ public static VectorSchemaRoot sqlToArrow(Connection connection, String query, J
public static VectorSchemaRoot sqlToArrow(ResultSet resultSet) throws SQLException, IOException {
Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null");

return sqlToArrow(resultSet, Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT));
return sqlToArrow(resultSet, JdbcToArrowUtils.getUtcCalendar());
}

/**
Expand All @@ -171,7 +170,7 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BaseAllocator all
Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null");

JdbcToArrowConfig config =
new JdbcToArrowConfig(allocator, Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT));
new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar(), false);
return sqlToArrow(resultSet, config);
}

Expand All @@ -186,7 +185,7 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BaseAllocator all
public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, Calendar calendar) throws SQLException, IOException {
Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null");

return sqlToArrow(resultSet, new JdbcToArrowConfig(new RootAllocator(Integer.MAX_VALUE), calendar));
return sqlToArrow(resultSet, new JdbcToArrowConfig(new RootAllocator(Integer.MAX_VALUE), calendar, false));
}

/**
Expand All @@ -198,12 +197,15 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, Calendar calendar
* @return Arrow Data Objects {@link VectorSchemaRoot}
* @throws SQLException on error
*/
public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BaseAllocator allocator, Calendar calendar)
public static VectorSchemaRoot sqlToArrow(
ResultSet resultSet,
BaseAllocator allocator,
Calendar calendar)
throws SQLException, IOException {
Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null");
Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null");

return sqlToArrow(resultSet, new JdbcToArrowConfig(allocator, calendar));
return sqlToArrow(resultSet, new JdbcToArrowConfig(allocator, calendar, false));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,23 @@
public final class JdbcToArrowConfig {
private Calendar calendar;
private BaseAllocator allocator;
private boolean includeMetadata;

/**
* Constructs a new configuration from the provided allocator and calendar. The <code>allocator</code>
* is used when constructing the Arrow vectors from the ResultSet, and the calendar is used to define
* Arrow Timestamp fields, and to read time-based fields from the JDBC <code>ResultSet</code>.
*
* @param allocator The memory allocator to construct the Arrow vectors with.
* @param calendar The calendar to use when constructing Timestamp fields and reading time-based results.
* @param allocator The memory allocator to construct the Arrow vectors with.
* @param calendar The calendar to use when constructing Timestamp fields and reading time-based results.
* @param includeMetadata Whether to include JDBC field metadata in the Arrow Schema Field metadata.
*/
JdbcToArrowConfig(BaseAllocator allocator, Calendar calendar) {
JdbcToArrowConfig(BaseAllocator allocator, Calendar calendar, boolean includeMetadata) {
Preconditions.checkNotNull(allocator, "Memory allocator cannot be null");

this.allocator = allocator;
this.calendar = calendar;
this.includeMetadata = includeMetadata;
}

/**
Expand All @@ -70,4 +73,13 @@ public Calendar getCalendar() {
public BaseAllocator getAllocator() {
return allocator;
}

/**
* Whether to include JDBC ResultSet field metadata in the Arrow Schema field metadata.
* This returns the <code>includeMetadata</code> flag held by this configuration.
*
* @return <code>true</code> to include field metadata, <code>false</code> to exclude it.
*/
public boolean shouldIncludeMetadata() {
return includeMetadata;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
public class JdbcToArrowConfigBuilder {
private Calendar calendar;
private BaseAllocator allocator;
private boolean includeMetadata;

/**
* Default constructor for the <code>JdbcToArrowConfigBuilder</code>.
Expand All @@ -38,6 +39,7 @@ public class JdbcToArrowConfigBuilder {
public JdbcToArrowConfigBuilder() {
this.allocator = null;
this.calendar = null;
this.includeMetadata = false;
}

/**
Expand All @@ -62,6 +64,32 @@ public JdbcToArrowConfigBuilder(BaseAllocator allocator, Calendar calendar) {

this.allocator = allocator;
this.calendar = calendar;
this.includeMetadata = false;
}

/**
* Constructor for the <code>JdbcToArrowConfigBuilder</code>. Both the
* allocator and calendar are required. A {@link NullPointerException}
* will be thrown if either of those arguments is <code>null</code>.
* <p>
* The allocator is used to construct Arrow vectors from the JDBC ResultSet.
* The calendar is used to determine the time zone of {@link java.sql.Timestamp}
* fields and convert {@link java.sql.Date}, {@link java.sql.Time}, and
* {@link java.sql.Timestamp} fields to a single, common time zone when reading
* from the result set.
* </p>
* <p>
* The <code>includeMetadata</code> argument, if <code>true</code> will cause
* various information about each database field to be added to the Vector
* Schema's field metadata.
* </p>
*
* @param allocator The Arrow Vector memory allocator.
* @param calendar The calendar to use when constructing timestamp fields.
* @param includeMetadata Whether to include JDBC field metadata in the Arrow Schema field metadata.
*/
public JdbcToArrowConfigBuilder(BaseAllocator allocator, Calendar calendar, boolean includeMetadata) {
// Delegate to the two-argument constructor, then override the flag it defaults to false.
this(allocator, calendar);
this.includeMetadata = includeMetadata;
}

/**
Expand All @@ -87,6 +115,17 @@ public JdbcToArrowConfigBuilder setCalendar(Calendar calendar) {
return this;
}

/**
* Sets whether to include JDBC ResultSet field metadata in the Arrow Schema field metadata.
*
* @param includeMetadata Whether to include or exclude JDBC metadata in the Arrow Schema field metadata.
* @return This instance of the <code>JdbcToArrowConfigBuilder</code>, for chaining.
*/
public JdbcToArrowConfigBuilder setIncludeMetadata(boolean includeMetadata) {
this.includeMetadata = includeMetadata;
return this;
}

/**
* This builds the {@link JdbcToArrowConfig} from the provided
* {@link BaseAllocator} and {@link Calendar}.
Expand All @@ -95,6 +134,6 @@ public JdbcToArrowConfigBuilder setCalendar(Calendar calendar) {
* @throws NullPointerException if the allocator was not set.
*/
public JdbcToArrowConfig build() {
return new JdbcToArrowConfig(allocator, calendar);
return new JdbcToArrowConfig(allocator, calendar, includeMetadata);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(minor nit - again, ignore if you don't agree)
Can we default the include-metadata flag in the builder so that only the clients that need it will override it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense. Will change tonight.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hang on, I'm confused - includeMetadata is initialized to false in the builder. This is the behavior you asked for, right? Only people who call setIncludeMetadata(true) will have the metadata generated.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I saw the ctors using the false flag in other places and was misled..this looks ok.

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@
import java.sql.Types;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;

import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.BaseFixedWidthVector;
Expand Down Expand Up @@ -103,7 +107,14 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, Calendar calendar
Preconditions.checkNotNull(rsmd, "JDBC ResultSetMetaData object can't be null");
Preconditions.checkNotNull(calendar, "Calendar object can't be null");

return jdbcToArrowSchema(rsmd, new JdbcToArrowConfig(new RootAllocator(0), calendar));
return jdbcToArrowSchema(rsmd, new JdbcToArrowConfig(new RootAllocator(0), calendar, false));
}

/**
 * Returns a {@link java.util.Calendar} configured with the UTC time zone and the root locale.
 *
 * @return a calendar using the "UTC" time zone and {@link Locale#ROOT}.
 */
public static Calendar getUtcCalendar() {
  final TimeZone utcZone = TimeZone.getTimeZone("UTC");
  final Locale rootLocale = Locale.ROOT;
  return Calendar.getInstance(utcZone, rootLocale);
}

/**
Expand Down Expand Up @@ -145,78 +156,103 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, JdbcToArrowConfig
Preconditions.checkNotNull(rsmd, "JDBC ResultSetMetaData object can't be null");
Preconditions.checkNotNull(config, "The configuration object must not be null");

final String timezone;
if (config.getCalendar() != null) {
timezone = config.getCalendar().getTimeZone().getID();
} else {
timezone = null;
}

List<Field> fields = new ArrayList<>();
int columnCount = rsmd.getColumnCount();
for (int i = 1; i <= columnCount; i++) {
String columnName = rsmd.getColumnName(i);
final String columnName = rsmd.getColumnName(i);
final FieldType fieldType;

final Map<String, String> metadata;
if (config.shouldIncludeMetadata()) {
metadata = new HashMap<>();
metadata.put(Constants.SQL_CATALOG_NAME_KEY, rsmd.getCatalogName(i));
metadata.put(Constants.SQL_TABLE_NAME_KEY, rsmd.getTableName(i));
metadata.put(Constants.SQL_COLUMN_NAME_KEY, columnName);
metadata.put(Constants.SQL_TYPE_KEY, rsmd.getColumnTypeName(i));

} else {
metadata = null;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(minor nit)
Do this in the initialization itself? It avoids the else block.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can do that, but then I can't make the metadata variable final. I've worked on teams in the past where the best practice was to mark things final as often as possible; is that the same with Arrow?

}

switch (rsmd.getColumnType(i)) {
case Types.BOOLEAN:
case Types.BIT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Bool()), null));
fieldType = new FieldType(true, new ArrowType.Bool(), null, metadata);
break;
case Types.TINYINT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Int(8, true)), null));
fieldType = new FieldType(true, new ArrowType.Int(8, true), null, metadata);
break;
case Types.SMALLINT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Int(16, true)), null));
fieldType = new FieldType(true, new ArrowType.Int(16, true), null, metadata);
break;
case Types.INTEGER:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Int(32, true)), null));
fieldType = new FieldType(true, new ArrowType.Int(32, true), null, metadata);
break;
case Types.BIGINT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Int(64, true)), null));
fieldType = new FieldType(true, new ArrowType.Int(64, true), null, metadata);
break;
case Types.NUMERIC:
case Types.DECIMAL:
int precision = rsmd.getPrecision(i);
int scale = rsmd.getScale(i);
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Decimal(precision, scale)), null));
fieldType = new FieldType(true, new ArrowType.Decimal(precision, scale), null, metadata);
break;
case Types.REAL:
case Types.FLOAT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.FloatingPoint(SINGLE)), null));
fieldType = new FieldType(true, new ArrowType.FloatingPoint(SINGLE), null, metadata);
break;
case Types.DOUBLE:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.FloatingPoint(DOUBLE)), null));
fieldType = new FieldType(true, new ArrowType.FloatingPoint(DOUBLE), null, metadata);
break;
case Types.CHAR:
case Types.NCHAR:
case Types.VARCHAR:
case Types.NVARCHAR:
case Types.LONGVARCHAR:
case Types.LONGNVARCHAR:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Utf8()), null));
case Types.CLOB:
fieldType = new FieldType(true, new ArrowType.Utf8(), null, metadata);
break;
case Types.DATE:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Date(DateUnit.MILLISECOND)), null));
fieldType = new FieldType(true, new ArrowType.Date(DateUnit.MILLISECOND), null, metadata);
break;
case Types.TIME:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Time(TimeUnit.MILLISECOND, 32)), null));
fieldType = new FieldType(true, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null, metadata);
break;
case Types.TIMESTAMP:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND,
config.getCalendar().getTimeZone().getID())), null));
fieldType =
new FieldType(
true,
new ArrowType.Timestamp(TimeUnit.MILLISECOND, timezone),
null,
metadata);
break;
case Types.BINARY:
case Types.VARBINARY:
case Types.LONGVARBINARY:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Binary()), null));
break;
case Types.ARRAY:
// TODO Need to handle this type
// fields.add(new Field("list", FieldType.nullable(new ArrowType.List()), null));
break;
case Types.CLOB:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Utf8()), null));
break;
case Types.BLOB:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Binary()), null));
fieldType = new FieldType(true, new ArrowType.Binary(), null, metadata);
break;

case Types.ARRAY:
// TODO Need to handle this type
// fields.add(new Field("list", FieldType.nullable(new ArrowType.List()), null));
default:
// no-op, shouldn't get here
fieldType = null;
break;
}

if (fieldType != null) {
fields.add(new Field(columnName, fieldType, null));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the use of FieldType object instead of creating Field object in each case stmt. I am fine with this.

}
}

return new Schema(fields, null);
Expand Down Expand Up @@ -250,7 +286,7 @@ public static void jdbcToArrowVectors(ResultSet rs, VectorSchemaRoot root, Calen
Preconditions.checkNotNull(rs, "JDBC ResultSet object can't be null");
Preconditions.checkNotNull(root, "JDBC ResultSet object can't be null");

jdbcToArrowVectors(rs, root, new JdbcToArrowConfig(new RootAllocator(0), calendar));
jdbcToArrowVectors(rs, root, new JdbcToArrowConfig(new RootAllocator(0), calendar, false));
}

/**
Expand Down
Loading