Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add parquet upload #14449

Merged
merged 32 commits into from
Aug 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4cc378a
allow csv upload to accept parquet file
exemplary-citizen May 3, 2021
c44fcee
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen May 20, 2021
968c048
fix mypy
exemplary-citizen May 20, 2021
c23b1ec
fix if statement
exemplary-citizen May 21, 2021
97bea75
add test for specificying columns in CSV upload
exemplary-citizen May 21, 2021
2f8482b
clean up test
exemplary-citizen May 21, 2021
77b381f
change order in test
exemplary-citizen May 21, 2021
f8b0dfc
fix failures
exemplary-citizen May 21, 2021
b40a3e2
upload parquet to seperate table in test
exemplary-citizen May 24, 2021
f2911b5
fix error message
exemplary-citizen May 25, 2021
dd889b0
fix mypy again
exemplary-citizen May 25, 2021
cfdd3be
rename other extensions to columnar
exemplary-citizen Jun 4, 2021
5e063bd
add new form for columnar upload
exemplary-citizen Jun 6, 2021
4c74594
add support for zip files
exemplary-citizen Jun 6, 2021
9096faf
undo csv form changes except usecols
exemplary-citizen Jun 7, 2021
98c75ba
add more tests for zip
exemplary-citizen Jun 7, 2021
d92a290
isort & black
exemplary-citizen Jun 7, 2021
3501fac
Merge branch 'master' into upload_parquet
exemplary-citizen Jun 7, 2021
f209999
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen Jun 11, 2021
10d6229
pylint
exemplary-citizen Jun 15, 2021
35eaea6
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen Jun 15, 2021
10e825d
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen Aug 2, 2021
658656d
fix trailing space
exemplary-citizen Aug 3, 2021
0f1816b
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen Aug 3, 2021
7b1b53b
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen Aug 5, 2021
b885b38
address more review comments
exemplary-citizen Aug 23, 2021
6bcc0d9
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen Aug 23, 2021
60ae1fe
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen Aug 26, 2021
5b5eb74
pylint
exemplary-citizen Aug 26, 2021
df2930f
black
exemplary-citizen Aug 26, 2021
9b4d35b
Merge branch 'master' of https://github.com/apache/superset into uplo…
exemplary-citizen Aug 31, 2021
de3509e
resolve remaining issues
exemplary-citizen Aug 31, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion superset/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,8 @@ def _try_json_readsha(filepath: str, length: int) -> Optional[str]:
# Allowed format types for upload on Database view
EXCEL_EXTENSIONS = {"xlsx", "xls"}
CSV_EXTENSIONS = {"csv", "tsv", "txt"}
ALLOWED_EXTENSIONS = {*EXCEL_EXTENSIONS, *CSV_EXTENSIONS}
COLUMNAR_EXTENSIONS = {"parquet", "zip"}
ALLOWED_EXTENSIONS = {*EXCEL_EXTENSIONS, *CSV_EXTENSIONS, *COLUMNAR_EXTENSIONS}

# CSV Options: key/value pairs that will be passed as argument to DataFrame.to_csv
# method.
Expand Down
17 changes: 16 additions & 1 deletion superset/initialization/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def init_views(self) -> None:
DashboardModelViewAsync,
)
from superset.views.database.views import (
ColumnarToDatabaseView,
CsvToDatabaseView,
DatabaseView,
ExcelToDatabaseView,
Expand Down Expand Up @@ -281,6 +282,7 @@ def init_views(self) -> None:
appbuilder.add_view_no_menu(CssTemplateAsyncModelView)
appbuilder.add_view_no_menu(CsvToDatabaseView)
appbuilder.add_view_no_menu(ExcelToDatabaseView)
appbuilder.add_view_no_menu(ColumnarToDatabaseView)
appbuilder.add_view_no_menu(Dashboard)
appbuilder.add_view_no_menu(DashboardModelViewAsync)
appbuilder.add_view_no_menu(Datasource)
Expand Down Expand Up @@ -371,7 +373,20 @@ def init_views(self) -> None:
)
),
)

appbuilder.add_link(
"Upload a Columnar file",
label=__("Upload a Columnar file"),
href="/columnartodatabaseview/form",
icon="fa-upload",
category="Data",
category_label=__("Data"),
category_icon="fa-wrench",
cond=lambda: bool(
self.config["COLUMNAR_EXTENSIONS"].intersection(
self.config["ALLOWED_EXTENSIONS"]
)
),
)
try:
import xlrd # pylint: disable=unused-import

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{#
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
#}
{% extends 'appbuilder/general/model/edit.html' %}

{% block tail_js %}
{{ super() }}
<script>
var db = $("#con");
var schema = $("#schema");

// this element is a text input
// copy it here so it can be reused later
var any_schema_is_allowed = schema.clone();

update_schemas_allowed_for_columnar_upload(db.val());
db.change(function(){
update_schemas_allowed_for_columnar_upload(db.val());
});

function update_schemas_allowed_for_columnar_upload(db_id) {
$.ajax({
method: "GET",
url: "/superset/schemas_access_for_file_upload",
data: {db_id: db_id},
dataType: 'json',
contentType: "application/json; charset=utf-8"
}).done(function(data) {
change_schema_field_in_formview(data)
}).fail(function(error) {
var errorMsg = error.responseJSON.error;
alert("ERROR: " + errorMsg);
});
}

function change_schema_field_in_formview(schemas_allowed){
if (schemas_allowed && schemas_allowed.length > 0) {
var dropdown_schema_lists = '<select id="schema" name="schema" required>';
schemas_allowed.forEach(function(schema_allowed) {
dropdown_schema_lists += ('<option value="' + schema_allowed + '">' + schema_allowed + '</option>');
});
dropdown_schema_lists += '</select>';
$("#schema").replaceWith(dropdown_schema_lists);
} else {
$("#schema").replaceWith(any_schema_is_allowed)
}
}
</script>
{% endblock %}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
function update_schemas_allowed_for_csv_upload(db_id) {
$.ajax({
method: "GET",
url: "/superset/schemas_access_for_csv_upload",
url: "/superset/schemas_access_for_file_upload",
data: {db_id: db_id},
dataType: 'json',
contentType: "application/json; charset=utf-8"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
function update_schemas_allowed_for_excel_upload(db_id) {
$.ajax({
method: "GET",
url: "/superset/schemas_access_for_excel_upload",
url: "/superset/schemas_access_for_file_upload",
data: {db_id: db_id},
dataType: 'json',
contentType: "application/json; charset=utf-8"
Expand Down
6 changes: 3 additions & 3 deletions superset/views/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3072,11 +3072,11 @@ def sqllab_history(self) -> FlaskResponse:
@api
@has_access_api
@event_logger.log_this
@expose("/schemas_access_for_csv_upload")
def schemas_access_for_csv_upload(self) -> FlaskResponse:
@expose("/schemas_access_for_file_upload")
def schemas_access_for_file_upload(self) -> FlaskResponse:
"""
This method exposes an API endpoint to
get the schema access control settings for csv upload in this database
get the schema access control settings for file upload in this database
"""
if not request.args.get("db_id"):
return json_error_response("No database is allowed for your csv upload")
Expand Down
144 changes: 143 additions & 1 deletion superset/views/database/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@
from flask_appbuilder.forms import DynamicForm
from flask_babel import lazy_gettext as _
from flask_wtf.file import FileAllowed, FileField, FileRequired
from wtforms import BooleanField, IntegerField, SelectField, StringField
from wtforms import (
BooleanField,
IntegerField,
MultipleFileField,
SelectField,
StringField,
)
from wtforms.ext.sqlalchemy.fields import QuerySelectField
from wtforms.validators import DataRequired, Length, NumberRange, Optional

Expand Down Expand Up @@ -163,6 +169,15 @@ def at_least_one_schema_is_allowed(database: Database) -> bool:
_("Mangle Duplicate Columns"),
description=_('Specify duplicate columns as "X.0, X.1".'),
)
usecols = JsonListField(
_("Use Columns"),
default=None,
description=_(
"Json list of the column names that should be read. "
"If not None, only these columns will be read from the file."
),
validators=[Optional()],
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you provide a screenshot of the updated form UI?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a screenshot to the summary above

Comment on lines +172 to +180
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@villebro do you still want to keep the Use Columns feature in the csv form? if so, should we keep it in this PR or move it to a separate one?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, let's keep this, I think it's a great addition 👍

skipinitialspace = BooleanField(
_("Skip Initial Space"), description=_("Skip spaces after delimiter.")
)
Expand Down Expand Up @@ -402,3 +417,130 @@ def at_least_one_schema_is_allowed(database: Database) -> bool:
'Use [""] for empty string.'
),
)


class ColumnarToDatabaseForm(DynamicForm):
# pylint: disable=E0211
def columnar_allowed_dbs() -> List[Database]: # type: ignore
Comment on lines +423 to +424
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder why this wasn't @staticmethod?? Can we update that here and in the other classes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding that decorator will throw TypeError: 'staticmethod' object is not callable

# TODO: change allow_csv_upload to allow_file_upload
columnar_enabled_dbs = (
db.session.query(Database).filter_by(allow_csv_upload=True).all()
)
return [
columnar_enabled_db
for columnar_enabled_db in columnar_enabled_dbs
if ColumnarToDatabaseForm.at_least_one_schema_is_allowed(
columnar_enabled_db
)
]
Comment on lines +422 to +435
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's unnecessary duplication here: we could abstract this into a class UploadToDatabaseForm(DynamicForm) with all the repeated boilerplate and then extend those into class CsvToDatabaseForm(UploadToDatabaseForm), class ColumnarToDatabaseForm(UploadToDatabaseForm) etc. But this was here before you already, so not your fault! But if you feel like doing some valuable cleanup work, it would be great to do some of this cleanup in a follow-up PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just opened #16046. Feel free to assign it to me. I'll try to open a PR for it sometime in the next week

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks - I assigned to you (will keep myself as assignee for review)


@staticmethod
def at_least_one_schema_is_allowed(database: Database) -> bool:
"""
If the user has access to the database or all datasource
1. if schemas_allowed_for_csv_upload is empty
a) if database does not support schema
user is able to upload columnar without specifying schema name
b) if database supports schema
user is able to upload columnar to any schema
2. if schemas_allowed_for_csv_upload is not empty
a) if database does not support schema
This situation is impossible and upload will fail
b) if database supports schema
user is able to upload to schema in schemas_allowed_for_csv_upload
elif the user does not access to the database or all datasource
1. if schemas_allowed_for_csv_upload is empty
a) if database does not support schema
user is unable to upload columnar
b) if database supports schema
user is unable to upload columnar
2. if schemas_allowed_for_csv_upload is not empty
a) if database does not support schema
This situation is impossible and user is unable to upload columnar
b) if database supports schema
user is able to upload to schema in schemas_allowed_for_csv_upload
"""
if security_manager.can_access_database(database):
return True
schemas = database.get_schema_access_for_csv_upload()
if schemas and security_manager.schemas_accessible_by_user(
database, schemas, False
):
return True
return False

name = StringField(
_("Table Name"),
description=_("Name of table to be created from columnar data."),
validators=[DataRequired()],
widget=BS3TextFieldWidget(),
)
columnar_file = MultipleFileField(
_("Columnar File"),
description=_("Select a Columnar file to be uploaded to a database."),
validators=[
DataRequired(),
FileAllowed(
config["ALLOWED_EXTENSIONS"].intersection(
config["COLUMNAR_EXTENSIONS"]
),
_(
"Only the following file extensions are allowed: "
"%(allowed_extensions)s",
allowed_extensions=", ".join(
config["ALLOWED_EXTENSIONS"].intersection(
config["COLUMNAR_EXTENSIONS"]
)
),
),
),
],
)

con = QuerySelectField(
_("Database"),
query_factory=columnar_allowed_dbs,
get_pk=lambda a: a.id,
get_label=lambda a: a.database_name,
)
schema = StringField(
_("Schema"),
description=_("Specify a schema (if database flavor supports this)."),
validators=[Optional()],
widget=BS3TextFieldWidget(),
)
if_exists = SelectField(
_("Table Exists"),
description=_(
"If table exists do one of the following: "
"Fail (do nothing), Replace (drop and recreate table) "
"or Append (insert data)."
),
choices=[
("fail", _("Fail")),
("replace", _("Replace")),
("append", _("Append")),
],
validators=[DataRequired()],
)
usecols = JsonListField(
_("Use Columns"),
default=None,
description=_(
"Json list of the column names that should be read. "
"If not None, only these columns will be read from the file."
),
validators=[Optional()],
)
index = BooleanField(
_("Dataframe Index"), description=_("Write dataframe index as a column.")
)
index_label = StringField(
_("Column Label(s)"),
description=_(
"Column label for index column(s). If None is given "
"and Dataframe Index is True, Index Names are used."
),
validators=[Optional()],
widget=BS3TextFieldWidget(),
)
Loading