From e9d3491616dbc190838f6d241ae5ce48004f1c4b Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Tue, 23 Feb 2021 17:26:03 -0800 Subject: [PATCH 1/4] WIP --- superset/cli.py | 85 ++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/superset/cli.py b/superset/cli.py index 5955eb54de540..598e8848aaca3 100755 --- a/superset/cli.py +++ b/superset/cli.py @@ -20,6 +20,7 @@ import sys from datetime import datetime, timedelta from subprocess import Popen +from types import ModuleType from typing import Any, Dict, List, Optional, Type, Union from zipfile import ZipFile @@ -108,18 +109,11 @@ def version(verbose: bool) -> None: print(Style.RESET_ALL) -def load_examples_run( - load_test_data: bool, only_metadata: bool = False, force: bool = False +def load_test_data( + examples: ModuleType, + only_metadata: bool = False, + force: bool = False, ) -> None: - if only_metadata: - print("Loading examples metadata") - else: - examples_db = utils.get_example_database() - print(f"Loading examples metadata and related data into {examples_db}") - - from superset import examples - - examples.load_css_templates() print("Loading energy related dataset") examples.load_energy(only_metadata, force) @@ -133,47 +127,54 @@ def load_examples_run( print("Loading [Tabbed dashboard]") examples.load_tabbed_dashboard(only_metadata) - if not load_test_data: - print("Loading [Random time series data]") - examples.load_random_time_series_data(only_metadata, force) + print("Loading additional examples") + examples.load_from_configs(force, load_test_data=True) + + +def load_examples( + examples: ModuleType, only_metadata: bool = False, force: bool = False +) -> None: + print("Loading [Random time series data]") + examples.load_random_time_series_data(only_metadata, force) - print("Loading [Random long/lat data]") - examples.load_long_lat_data(only_metadata, force) + print("Loading [Random long/lat data]") + 
examples.load_long_lat_data(only_metadata, force) - print("Loading [Country Map data]") - examples.load_country_map_data(only_metadata, force) + print("Loading [Country Map data]") + examples.load_country_map_data(only_metadata, force) - print("Loading [Multiformat time series]") - examples.load_multiformat_time_series(only_metadata, force) + print("Loading [Multiformat time series]") + examples.load_multiformat_time_series(only_metadata, force) - print("Loading [Paris GeoJson]") - examples.load_paris_iris_geojson(only_metadata, force) + print("Loading [Paris GeoJson]") + examples.load_paris_iris_geojson(only_metadata, force) - print("Loading [San Francisco population polygons]") - examples.load_sf_population_polygons(only_metadata, force) + print("Loading [San Francisco population polygons]") + examples.load_sf_population_polygons(only_metadata, force) - print("Loading [Flights data]") - examples.load_flights(only_metadata, force) + print("Loading [Flights data]") + examples.load_flights(only_metadata, force) - print("Loading [BART lines]") - examples.load_bart_lines(only_metadata, force) + print("Loading [BART lines]") + examples.load_bart_lines(only_metadata, force) - print("Loading [Multi Line]") - examples.load_multi_line(only_metadata) + print("Loading [Multi Line]") + examples.load_multi_line(only_metadata) - print("Loading [Misc Charts] dashboard") - examples.load_misc_dashboard() + print("Loading [Misc Charts] dashboard") + examples.load_misc_dashboard() - print("Loading DECK.gl demo") - examples.load_deck_dash() + print("Loading DECK.gl demo") + examples.load_deck_dash() - # load examples that are stored as YAML config files - examples.load_from_configs(force, load_test_data) + print("Loading additional examples") + examples.load_from_configs(force, load_test_data=False) @with_appcontext @superset.command() @click.option("--load-test-data", "-t", is_flag=True, help="Load additional test data") +@click.option("--load-big-test-data", "-b", is_flag=True, 
help="Load big test data") @click.option( "--only-metadata", "-m", is_flag=True, help="Only load metadata, skip actual data" ) @@ -184,6 +185,14 @@ def load_examples( load_test_data: bool, only_metadata: bool = False, force: bool = False ) -> None: """Loads a set of Slices and Dashboards and a supporting dataset """ + if only_metadata: + print("Loading examples metadata") + else: + examples_db = utils.get_example_database() + print(f"Loading examples metadata and related data into {examples_db}") + from superset import examples + + examples.load_css_templates() load_examples_run(load_test_data, only_metadata, force) @@ -304,7 +313,9 @@ def export_datasources(datasource_file: Optional[str]) -> None: @superset.command() @with_appcontext @click.option( - "--path", "-p", help="Path to a single ZIP file", + "--path", + "-p", + help="Path to a single ZIP file", ) @click.option( "--username", From 6244d7cb1aa6a4daedce465e8a0ed519438ee4b6 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Wed, 3 Mar 2021 17:49:27 -0800 Subject: [PATCH 2/4] feat: add option to load big/wide tables --- superset/cli.py | 93 +++++++++++----------- superset/examples/__init__.py | 1 + superset/examples/big_data.py | 49 ++++++++++++ superset/utils/data.py | 143 ++++++++++++++++++++++++++++++++++ 4 files changed, 239 insertions(+), 47 deletions(-) create mode 100644 superset/examples/big_data.py create mode 100644 superset/utils/data.py diff --git a/superset/cli.py b/superset/cli.py index 598e8848aaca3..b999f67e8fa3c 100755 --- a/superset/cli.py +++ b/superset/cli.py @@ -20,7 +20,6 @@ import sys from datetime import datetime, timedelta from subprocess import Popen -from types import ModuleType from typing import Any, Dict, List, Optional, Type, Union from zipfile import ZipFile @@ -109,11 +108,21 @@ def version(verbose: bool) -> None: print(Style.RESET_ALL) -def load_test_data( - examples: ModuleType, +def load_examples_run( + load_test_data: bool, + load_big_data: bool, only_metadata: bool = 
False, force: bool = False, ) -> None: + if only_metadata: + print("Loading examples metadata") + else: + examples_db = utils.get_example_database() + print(f"Loading examples metadata and related data into {examples_db}") + + from superset import examples + + examples.load_css_templates() print("Loading energy related dataset") examples.load_energy(only_metadata, force) @@ -127,54 +136,51 @@ def load_test_data( print("Loading [Tabbed dashboard]") examples.load_tabbed_dashboard(only_metadata) - print("Loading additional examples") - examples.load_from_configs(force, load_test_data=True) + if not load_test_data: + print("Loading [Random time series data]") + examples.load_random_time_series_data(only_metadata, force) + print("Loading [Random long/lat data]") + examples.load_long_lat_data(only_metadata, force) -def load_examples( - examples: ModuleType, only_metadata: bool = False, force: bool = False -) -> None: - print("Loading [Random time series data]") - examples.load_random_time_series_data(only_metadata, force) - - print("Loading [Random long/lat data]") - examples.load_long_lat_data(only_metadata, force) + print("Loading [Country Map data]") + examples.load_country_map_data(only_metadata, force) - print("Loading [Country Map data]") - examples.load_country_map_data(only_metadata, force) + print("Loading [Multiformat time series]") + examples.load_multiformat_time_series(only_metadata, force) - print("Loading [Multiformat time series]") - examples.load_multiformat_time_series(only_metadata, force) + print("Loading [Paris GeoJson]") + examples.load_paris_iris_geojson(only_metadata, force) - print("Loading [Paris GeoJson]") - examples.load_paris_iris_geojson(only_metadata, force) + print("Loading [San Francisco population polygons]") + examples.load_sf_population_polygons(only_metadata, force) - print("Loading [San Francisco population polygons]") - examples.load_sf_population_polygons(only_metadata, force) + print("Loading [Flights data]") + 
examples.load_flights(only_metadata, force) - print("Loading [Flights data]") - examples.load_flights(only_metadata, force) + print("Loading [BART lines]") + examples.load_bart_lines(only_metadata, force) - print("Loading [BART lines]") - examples.load_bart_lines(only_metadata, force) + print("Loading [Multi Line]") + examples.load_multi_line(only_metadata) - print("Loading [Multi Line]") - examples.load_multi_line(only_metadata) + print("Loading [Misc Charts] dashboard") + examples.load_misc_dashboard() - print("Loading [Misc Charts] dashboard") - examples.load_misc_dashboard() + print("Loading DECK.gl demo") + examples.load_deck_dash() - print("Loading DECK.gl demo") - examples.load_deck_dash() + if load_big_data: + examples.load_big_data() - print("Loading additional examples") - examples.load_from_configs(force, load_test_data=False) + # load examples that are stored as YAML config files + examples.load_from_configs(force, load_test_data) @with_appcontext @superset.command() @click.option("--load-test-data", "-t", is_flag=True, help="Load additional test data") -@click.option("--load-big-test-data", "-b", is_flag=True, help="Load big test data") +@click.option("--load-big-data", "-b", is_flag=True, help="Load additional big data") @click.option( "--only-metadata", "-m", is_flag=True, help="Only load metadata, skip actual data" ) @@ -182,18 +188,13 @@ def load_examples( "--force", "-f", is_flag=True, help="Force load data even if table already exists" ) def load_examples( - load_test_data: bool, only_metadata: bool = False, force: bool = False + load_test_data: bool, + load_big_data: bool, + only_metadata: bool = False, + force: bool = False, ) -> None: """Loads a set of Slices and Dashboards and a supporting dataset """ - if only_metadata: - print("Loading examples metadata") - else: - examples_db = utils.get_example_database() - print(f"Loading examples metadata and related data into {examples_db}") - from superset import examples - - 
examples.load_css_templates() - load_examples_run(load_test_data, only_metadata, force) + load_examples_run(load_test_data, load_big_data, only_metadata, force) @with_appcontext @@ -313,9 +314,7 @@ def export_datasources(datasource_file: Optional[str]) -> None: @superset.command() @with_appcontext @click.option( - "--path", - "-p", - help="Path to a single ZIP file", + "--path", "-p", help="Path to a single ZIP file", ) @click.option( "--username", diff --git a/superset/examples/__init__.py b/superset/examples/__init__.py index b8a844739b920..161a52f4b4d19 100644 --- a/superset/examples/__init__.py +++ b/superset/examples/__init__.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. from .bart_lines import load_bart_lines +from .big_data import load_big_data from .birth_names import load_birth_names from .country_map import load_country_map_data from .css_templates import load_css_templates diff --git a/superset/examples/big_data.py b/superset/examples/big_data.py new file mode 100644 index 0000000000000..eb8402fcf6b92 --- /dev/null +++ b/superset/examples/big_data.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+from typing import List + +import sqlalchemy.sql.sqltypes + +from superset.utils.data import add_data, ColumnInfo + +COLUMN_TYPES = [ + sqlalchemy.sql.sqltypes.INTEGER(), + sqlalchemy.sql.sqltypes.VARCHAR(length=255), + sqlalchemy.sql.sqltypes.TEXT(), + sqlalchemy.sql.sqltypes.BOOLEAN(), + sqlalchemy.sql.sqltypes.FLOAT(), + sqlalchemy.sql.sqltypes.DATE(), + sqlalchemy.sql.sqltypes.TIME(), + sqlalchemy.sql.sqltypes.DATETIME(), +] + + +def load_big_data() -> None: + # create table with 100 columns to test SQL Lab + columns: List[ColumnInfo] = [] + for i in range(100): + column: ColumnInfo = { + "name": f"col{i}", + "type": COLUMN_TYPES[i % len(COLUMN_TYPES)], + "nullable": False, + "default": None, + "autoincrement": "auto", + "primary_key": 1 if i == 0 else 0, + } + columns.append(column) + + add_data(columns=columns, num_rows=1000, table_name="wide_table") diff --git a/superset/utils/data.py b/superset/utils/data.py new file mode 100644 index 0000000000000..e468d1ed18b17 --- /dev/null +++ b/superset/utils/data.py @@ -0,0 +1,143 @@ +import random +import string +import sys +from datetime import date, datetime, time, timedelta +from typing import Any, Callable, cast, Dict, List, Optional + +import sqlalchemy.sql.sqltypes +from flask_appbuilder import Model +from sqlalchemy import Column, inspect, MetaData, Table +from sqlalchemy.sql.visitors import VisitableType +from typing_extensions import TypedDict + +ColumnInfo = TypedDict( + "ColumnInfo", + { + "name": str, + "type": VisitableType, + "nullable": bool, + "default": Optional[Any], + "autoincrement": str, + "primary_key": int, + }, +) + + +example_column = { + "name": "id", + "type": sqlalchemy.sql.sqltypes.INTEGER(), + "nullable": False, + "default": None, + "autoincrement": "auto", + "primary_key": 1, +} + + +MINIMUM_DATE = date(1900, 1, 1) +MAXIMUM_DATE = date.today() +days_range = (MAXIMUM_DATE - MINIMUM_DATE).days + + +def get_type_generator(sqltype: sqlalchemy.sql.sqltypes) -> Callable[[], Any]: + if 
isinstance(sqltype, sqlalchemy.sql.sqltypes.INTEGER): + return lambda: random.randrange(2147483647) + + if isinstance(sqltype, sqlalchemy.sql.sqltypes.BIGINT): + return lambda: random.randrange(sys.maxsize) + + if isinstance(sqltype, sqlalchemy.sql.sqltypes.VARCHAR): + length = random.randrange(sqltype.length or 255) + return lambda: "".join(random.choices(string.printable, k=length)) + + if isinstance(sqltype, sqlalchemy.sql.sqltypes.TEXT): + length = random.randrange(65535) + return lambda: "".join(random.choices(string.printable, k=length)) + + if isinstance(sqltype, sqlalchemy.sql.sqltypes.BOOLEAN): + return lambda: random.choice([True, False]) + + if isinstance( + sqltype, (sqlalchemy.sql.sqltypes.FLOAT, sqlalchemy.sql.sqltypes.REAL) + ): + return lambda: random.uniform(-sys.maxsize, sys.maxsize) + + if isinstance(sqltype, sqlalchemy.sql.sqltypes.DATE): + return lambda: MINIMUM_DATE + timedelta(days=random.randrange(days_range)) + + if isinstance(sqltype, sqlalchemy.sql.sqltypes.TIME): + return lambda: time( + random.randrange(24), random.randrange(60), random.randrange(60), + ) + + if isinstance( + sqltype, (sqlalchemy.sql.sqltypes.TIMESTAMP, sqlalchemy.sql.sqltypes.DATETIME) + ): + return lambda: datetime.fromordinal(MINIMUM_DATE.toordinal()) + timedelta( + seconds=random.randrange(days_range * 86400) + ) + + raise Exception(f"Unknown type {sqltype}. Please add it to `get_type_generator`.") + + +def add_data( + columns: Optional[List[ColumnInfo]], + num_rows: int, + table_name: str, + append: bool = False, +) -> None: + """ + Generate synthetic data for testing migrations and features. + + If the table already exists `columns` can be `None`. + + :param Optional[List[ColumnInfo]] columns: list of column names and types to create + :param int num_rows: how many rows to generate and insert + :param str table_name: name of table, will be created if it doesn't exist + :param bool append: if the table already exists, append data or replace?
+ """ + from superset.utils.core import get_example_database + + database = get_example_database() + table_exists = database.has_table_by_name(table_name) + engine = database.get_sqla_engine() + + if columns is None: + if not table_exists: + raise Exception( + f"The table {table_name} does not exist. To create it you need to pass a " + "list of column names and types." + ) + + inspector = inspect(engine) + columns = inspector.get_columns(table_name) + + # create table if needed + column_objects = get_column_objects(columns) + metadata = MetaData() + table = Table(table_name, metadata, *column_objects) + metadata.create_all(engine) + + data = generate_data(columns, num_rows) + engine.execute(table.insert(), data) + + +def get_column_objects(columns: List[ColumnInfo]) -> List[Column]: + out = [] + for column in columns: + kwargs = cast(Dict[str, Any], column.copy()) + kwargs["type_"] = kwargs.pop("type") + out.append(Column(**kwargs)) + return out + + +def generate_data(columns: List[ColumnInfo], num_rows: int) -> List[Dict[str, Any]]: + keys = [column["name"] for column in columns] + return [ + dict(zip(keys, row)) + for row in zip(*[generate_column_data(column, num_rows) for column in columns]) + ] + + +def generate_column_data(column: ColumnInfo, num_rows: int) -> List[Any]: + func = get_type_generator(column["type"]) + return [func() for _ in range(num_rows)] From 95edbd4724a6720887d9876c48796f009e884d63 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Wed, 3 Mar 2021 18:52:22 -0800 Subject: [PATCH 3/4] Fix lint --- superset/utils/data.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/superset/utils/data.py b/superset/utils/data.py index e468d1ed18b17..f09d6666b03db 100644 --- a/superset/utils/data.py +++ b/superset/utils/data.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. import random import string import sys @@ -5,7 +21,6 @@ from typing import Any, Callable, cast, Dict, List, Optional import sqlalchemy.sql.sqltypes -from flask_appbuilder import Model from sqlalchemy import Column, inspect, MetaData, Table from sqlalchemy.sql.visitors import VisitableType from typing_extensions import TypedDict @@ -83,7 +98,7 @@ def add_data( columns: Optional[List[ColumnInfo]], num_rows: int, table_name: str, - append: bool = False, + append: bool = True, ) -> None: """ Generate synthetic data for testing migrations and features. @@ -104,8 +119,8 @@ def add_data( if columns is None: if not table_exists: raise Exception( - f"The table {table_name} does not exist. To create it you need to pass a " - "list of column names and types." + f"The table {table_name} does not exist. To create it you need to " + "pass a list of column names and types." 
) inspector = inspect(engine) @@ -117,7 +132,12 @@ def add_data( table = Table(table_name, metadata, *column_objects) metadata.create_all(engine) + if not append: + # pylint: disable=no-value-for-parameter (sqlalchemy/issues/4656) + engine.execute(table.delete()) + data = generate_data(columns, num_rows) + # pylint: disable=no-value-for-parameter (sqlalchemy/issues/4656) engine.execute(table.insert(), data) From e3aba40f965eed320408472bef50bd4c9609ba08 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Thu, 11 Mar 2021 08:06:35 -0800 Subject: [PATCH 4/4] Address comments --- superset/cli.py | 5 +++-- superset/utils/data.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/superset/cli.py b/superset/cli.py index b999f67e8fa3c..c0cd984825091 100755 --- a/superset/cli.py +++ b/superset/cli.py @@ -109,8 +109,8 @@ def version(verbose: bool) -> None: def load_examples_run( - load_test_data: bool, - load_big_data: bool, + load_test_data: bool = False, + load_big_data: bool = False, only_metadata: bool = False, force: bool = False, ) -> None: @@ -171,6 +171,7 @@ def load_examples_run( examples.load_deck_dash() if load_big_data: + print("Loading big synthetic data for tests") examples.load_big_data() # load examples that are stored as YAML config files diff --git a/superset/utils/data.py b/superset/utils/data.py index f09d6666b03db..9a1987c41dc85 100644 --- a/superset/utils/data.py +++ b/superset/utils/data.py @@ -66,6 +66,8 @@ def get_type_generator(sqltype: sqlalchemy.sql.sqltypes) -> Callable[[], Any]: if isinstance(sqltype, sqlalchemy.sql.sqltypes.TEXT): length = random.randrange(65535) + # "practicality beats purity" + length = max(length, 2048) return lambda: "".join(random.choices(string.printable, k=length)) if isinstance(sqltype, sqlalchemy.sql.sqltypes.BOOLEAN):