diff --git a/setup.py b/setup.py index 2da3f3254bb2d..033d9ac83ee4e 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ def get_git_sha(): 'flask-script==2.0.5', 'flask-sqlalchemy==2.0', 'flask-testing==0.6.1', + 'future>=0.16.0, <0.17', 'humanize==0.5.1', 'gunicorn==19.6.0', 'markdown==2.6.8', diff --git a/superset/dataframe.py b/superset/dataframe.py index 9f7aa88b88b74..f3b9f3e1be41c 100644 --- a/superset/dataframe.py +++ b/superset/dataframe.py @@ -10,6 +10,9 @@ from __future__ import print_function from __future__ import unicode_literals +from datetime import datetime, date +from past.builtins import basestring + import pandas as pd import numpy as np @@ -19,6 +22,22 @@ class SupersetDataFrame(object): + # Mapping numpy dtype.char to generic database types + type_map = { + 'b': 'BOOL', # boolean + 'i': 'INT', # (signed) integer + 'u': 'INT', # unsigned integer + 'l': 'INT', # 64bit integer + 'f': 'FLOAT', # floating-point + 'c': 'FLOAT', # complex-floating point + 'm': None, # timedelta + 'M': 'DATETIME', # datetime + 'O': 'OBJECT', # (Python) objects + 'S': 'BYTE', # (byte-)string + 'U': 'STRING', # Unicode + 'V': None, # raw data (void) + } + def __init__(self, df): self.__df = df.where((pd.notnull(df)), None) @@ -30,6 +49,47 @@ def size(self): def data(self): return self.__df.to_dict(orient='records') + @classmethod + def db_type(cls, dtype): + """Given a numpy dtype, Returns a generic database type""" + return cls.type_map.get(dtype.char) + + @classmethod + def datetime_conversion_rate(cls, data_series): + success = 0 + total = 0 + for value in data_series: + total += 1 + try: + pd.to_datetime(value) + success += 1 + except Exception: + continue + return 100 * success / total + + @classmethod + def is_date(cls, dtype): + if dtype.name: + return dtype.name.startswith('datetime') + + @classmethod + def is_dimension(cls, dtype, column_name): + if cls.is_id(column_name): + return False + return dtype.name in ('object', 'bool') + + @classmethod + def 
is_id(cls, column_name): + return column_name.startswith('id') or column_name.endswith('id') + + @classmethod + def agg_func(cls, dtype, column_name): + # consider checking for key substring too. + if cls.is_id(column_name): + return 'count_distinct' + if np.issubdtype(dtype, np.number): + return 'sum' + @property def columns(self): """Provides metadata about columns for data visualization. @@ -45,22 +105,33 @@ def columns(self): if sample_size: sample = self.__df.sample(sample_size) for col in self.__df.dtypes.keys(): + col_db_type = self.db_type(self.__df.dtypes[col]) column = { 'name': col, - 'type': self.__df.dtypes[col].name, - 'is_date': is_date(self.__df.dtypes[col]), - 'is_dim': is_dimension(self.__df.dtypes[col], col), + 'agg': self.agg_func(self.__df.dtypes[col], col), + 'type': col_db_type, + 'is_date': self.is_date(self.__df.dtypes[col]), + 'is_dim': self.is_dimension(self.__df.dtypes[col], col), } - agg = agg_func(self.__df.dtypes[col], col) - if agg_func: - column['agg'] = agg - if column['type'] == 'object': + if column['type'] in ('OBJECT', None): + v = sample[col].iloc[0] if not sample[col].empty else None + if isinstance(v, basestring): + column['type'] = 'STRING' + elif isinstance(v, int): + column['type'] = 'INT' + elif isinstance(v, float): + column['type'] = 'FLOAT' + elif isinstance(v, (datetime, date)): + column['type'] = 'DATETIME' + column['is_date'] = True + column['is_dim'] = False # check if encoded datetime - if (datetime_conversion_rate(sample[col]) > + if ( + column['type'] == 'STRING' and + self.datetime_conversion_rate(sample[col]) > INFER_COL_TYPES_THRESHOLD): column.update({ - 'type': 'datetime_string', 'is_date': True, 'is_dim': False, 'agg': None @@ -70,42 +141,3 @@ def columns(self): column.pop('agg', None) columns.append(column) return columns - - -# It will give false positives on the numbers that are stored as strings. 
-# It is hard to distinguish integer numbers and timestamps -def datetime_conversion_rate(data_series): - success = 0 - total = 0 - for value in data_series: - total += 1 - try: - pd.to_datetime(value) - success += 1 - except Exception: - continue - return 100 * success / total - - -def is_date(dtype): - if dtype.name: - return dtype.name.startswith('datetime') - - -def is_dimension(dtype, column_name): - if is_id(column_name): - return False - return dtype.name in ('object', 'bool') - - -def is_id(column_name): - return column_name.startswith('id') or column_name.endswith('id') - - -def agg_func(dtype, column_name): - # consider checking for key substring too. - if is_id(column_name): - return 'count_distinct' - if np.issubdtype(dtype, np.number): - return 'sum' - return None diff --git a/superset/views/core.py b/superset/views/core.py index 3416b36313b6a..ccaa30fd50765 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -1784,6 +1784,7 @@ def sqllab_viz(self): filterable=is_dim, groupby=is_dim, is_dttm=config.get('is_date', False), + type=config.get('type', False), ) cols.append(col) if is_dim: diff --git a/tests/celery_tests.py b/tests/celery_tests.py index 8da39be96c96e..43e1b6f29ef82 100644 --- a/tests/celery_tests.py +++ b/tests/celery_tests.py @@ -9,6 +9,7 @@ import subprocess import time import unittest +from past.builtins import basestring import pandas as pd @@ -238,49 +239,65 @@ def test_run_async_query(self): self.assertEqual(True, query.select_as_cta) self.assertEqual(True, query.select_as_cta_used) + @staticmethod + def de_unicode_dict(d): + def str_if_basestring(o): + if isinstance(o, basestring): + return str(o) + return o + return {str_if_basestring(k): str_if_basestring(d[k]) for k in d} + + @classmethod + def dictify_list_of_dicts(cls, l, k): + return {str(o[k]): cls.de_unicode_dict(o) for o in l} + def test_get_columns(self): main_db = self.get_main_database(db.session) df = main_db.get_df("SELECT * FROM multiformat_time_series", 
None) cdf = dataframe.SupersetDataFrame(df) + + # Key columns by name so the comparison is independent of column order + cols = self.dictify_list_of_dicts(cdf.columns, 'name') + if main_db.sqlalchemy_uri.startswith('sqlite'): - self.assertEqual( - [{'is_date': True, 'type': 'datetime_string', 'name': 'ds', - 'is_dim': False}, - {'is_date': True, 'type': 'datetime_string', 'name': 'ds2', - 'is_dim': False}, - {'agg': 'sum', 'is_date': False, 'type': 'int64', - 'name': 'epoch_ms', 'is_dim': False}, - {'agg': 'sum', 'is_date': False, 'type': 'int64', - 'name': 'epoch_s', 'is_dim': False}, - {'is_date': True, 'type': 'datetime_string', 'name': 'string0', - 'is_dim': False}, - {'is_date': False, 'type': 'object', - 'name': 'string1', 'is_dim': True}, - {'is_date': True, 'type': 'datetime_string', 'name': 'string2', - 'is_dim': False}, - {'is_date': False, 'type': 'object', - 'name': 'string3', 'is_dim': True}] - , cdf.columns + self.assertEqual(self.dictify_list_of_dicts([ + {'is_date': True, 'type': 'STRING', 'name': 'ds', + 'is_dim': False}, + {'is_date': True, 'type': 'STRING', 'name': 'ds2', + 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'INT', + 'name': 'epoch_ms', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'INT', + 'name': 'epoch_s', 'is_dim': False}, + {'is_date': True, 'type': 'STRING', 'name': 'string0', + 'is_dim': False}, + {'is_date': False, 'type': 'STRING', + 'name': 'string1', 'is_dim': True}, + {'is_date': True, 'type': 'STRING', 'name': 'string2', + 'is_dim': False}, + {'is_date': False, 'type': 'STRING', + 'name': 'string3', 'is_dim': True}], 'name') + , cols ) else: - self.assertEqual( - [{'is_date': True, 'type': 'datetime_string', 'name': 'ds', - 'is_dim': False}, - {'is_date': True, 'type': 'datetime64[ns]', - 'name': 'ds2', 'is_dim': False}, - {'agg': 'sum', 'is_date': False, 'type': 'int64', - 'name': 'epoch_ms', 'is_dim': False}, - {'agg': 'sum', 'is_date': False, 'type': 'int64', - 'name': 'epoch_s', 'is_dim': False}, - {'is_date': True, 'type': 
'datetime_string', 'name': 'string0', - 'is_dim': False}, - {'is_date': False, 'type': 'object', - 'name': 'string1', 'is_dim': True}, - {'is_date': True, 'type': 'datetime_string', 'name': 'string2', - 'is_dim': False}, - {'is_date': False, 'type': 'object', - 'name': 'string3', 'is_dim': True}] - , cdf.columns + self.assertEqual(self.dictify_list_of_dicts([ + {'is_date': True, 'type': 'DATETIME', 'name': 'ds', + 'is_dim': False}, + {'is_date': True, 'type': 'DATETIME', + 'name': 'ds2', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'INT', + 'name': 'epoch_ms', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'INT', + 'name': 'epoch_s', 'is_dim': False}, + {'is_date': True, 'type': 'STRING', 'name': 'string0', + 'is_dim': False}, + {'is_date': False, 'type': 'STRING', + 'name': 'string1', 'is_dim': True}, + {'is_date': True, 'type': 'STRING', 'name': 'string2', + 'is_dim': False}, + {'is_date': False, 'type': 'STRING', + 'name': 'string3', 'is_dim': True}], 'name') + , cols )