[sqllab] assign types for visualize flow #2458

Merged (5 commits, Mar 24, 2017)
1 change: 1 addition & 0 deletions setup.py
@@ -51,6 +51,7 @@ def get_git_sha():
'flask-script==2.0.5',
'flask-sqlalchemy==2.0',
'flask-testing==0.6.1',
'future>=0.16.0, <0.17',
'humanize==0.5.1',
'gunicorn==19.6.0',
'markdown==2.6.8',
128 changes: 80 additions & 48 deletions superset/dataframe.py
@@ -10,6 +10,9 @@
from __future__ import print_function
from __future__ import unicode_literals

from datetime import datetime, date
from past.builtins import basestring

import pandas as pd
import numpy as np

@@ -19,6 +22,22 @@


class SupersetDataFrame(object):
# Mapping numpy dtype.char to generic database types
type_map = {
'b': 'BOOL', # boolean
'i': 'INT', # (signed) integer
'u': 'INT', # unsigned integer
'l': 'INT', # 64bit integer
'f': 'FLOAT', # floating-point
'c': 'FLOAT', # complex-floating point
'm': None, # timedelta
'M': 'DATETIME', # datetime
'O': 'OBJECT', # (Python) objects
'S': 'BYTE', # (byte-)string
'U': 'STRING', # Unicode
'V': None, # raw data (void)
}

def __init__(self, df):
self.__df = df.where((pd.notnull(df)), None)

@@ -30,6 +49,47 @@ def size(self):
def data(self):
return self.__df.to_dict(orient='records')

@classmethod
def db_type(cls, dtype):
[Review comment] shouldn't this be col_type rather than db_type here?

[Author reply (Member)] yes, to be explicit I should differentiate between col_numpy_type and col_db_type, which I'm converting here...

"""Given a numpy dtype, Returns a generic database type"""
return cls.type_map.get(dtype.char)
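
To make the col_numpy_type vs col_db_type distinction from the thread above concrete, here is a minimal sketch of the lookup; it is illustrative only, not part of the diff, assumes a configured Superset environment, and the exact dtype.char values can vary by platform:

    import numpy as np
    from superset.dataframe import SupersetDataFrame

    # int64 usually has dtype.char 'l', which type_map sends to 'INT'
    print(SupersetDataFrame.db_type(np.dtype('int64')))           # 'INT' (None on platforms where the char is 'q')
    print(SupersetDataFrame.db_type(np.dtype('datetime64[ns]')))  # 'DATETIME' (char 'M')
    print(SupersetDataFrame.db_type(np.dtype('object')))          # 'OBJECT' (char 'O')
    # float64's char is 'd', which is not in type_map, so db_type returns None;
    # the columns property below then samples a value and infers 'FLOAT' from it
    print(SupersetDataFrame.db_type(np.dtype('float64')))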

@classmethod
def datetime_conversion_rate(cls, data_series):
success = 0
total = 0
for value in data_series:
total += 1
try:
pd.to_datetime(value)
success += 1
except Exception:
continue
return 100 * success / total
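
A quick sketch of how the conversion-rate heuristic behaves on a mixed column; the sample values are invented and the snippet assumes a configured Superset environment:

    import pandas as pd
    from superset.dataframe import SupersetDataFrame

    series = pd.Series(['2017-01-01', '2017-01-02', 'not a date', '2017-01-04'])
    # 3 of the 4 values parse with pd.to_datetime, so the rate is 75; columns() only
    # flags a STRING column as a date when this rate exceeds INFER_COL_TYPES_THRESHOLD
    print(SupersetDataFrame.datetime_conversion_rate(series))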

@classmethod
def is_date(cls, dtype):
if dtype.name:
return dtype.name.startswith('datetime')

@classmethod
def is_dimension(cls, dtype, column_name):
if cls.is_id(column_name):
return False
return dtype.name in ('object', 'bool')

@classmethod
def is_id(cls, column_name):
return column_name.startswith('id') or column_name.endswith('id')

@classmethod
def agg_func(cls, dtype, column_name):
# consider checking for key substring too.
if cls.is_id(column_name):
return 'count_distinct'
if np.issubdtype(dtype, np.number):
return 'sum'

@property
def columns(self):
"""Provides metadata about columns for data visualization.
@@ -45,22 +105,33 @@ def columns(self):
if sample_size:
sample = self.__df.sample(sample_size)
for col in self.__df.dtypes.keys():
col_db_type = self.db_type(self.__df.dtypes[col])
column = {
'name': col,
'type': self.__df.dtypes[col].name,
'is_date': is_date(self.__df.dtypes[col]),
'is_dim': is_dimension(self.__df.dtypes[col], col),
'agg': self.agg_func(self.__df.dtypes[col], col),
'type': col_db_type,
'is_date': self.is_date(self.__df.dtypes[col]),
'is_dim': self.is_dimension(self.__df.dtypes[col], col),
}
agg = agg_func(self.__df.dtypes[col], col)
if agg_func:
column['agg'] = agg

if column['type'] == 'object':
if column['type'] in ('OBJECT', None):
v = sample[col].iloc[0] if not sample[col].empty else None
if isinstance(v, basestring):
column['type'] = 'STRING'
elif isinstance(v, int):
column['type'] = 'INT'
elif isinstance(v, float):
column['type'] = 'FLOAT'
elif isinstance(v, (datetime, date)):
column['type'] = 'DATETIME'
column['is_date'] = True
column['is_dim'] = False
# check if encoded datetime
if (datetime_conversion_rate(sample[col]) >
if (
column['type'] == 'STRING' and
self.datetime_conversion_rate(sample[col]) >
INFER_COL_TYPES_THRESHOLD):
column.update({
'type': 'datetime_string',
'is_date': True,
'is_dim': False,
'agg': None
@@ -70,42 +141,3 @@
column.pop('agg', None)
columns.append(column)
return columns


# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
success = 0
total = 0
for value in data_series:
total += 1
try:
pd.to_datetime(value)
success += 1
except Exception:
continue
return 100 * success / total


def is_date(dtype):
if dtype.name:
return dtype.name.startswith('datetime')


def is_dimension(dtype, column_name):
if is_id(column_name):
return False
return dtype.name in ('object', 'bool')


def is_id(column_name):
return column_name.startswith('id') or column_name.endswith('id')


def agg_func(dtype, column_name):
# consider checking for key substring too.
if is_id(column_name):
return 'count_distinct'
if np.issubdtype(dtype, np.number):
return 'sum'
return None
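
Putting the dataframe.py pieces together, a small hedged example of the metadata the visualize flow now receives; the DataFrame and column names are invented, and it assumes a configured Superset environment:

    import pandas as pd
    from superset.dataframe import SupersetDataFrame

    df = pd.DataFrame({
        'user_id': [1, 2, 3],
        'label': ['foo', 'bar', 'baz'],
        'ds': ['2017-03-01', '2017-03-02', '2017-03-03'],
    })
    cdf = SupersetDataFrame(df)
    for col in cdf.columns:
        print(col)
    # Roughly expected, per the classmethods above:
    #   user_id -> type 'INT' (on most platforms); is_id() is True, so agg_func() suggests
    #              'count_distinct' and is_dimension() returns False
    #   label   -> object dtype; sampling a str value sets type 'STRING', is_dim True
    #   ds      -> date-like strings; the conversion rate clears the threshold, so is_date
    #              becomes True while type stays 'STRING'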
1 change: 1 addition & 0 deletions superset/views/core.py
@@ -1784,6 +1784,7 @@ def sqllab_viz(self):
filterable=is_dim,
groupby=is_dim,
is_dttm=config.get('is_date', False),
type=config.get('type', False),
)
cols.append(col)
if is_dim:
89 changes: 53 additions & 36 deletions tests/celery_tests.py
@@ -9,6 +9,7 @@
import subprocess
import time
import unittest
from past.builtins import basestring

import pandas as pd

@@ -238,49 +239,65 @@ def test_run_async_query(self):
self.assertEqual(True, query.select_as_cta)
self.assertEqual(True, query.select_as_cta_used)

@staticmethod
def de_unicode_dict(d):
def str_if_basestring(o):
if isinstance(o, basestring):
return str(o)
return o
return {str_if_basestring(k): str_if_basestring(d[k]) for k in d}

@classmethod
def dictify_list_of_dicts(cls, l, k):
return {str(o[k]): cls.de_unicode_dict(o) for o in l}
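
These two helpers normalize the expected and actual column dicts before comparison; a tiny sketch of the effect, with the enclosing test class name assumed here to be CeleryTestCase:

    cols = [{u'name': u'num', u'agg': u'sum'}, {u'name': u'ds', u'is_date': True}]
    print(CeleryTestCase.dictify_list_of_dicts(cols, 'name'))
    # {'num': {'name': 'num', 'agg': 'sum'}, 'ds': {'name': 'ds', 'is_date': True}}
    # Keying by 'name' (and coercing unicode to str) lets assertEqual ignore column order.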

def test_get_columns(self):
main_db = self.get_main_database(db.session)
df = main_db.get_df("SELECT * FROM multiformat_time_series", None)
cdf = dataframe.SupersetDataFrame(df)

# Key the column dicts by name so the comparison does not depend on column ordering
cols = self.dictify_list_of_dicts(cdf.columns, 'name')

if main_db.sqlalchemy_uri.startswith('sqlite'):
self.assertEqual(
[{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'ds2',
'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'datetime_string', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string3', 'is_dim': True}]
, cdf.columns
self.assertEqual(self.dictify_list_of_dicts([
{'is_date': True, 'type': 'STRING', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'ds2',
'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'STRING', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string3', 'is_dim': True}], 'name')
, cols
)
else:
self.assertEqual(
[{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'datetime64[ns]',
'name': 'ds2', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'datetime_string', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string3', 'is_dim': True}]
, cdf.columns
self.assertEqual(self.dictify_list_of_dicts([
{'is_date': True, 'type': 'DATETIME', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'DATETIME',
'name': 'ds2', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'STRING', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string3', 'is_dim': True}], 'name')
, cols
)

