Skip to content

Commit

Permalink
[sqllab] assign types for visualize flow
Browse files Browse the repository at this point in the history
Somehow when using the visualize flow, the types were not
assigned at all, creating some bugs downstream. This PR attempts to get
the information required based on what pandas knows and the types in
the data itself.
  • Loading branch information
mistercrunch committed Mar 23, 2017
1 parent 65c89f5 commit 5c5c130
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 48 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def get_git_sha():
'flask-script==2.0.5',
'flask-sqlalchemy==2.0',
'flask-testing==0.6.1',
'future>=0.16.0, <0.17',
'humanize==0.5.1',
'gunicorn==19.6.0',
'markdown==2.6.8',
Expand Down
123 changes: 75 additions & 48 deletions superset/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from __future__ import print_function
from __future__ import unicode_literals

from datetime import datetime, date
from past.builtins import basestring

import pandas as pd
import numpy as np

Expand All @@ -19,6 +22,21 @@


class SupersetDataFrame(object):
# Mapping numpy dtype.kind codes to generic database types
type_map = {
'b': 'BOOL', # boolean
'i': 'INT', # (signed) integer
'u': 'INT', # unsigned integer
'f': 'FLOAT', # floating-point
'c': 'FLOAT', # complex-floating point
'm': None, # timedelta
'M': 'DATETIME', # datetime
'O': 'OBJECT', # (Python) objects
'S': 'BYTE', # (byte-)string
'U': 'STRING', # Unicode
'V': None, # raw data (void)
}

def __init__(self, df):
self.__df = df.where((pd.notnull(df)), None)

Expand All @@ -30,6 +48,47 @@ def size(self):
def data(self):
return self.__df.to_dict(orient='records')

@classmethod
def db_type(cls, dtype):
    """Given a numpy dtype, return a generic database type.

    ``type_map`` is keyed by numpy *kind* codes ('b', 'i', 'u', 'f', 'c',
    'm', 'M', 'O', 'S', 'U', 'V'), so the lookup must use ``dtype.kind``.
    ``dtype.char`` is a different code for most dtypes (for example '?'
    for bool and 'd' for float64), which made the lookup miss, and it is
    'b' for int8, which was wrongly reported as 'BOOL'.

    Returns None when the kind has no entry or maps to None.
    """
    return cls.type_map.get(dtype.kind)

@classmethod
def datetime_conversion_rate(cls, data_series):
    """Return the percentage (0-100) of values ``pd.to_datetime`` accepts.

    Each value of ``data_series`` is passed to ``pd.to_datetime`` on its
    own; a value counts as a success when the call does not raise.

    An empty ``data_series`` yields 0 rather than raising
    ZeroDivisionError.
    """
    success = 0
    total = 0
    # Counted by hand so that any iterable works, not only sized ones.
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
        except Exception:
            # Deliberate best effort: any failure means "not a datetime".
            continue
        success += 1
    if not total:
        return 0
    return 100 * success / total

@classmethod
def is_date(cls, dtype):
    """Return True when ``dtype.name`` starts with 'datetime'.

    Always returns a bool: a dtype whose ``name`` is empty or None gives
    False instead of falling through to an implicit None.
    """
    name = dtype.name
    return bool(name) and name.startswith('datetime')

@classmethod
def is_dimension(cls, dtype, column_name):
    """Tell whether a column should be treated as a dimension.

    A column that ``is_id`` recognises by name is never a dimension;
    otherwise the column qualifies when its dtype name is 'object' or
    'bool'.
    """
    id_like = cls.is_id(column_name)
    return (not id_like) and dtype.name in ('object', 'bool')

@classmethod
def is_id(cls, column_name):
    """Return True when ``column_name`` begins or ends with 'id'."""
    if column_name.startswith('id'):
        return True
    return column_name.endswith('id')

@classmethod
def agg_func(cls, dtype, column_name):
    """Pick a default aggregate for a column.

    Returns 'count_distinct' when ``is_id`` matches the column name,
    'sum' when ``dtype`` is a numpy numeric sub-dtype, and None otherwise.
    """
    # consider checking for key substring too.
    if cls.is_id(column_name):
        agg = 'count_distinct'
    elif np.issubdtype(dtype, np.number):
        agg = 'sum'
    else:
        agg = None
    return agg

@property
def columns(self):
"""Provides metadata about columns for data visualization.
Expand All @@ -45,22 +104,29 @@ def columns(self):
if sample_size:
sample = self.__df.sample(sample_size)
for col in self.__df.dtypes.keys():
db_type = self.db_type(self.__df.dtypes[col])
column = {
'name': col,
'type': self.__df.dtypes[col].name,
'is_date': is_date(self.__df.dtypes[col]),
'is_dim': is_dimension(self.__df.dtypes[col], col),
'agg': self.agg_func(self.__df.dtypes[col], col),
'type': db_type,
'is_date': self.is_date(self.__df.dtypes[col]),
'is_dim': self.is_dimension(self.__df.dtypes[col], col),
}
agg = agg_func(self.__df.dtypes[col], col)
if agg_func:
column['agg'] = agg

if column['type'] == 'object':
if column['type'] in ('OBJECT', None):
v = sample[col][0]
if isinstance(v, basestring):
column['type'] = 'STRING'
elif isinstance(v, int):
column['type'] = 'INT'
elif isinstance(v, float):
column['type'] = 'FLOAT'
elif isinstance(v, (datetime, date)):
column['type'] = 'DATETIME'
# check if encoded datetime
if (datetime_conversion_rate(sample[col]) >
if (self.datetime_conversion_rate(sample[col]) >
INFER_COL_TYPES_THRESHOLD):
column.update({
'type': 'datetime_string',
'is_date': True,
'is_dim': False,
'agg': None
Expand All @@ -70,42 +136,3 @@ def columns(self):
column.pop('agg', None)
columns.append(column)
return columns


# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
    """Return the percentage (0-100) of values ``pd.to_datetime`` accepts.

    Each value of ``data_series`` is passed to ``pd.to_datetime`` on its
    own; a value counts as a success when the call does not raise.

    An empty ``data_series`` yields 0 rather than raising
    ZeroDivisionError.
    """
    success = 0
    total = 0
    # Counted by hand so that any iterable works, not only sized ones.
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
        except Exception:
            # Deliberate best effort: any failure means "not a datetime".
            continue
        success += 1
    if not total:
        return 0
    return 100 * success / total


def is_date(dtype):
    """Return True when ``dtype.name`` starts with 'datetime'.

    Always returns a bool: a dtype whose ``name`` is empty or None gives
    False instead of falling through to an implicit None.
    """
    name = dtype.name
    return bool(name) and name.startswith('datetime')


def is_dimension(dtype, column_name):
    """Tell whether a column should be treated as a dimension.

    A column that ``is_id`` recognises by name is never a dimension;
    otherwise the column qualifies when its dtype name is 'object' or
    'bool'.
    """
    id_like = is_id(column_name)
    return (not id_like) and dtype.name in ('object', 'bool')


def is_id(column_name):
    """Return True when ``column_name`` begins or ends with 'id'."""
    if column_name.startswith('id'):
        return True
    return column_name.endswith('id')


def agg_func(dtype, column_name):
    """Pick a default aggregate for a column.

    Returns 'count_distinct' when ``is_id`` matches the column name,
    'sum' when ``dtype`` is a numpy numeric sub-dtype, and None otherwise.
    """
    # consider checking for key substring too.
    if is_id(column_name):
        agg = 'count_distinct'
    elif np.issubdtype(dtype, np.number):
        agg = 'sum'
    else:
        agg = None
    return agg
1 change: 1 addition & 0 deletions superset/views/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1784,6 +1784,7 @@ def sqllab_viz(self):
filterable=is_dim,
groupby=is_dim,
is_dttm=config.get('is_date', False),
type=config.get('type', False),
)
cols.append(col)
if is_dim:
Expand Down

0 comments on commit 5c5c130

Please sign in to comment.