[sqllab] assign types for visualize flow

Somehow when using the visualize flow, the types were not assigned at all, creating some bugs downstream. This PR attempts to get the information required based on what pandas knows and the types in the data itself.
apache · Mar 23, 2017 · 5c5c130 · 5c5c130
1 parent 65c89f5
commit 5c5c130
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 48 deletions.
diff --git a/setup.py b/setup.py
@@ -51,6 +51,7 @@ def get_git_sha():
         'flask-script==2.0.5',
         'flask-sqlalchemy==2.0',
         'flask-testing==0.6.1',
+        'future>=0.16.0, <0.17',
         'humanize==0.5.1',
         'gunicorn==19.6.0',
         'markdown==2.6.8',

diff --git a/superset/dataframe.py b/superset/dataframe.py
@@ -10,6 +10,9 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
+from datetime import datetime, date
+from past.builtins import basestring
+
 import pandas as pd
 import numpy as np
 
@@ -19,6 +22,21 @@
 
 
 class SupersetDataFrame(object):
+    # Mapping numpy dtype.char to generic database types
+    type_map = {
+        'b': 'BOOL',  # boolean
+        'i': 'INT',  # (signed) integer
+        'u': 'INT',  # unsigned integer
+        'f': 'FLOAT',  # floating-point
+        'c': 'FLOAT',  # complex-floating point
+        'm': None,  # timedelta
+        'M': 'DATETIME',  # datetime
+        'O': 'OBJECT',  # (Python) objects
+        'S': 'BYTE',  # (byte-)string
+        'U': 'STRING',  # Unicode
+        'V': None,   # raw data (void)
+    }
+
     def __init__(self, df):
         self.__df = df.where((pd.notnull(df)), None)
 
@@ -30,6 +48,47 @@ def size(self):
     def data(self):
         return self.__df.to_dict(orient='records')
 
+    @classmethod
+    def db_type(cls, dtype):
+        """Given a numpy dtype, Returns a generic database type"""
+        return cls.type_map.get(dtype.char)
+
+    @classmethod
+    def datetime_conversion_rate(cls, data_series):
+        success = 0
+        total = 0
+        for value in data_series:
+            total += 1
+            try:
+                pd.to_datetime(value)
+                success += 1
+            except Exception:
+                continue
+        return 100 * success / total
+
+    @classmethod
+    def is_date(cls, dtype):
+        if dtype.name:
+            return dtype.name.startswith('datetime')
+
+    @classmethod
+    def is_dimension(cls, dtype, column_name):
+        if cls.is_id(column_name):
+            return False
+        return dtype.name in ('object', 'bool')
+
+    @classmethod
+    def is_id(cls, column_name):
+        return column_name.startswith('id') or column_name.endswith('id')
+
+    @classmethod
+    def agg_func(cls, dtype, column_name):
+        # consider checking for key substring too.
+        if cls.is_id(column_name):
+            return 'count_distinct'
+        if np.issubdtype(dtype, np.number):
+            return 'sum'
+
     @property
     def columns(self):
         """Provides metadata about columns for data visualization.
@@ -45,22 +104,29 @@ def columns(self):
         if sample_size:
             sample = self.__df.sample(sample_size)
         for col in self.__df.dtypes.keys():
+            db_type = self.db_type(self.__df.dtypes[col])
             column = {
                 'name': col,
-                'type': self.__df.dtypes[col].name,
-                'is_date': is_date(self.__df.dtypes[col]),
-                'is_dim': is_dimension(self.__df.dtypes[col], col),
+                'agg': self.agg_func(self.__df.dtypes[col], col),
+                'type': db_type,
+                'is_date': self.is_date(self.__df.dtypes[col]),
+                'is_dim': self.is_dimension(self.__df.dtypes[col], col),
             }
-            agg = agg_func(self.__df.dtypes[col], col)
-            if agg_func:
-                column['agg'] = agg
 
-            if column['type'] == 'object':
+            if column['type'] in ('OBJECT', None):
+                v = sample[col][0]
+                if isinstance(v, basestring):
+                    column['type'] = 'STRING'
+                elif isinstance(v, int):
+                    column['type'] = 'INT'
+                elif isinstance(v, float):
+                    column['type'] = 'FLOAT'
+                elif isinstance(v, (datetime, date)):
+                    column['type'] = 'DATETIME'
                 # check if encoded datetime
-                if (datetime_conversion_rate(sample[col]) >
+                if (self.datetime_conversion_rate(sample[col]) >
                         INFER_COL_TYPES_THRESHOLD):
                     column.update({
-                        'type': 'datetime_string',
                         'is_date': True,
                         'is_dim': False,
                         'agg': None
@@ -70,42 +136,3 @@ def columns(self):
                 column.pop('agg', None)
             columns.append(column)
         return columns
-
-
-# It will give false positives on the numbers that are stored as strings.
-# It is hard to distinguish integer numbers and timestamps
-def datetime_conversion_rate(data_series):
-    success = 0
-    total = 0
-    for value in data_series:
-        total += 1
-        try:
-            pd.to_datetime(value)
-            success += 1
-        except Exception:
-            continue
-    return 100 * success / total
-
-
-def is_date(dtype):
-    if dtype.name:
-        return dtype.name.startswith('datetime')
-
-
-def is_dimension(dtype, column_name):
-    if is_id(column_name):
-        return False
-    return dtype.name in ('object', 'bool')
-
-
-def is_id(column_name):
-    return column_name.startswith('id') or column_name.endswith('id')
-
-
-def agg_func(dtype, column_name):
-    # consider checking for key substring too.
-    if is_id(column_name):
-        return 'count_distinct'
-    if np.issubdtype(dtype, np.number):
-        return 'sum'
-    return None
diff --git a/superset/views/core.py b/superset/views/core.py
@@ -1784,6 +1784,7 @@ def sqllab_viz(self):
                 filterable=is_dim,
                 groupby=is_dim,
                 is_dttm=config.get('is_date', False),
+                type=config.get('type', False),
             )
             cols.append(col)
             if is_dim: