[sqllab] assign types for visualize flow #2458

Merged (5 commits, Mar 24, 2017)
1 change: 1 addition & 0 deletions setup.py
@@ -51,6 +51,7 @@ def get_git_sha():
'flask-script==2.0.5',
'flask-sqlalchemy==2.0',
'flask-testing==0.6.1',
'future>=0.16.0, <0.17',
'humanize==0.5.1',
'gunicorn==19.6.0',
'markdown==2.6.8',
128 changes: 80 additions & 48 deletions superset/dataframe.py
@@ -10,6 +10,9 @@
from __future__ import print_function
from __future__ import unicode_literals

from datetime import datetime, date
from past.builtins import basestring

import pandas as pd
import numpy as np

@@ -19,6 +22,22 @@


class SupersetDataFrame(object):
# Mapping numpy dtype.char to generic database types
type_map = {
'b': 'BOOL', # boolean
'i': 'INT', # (signed) integer
'u': 'INT', # unsigned integer
'l': 'INT', # 64bit integer
'f': 'FLOAT', # floating-point
'c': 'FLOAT', # complex-floating point
'm': None, # timedelta
'M': 'DATETIME', # datetime
'O': 'OBJECT', # (Python) objects
'S': 'BYTE', # (byte-)string
'U': 'STRING', # Unicode
'V': None, # raw data (void)
}

def __init__(self, df):
self.__df = df.where((pd.notnull(df)), None)

@@ -30,6 +49,47 @@ def size(self):
def data(self):
return self.__df.to_dict(orient='records')

@classmethod
def db_type(cls, dtype):
[Review comment] shouldn't this be col_type rather than db_type here?

[Author reply (Member)] yes, to be explicit I should differentiate between col_numpy_type and col_db_type, which I'm converting here...

"""Given a numpy dtype, Returns a generic database type"""
return cls.type_map.get(dtype.char)
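
To make the col_numpy_type vs col_db_type distinction from the thread above concrete, here is a minimal sketch of the lookup; it is illustrative only, not part of the diff, assumes a configured Superset environment, and the exact dtype.char values can vary by platform:

    import numpy as np
    from superset.dataframe import SupersetDataFrame

    # int64 usually has dtype.char 'l', which type_map sends to 'INT'
    print(SupersetDataFrame.db_type(np.dtype('int64')))           # 'INT' (None on platforms where the char is 'q')
    print(SupersetDataFrame.db_type(np.dtype('datetime64[ns]')))  # 'DATETIME' (char 'M')
    print(SupersetDataFrame.db_type(np.dtype('object')))          # 'OBJECT' (char 'O')
    # float64's char is 'd', which is not in type_map, so db_type returns None;
    # the columns property below then samples a value and infers 'FLOAT' from it
    print(SupersetDataFrame.db_type(np.dtype('float64')))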

@classmethod
def datetime_conversion_rate(cls, data_series):
success = 0
total = 0
for value in data_series:
total += 1
try:
pd.to_datetime(value)
success += 1
except Exception:
continue
return 100 * success / total
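
A quick sketch of how the conversion-rate heuristic behaves on a mixed column; the sample values are invented and the snippet assumes a configured Superset environment:

    import pandas as pd
    from superset.dataframe import SupersetDataFrame

    series = pd.Series(['2017-01-01', '2017-01-02', 'not a date', '2017-01-04'])
    # 3 of the 4 values parse with pd.to_datetime, so the rate is 75; columns() only
    # flags a STRING column as a date when this rate exceeds INFER_COL_TYPES_THRESHOLD
    print(SupersetDataFrame.datetime_conversion_rate(series))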

@classmethod
def is_date(cls, dtype):
if dtype.name:
return dtype.name.startswith('datetime')

@classmethod
def is_dimension(cls, dtype, column_name):
if cls.is_id(column_name):
return False
return dtype.name in ('object', 'bool')

@classmethod
def is_id(cls, column_name):
return column_name.startswith('id') or column_name.endswith('id')

@classmethod
def agg_func(cls, dtype, column_name):
# consider checking for key substring too.
if cls.is_id(column_name):
return 'count_distinct'
if np.issubdtype(dtype, np.number):
return 'sum'

@property
def columns(self):
"""Provides metadata about columns for data visualization.
@@ -45,22 +105,33 @@ def columns(self):
if sample_size:
sample = self.__df.sample(sample_size)
for col in self.__df.dtypes.keys():
col_db_type = self.db_type(self.__df.dtypes[col])
column = {
'name': col,
'type': self.__df.dtypes[col].name,
'is_date': is_date(self.__df.dtypes[col]),
'is_dim': is_dimension(self.__df.dtypes[col], col),
'agg': self.agg_func(self.__df.dtypes[col], col),
'type': col_db_type,
'is_date': self.is_date(self.__df.dtypes[col]),
'is_dim': self.is_dimension(self.__df.dtypes[col], col),
}
agg = agg_func(self.__df.dtypes[col], col)
if agg_func:
column['agg'] = agg

if column['type'] == 'object':
if column['type'] in ('OBJECT', None):
v = sample[col].iloc[0] if not sample[col].empty else None
if isinstance(v, basestring):
column['type'] = 'STRING'
elif isinstance(v, int):
column['type'] = 'INT'
elif isinstance(v, float):
column['type'] = 'FLOAT'
elif isinstance(v, (datetime, date)):
column['type'] = 'DATETIME'
column['is_date'] = True
column['is_dim'] = False
# check if encoded datetime
if (datetime_conversion_rate(sample[col]) >
if (
column['type'] == 'STRING' and
self.datetime_conversion_rate(sample[col]) >
INFER_COL_TYPES_THRESHOLD):
column.update({
'type': 'datetime_string',
'is_date': True,
'is_dim': False,
'agg': None
@@ -70,42 +141,3 @@
column.pop('agg', None)
columns.append(column)
return columns


# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
success = 0
total = 0
for value in data_series:
total += 1
try:
pd.to_datetime(value)
success += 1
except Exception:
continue
return 100 * success / total


def is_date(dtype):
if dtype.name:
return dtype.name.startswith('datetime')


def is_dimension(dtype, column_name):
if is_id(column_name):
return False
return dtype.name in ('object', 'bool')


def is_id(column_name):
return column_name.startswith('id') or column_name.endswith('id')


def agg_func(dtype, column_name):
# consider checking for key substring too.
if is_id(column_name):
return 'count_distinct'
if np.issubdtype(dtype, np.number):
return 'sum'
return None
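
Putting the dataframe.py pieces together, a small hedged example of the metadata the visualize flow now receives; the DataFrame and column names are invented, and it assumes a configured Superset environment:

    import pandas as pd
    from superset.dataframe import SupersetDataFrame

    df = pd.DataFrame({
        'user_id': [1, 2, 3],
        'label': ['foo', 'bar', 'baz'],
        'ds': ['2017-03-01', '2017-03-02', '2017-03-03'],
    })
    cdf = SupersetDataFrame(df)
    for col in cdf.columns:
        print(col)
    # Roughly expected, per the classmethods above:
    #   user_id -> type 'INT' (on most platforms); is_id() is True, so agg_func() suggests
    #              'count_distinct' and is_dimension() returns False
    #   label   -> object dtype; sampling a str value sets type 'STRING', is_dim True
    #   ds      -> date-like strings; the conversion rate clears the threshold, so is_date
    #              becomes True while type stays 'STRING'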
1 change: 1 addition & 0 deletions superset/views/core.py
@@ -1784,6 +1784,7 @@ def sqllab_viz(self):
filterable=is_dim,
groupby=is_dim,
is_dttm=config.get('is_date', False),
type=config.get('type', False),
)
cols.append(col)
if is_dim:
89 changes: 53 additions & 36 deletions tests/celery_tests.py
@@ -9,6 +9,7 @@
import subprocess
import time
import unittest
from past.builtins import basestring

import pandas as pd

@@ -238,49 +239,65 @@ def test_run_async_query(self):
self.assertEqual(True, query.select_as_cta)
self.assertEqual(True, query.select_as_cta_used)

@staticmethod
def de_unicode_dict(d):
def str_if_basestring(o):
if isinstance(o, basestring):
return str(o)
return o
return {str_if_basestring(k): str_if_basestring(d[k]) for k in d}

@classmethod
def dictify_list_of_dicts(cls, l, k):
return {str(o[k]): cls.de_unicode_dict(o) for o in l}
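
These two helpers normalize the expected and actual column dicts before comparison; a tiny sketch of the effect, with the enclosing test class name assumed here to be CeleryTestCase:

    cols = [{u'name': u'num', u'agg': u'sum'}, {u'name': u'ds', u'is_date': True}]
    print(CeleryTestCase.dictify_list_of_dicts(cols, 'name'))
    # {'num': {'name': 'num', 'agg': 'sum'}, 'ds': {'name': 'ds', 'is_date': True}}
    # Keying by 'name' (and coercing unicode to str) lets assertEqual ignore column order.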

def test_get_columns(self):
main_db = self.get_main_database(db.session)
df = main_db.get_df("SELECT * FROM multiformat_time_series", None)
cdf = dataframe.SupersetDataFrame(df)

# Key the column dicts by name so the comparison does not depend on column ordering
cols = self.dictify_list_of_dicts(cdf.columns, 'name')

if main_db.sqlalchemy_uri.startswith('sqlite'):
self.assertEqual(
[{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'ds2',
'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'datetime_string', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string3', 'is_dim': True}]
, cdf.columns
self.assertEqual(self.dictify_list_of_dicts([
{'is_date': True, 'type': 'STRING', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'ds2',
'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'STRING', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string3', 'is_dim': True}], 'name')
, cols
)
else:
self.assertEqual(
[{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'datetime64[ns]',
'name': 'ds2', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'datetime_string', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string3', 'is_dim': True}]
, cdf.columns
self.assertEqual(self.dictify_list_of_dicts([
{'is_date': True, 'type': 'DATETIME', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'DATETIME',
'name': 'ds2', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'STRING', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string3', 'is_dim': True}], 'name')
, cols
)

