Moving Current SQL Executor changes to new branch #119

Merged 18 commits on Oct 23, 2020
2 changes: 1 addition & 1 deletion lux/action/correlation.py
@@ -47,7 +47,7 @@ def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
recommendation = {"action": "Correlation",
"description": "Show relationships between two <p class='highlight-descriptor'>quantitative</p> attributes."}
ignore_rec_flag = False
if (len(ldf)<5): # Doesn't make sense to compute correlation if less than 4 data values
if (len(ldf)<5 and ldf.executor_type == "Pandas"): # Doesn't make sense to compute correlation if less than 4 data values
ignore_rec_flag = True
# Then use the data populated in the vis list to compute score
for vis in vlist:
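
The added clause restricts this small-data guard to the Pandas executor: a SQL-backed LuxDataFrame keeps no rows in memory, so len(ldf) reports 0 and the old check would have suppressed the recommendation for every database-backed table. A minimal sketch of the pattern, assuming only that executor_type is set to either "Pandas" or "SQL"; the helper function itself is hypothetical, not part of Lux:

def should_skip_small_data_action(ldf, min_rows: int) -> bool:
    # Only trust len(ldf) when the rows actually live in the local dataframe;
    # a SQL-backed frame reports len() == 0 even when the table is large.
    if ldf.executor_type != "Pandas":
        return False
    return len(ldf) < min_rows

The same guard is applied to the histogram and line-chart thresholds in univariate.py below.
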
4 changes: 2 additions & 2 deletions lux/action/univariate.py
@@ -49,7 +49,7 @@ def univariate(ldf, *args):
intent.extend(filter_specs)
recommendation = {"action":"Distribution",
"description":"Show univariate histograms of <p class='highlight-descriptor'>quantitative</p> attributes."}
if (len(ldf)<5): # Doesn't make sense to generate a histogram if there is less than 5 datapoints (pre-aggregated)
if (len(ldf)<5 and ldf.executor_type == "Pandas"): # Doesn't make sense to generate a histogram if there is less than 5 datapoints (pre-aggregated)
ignore_rec_flag = True
elif (data_type_constraint == "nominal"):
intent = [lux.Clause("?",data_type="nominal")]
@@ -61,7 +61,7 @@
intent.extend(filter_specs)
recommendation = {"action":"Temporal",
"description":"Show trends over <p class='highlight-descriptor'>time-related</p> attributes."}
if (len(ldf)<3): # Doesn't make sense to generate a line chart if there is less than 3 datapoints (pre-aggregated)
if (len(ldf)<3 and ldf.executor_type == "Pandas"): # Doesn't make sense to generate a line chart if there is less than 3 datapoints (pre-aggregated)
ignore_rec_flag = True
if (ignore_rec_flag):
recommendation["collection"] = []
115 changes: 17 additions & 98 deletions lux/core/frame.py
@@ -18,6 +18,7 @@
from lux.vis.Vis import Vis
from lux.vis.VisList import VisList
from lux.history.history import History
from lux.utils.date_utils import is_datetime_series
from lux.utils.message import Message
from lux.utils.utils import check_import_lux_widget
from typing import Dict, Union, List, Callable
@@ -28,7 +29,7 @@ class LuxDataFrame(pd.DataFrame):
A subclass of pd.DataFrame that supports all dataframe operations while housing other variables and functions for generating visual recommendations.
'''
# MUST register here for new properties!!
_metadata = ['_intent','data_type_lookup','data_type',
_metadata = ['_intent','data_type_lookup','data_type', 'length',
'data_model_lookup','data_model','unique_values','cardinality','_rec_info', '_pandas_only',
'_min_max','plot_config', '_current_vis','_widget', '_recommendation','_prev','_history', '_saved_export']

@@ -54,6 +55,7 @@ def __init__(self,*args, **kw):
self._message = Message()
self._pandas_only=False
# Metadata
self.length = None
self.data_type_lookup = None
self.data_type = None
self.data_model_lookup = None
@@ -77,7 +79,7 @@ def history(self):
return self._history
def maintain_metadata(self):
if (not hasattr(self,"_metadata_fresh") or not self._metadata_fresh ): # Check that metadata has not yet been computed
if (len(self)>0): #only compute metadata information if the dataframe is non-empty
if (len(self)>0 or self.executor_type == "SQL"): #only compute metadata information if the dataframe is non-empty
self.executor.compute_stats(self)
self.executor.compute_dataset_metadata(self)
self._infer_structure()
@@ -92,6 +94,7 @@ def expire_recs(self):
def expire_metadata(self):
# Set metadata as null
self._metadata_fresh = False
self.length = None
self.data_type_lookup = None
self.data_type = None
self.data_model_lookup = None
@@ -113,14 +116,14 @@ def _set_axis(self, axis, labels):
super(LuxDataFrame, self)._set_axis(axis, labels)
self.expire_metadata()
self.expire_recs()
def _set_item(self, key, value):
super(LuxDataFrame, self)._set_item(key, value)
self.expire_metadata()
self.expire_recs()
def _update_inplace(self,*args,**kwargs):
super(LuxDataFrame, self)._update_inplace(*args,**kwargs)
self.expire_metadata()
self.expire_recs()
def _set_item(self, key, value):
super(LuxDataFrame, self)._set_item(key, value)
self.expire_metadata()
self.expire_recs()
@property
def default_display(self):
if (self._default_pandas_display):
@@ -146,12 +149,12 @@ def _infer_structure(self):
# If the dataframe is very small and the index column is not a range index, then it is likely that this is an aggregated data
is_multi_index_flag = self.index.nlevels !=1
not_int_index_flag = self.index.dtype !='int64'
small_df_flag = len(self)<100
small_df_flag = len(self)<100 and len(self)>0
self.pre_aggregated = (is_multi_index_flag or not_int_index_flag) and small_df_flag
if ("Number of Records" in self.columns):
self.pre_aggregated = True
very_small_df_flag = len(self)<=10
if (very_small_df_flag):
if (very_small_df_flag and len(self)>0):
self.pre_aggregated = True
def set_executor_type(self, exe):
if (exe =="SQL"):
@@ -161,7 +164,7 @@ def set_executor_type(self, exe):
else:
import psycopg2
from lux.executor.SQLExecutor import SQLExecutor
self.executor = SQLExecutor
self.executor = SQLExecutor()
else:
from lux.executor.PandasExecutor import PandasExecutor
self.executor = PandasExecutor()
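
The added parentheses store an instance of SQLExecutor rather than the class itself, matching the Pandas branch directly above. Because methods such as compute_stats(self, ldf) are defined as instance methods, calls like self.executor.compute_stats(self) only bind correctly against an instance. A toy illustration of the difference; the class below is a stand-in, not the Lux executor:

class ToyExecutor:
    def compute_stats(self, df):
        # Instance method: needs a ToyExecutor object bound as `self`.
        return "stats over {} rows".format(len(df))

rows = [1, 2, 3]
executor = ToyExecutor()              # analogous to `self.executor = SQLExecutor()`
print(executor.compute_stats(rows))   # works: `self` is bound automatically
# ToyExecutor.compute_stats(rows)     # analogous to the old code: `rows` would be
                                      # treated as `self` and `df` would be missing
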
@@ -242,11 +245,11 @@ def _parse_validate_compile_intent(self):
from lux.processor.Parser import Parser
from lux.processor.Validator import Validator
self._intent = Parser.parse(self._intent)
Validator.validate_intent(self._intent,self)
self.maintain_metadata()
Validator.validate_intent(self._intent,self)
from lux.processor.Compiler import Compiler
self.current_vis = Compiler.compile_intent(self, self._intent)

def copy_intent(self):
#creates a true copy of the dataframe's intent
output = []
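
Swapping these two calls runs maintain_metadata() before the intent is validated, so the metadata the validator may consult (data types, unique values) is populated first, including for SQL-backed frames whose metadata cannot be derived from local rows. A toy sketch of the ordering constraint; every name below is hypothetical:

metadata = {}

def maintain_metadata():
    # Populate the lookup tables the validator depends on.
    metadata["data_type_lookup"] = {"horsepower": "quantitative"}

def validate_intent(attribute):
    # Validation reads the metadata, so it must run second.
    if attribute not in metadata.get("data_type_lookup", {}):
        raise ValueError("unknown attribute: " + attribute)

maintain_metadata()            # new order: compute metadata first
validate_intent("horsepower")  # then check the user's intent against it
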
@@ -294,92 +297,9 @@ def __repr__(self):
def set_SQL_connection(self, connection, t_name):
self.SQLconnection = connection
self.table_name = t_name
self.compute_SQL_dataset_metadata()
self.set_executor_type("SQL")
self.executor.compute_dataset_metadata(self)

def compute_SQL_dataset_metadata(self):
self.get_SQL_attributes()
for attr in list(self.columns):
self[attr] = None
self.data_type_lookup = {}
self.data_type = {}
#####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this
##### in the initialization and do it just once
self.compute_SQL_data_type()
self.compute_SQL_stats()
self.data_model_lookup = {}
self.data_model = {}
self.compute_data_model()

def compute_SQL_stats(self):
# precompute statistics
self.unique_values = {}
self._min_max = {}

self.get_SQL_unique_values()
#self.get_SQL_cardinality()
for attribute in self.columns:
if self.data_type_lookup[attribute] == 'quantitative':
self._min_max[attribute] = (self[attribute].min(), self[attribute].max())

def get_SQL_attributes(self):
if "." in self.table_name:
table_name = self.table_name[self.table_name.index(".")+1:]
else:
table_name = self.table_name
attr_query = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{}'".format(table_name)
attributes = list(pd.read_sql(attr_query, self.SQLconnection)['column_name'])
for attr in attributes:
self[attr] = None

def get_SQL_cardinality(self):
cardinality = {}
for attr in list(self.columns):
card_query = pd.read_sql("SELECT Count(Distinct({})) FROM {}".format(attr, self.table_name), self.SQLconnection)
cardinality[attr] = list(card_query["count"])[0]
self.cardinality = cardinality

def get_SQL_unique_values(self):
unique_vals = {}
for attr in list(self.columns):
unique_query = pd.read_sql("SELECT Distinct({}) FROM {}".format(attr, self.table_name), self.SQLconnection)
unique_vals[attr] = list(unique_query[attr])
self.unique_values = unique_vals

def compute_SQL_data_type(self):
data_type_lookup = {}
sql_dtypes = {}
self.get_SQL_cardinality()
if "." in self.table_name:
table_name = self.table_name[self.table_name.index(".")+1:]
else:
table_name = self.table_name
#get the data types of the attributes in the SQL table
for attr in list(self.columns):
datatype_query = "SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{}' AND COLUMN_NAME = '{}'".format(table_name, attr)
datatype = list(pd.read_sql(datatype_query, self.SQLconnection)['data_type'])[0]
sql_dtypes[attr] = datatype

data_type = {"quantitative":[], "nominal":[], "temporal":[]}
for attr in list(self.columns):
if str(attr).lower() in ["month", "year"]:
data_type_lookup[attr] = "temporal"
data_type["temporal"].append(attr)
elif sql_dtypes[attr] in ["character", "character varying", "boolean", "uuid", "text"]:
data_type_lookup[attr] = "nominal"
data_type["nominal"].append(attr)
elif sql_dtypes[attr] in ["integer", "real", "smallint", "smallserial", "serial"]:
if self.cardinality[attr] < 13:
data_type_lookup[attr] = "nominal"
data_type["nominal"].append(attr)
else:
data_type_lookup[attr] = "quantitative"
data_type["quantitative"].append(attr)
elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]:
data_type_lookup[attr] = "temporal"
data_type["temporal"].append(attr)
self.data_type_lookup = data_type_lookup
self.data_type = data_type
def _append_rec(self,rec_infolist,recommendations:Dict):
if (recommendations["collection"] is not None and len(recommendations["collection"])>0):
rec_infolist.append(recommendations)
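
With the table-introspection helpers removed from the frame, set_SQL_connection now only records the connection and table name, switches the executor, and delegates metadata computation to the executor instance. A usage sketch under assumptions: a reachable local Postgres database, a table named cars, and psycopg2 installed; the connection string and table name are hypothetical:

import psycopg2
from lux.core.frame import LuxDataFrame

# Hypothetical local database and table; replace with real credentials.
connection = psycopg2.connect("host=localhost dbname=postgres user=postgres password=postgres")

sql_df = LuxDataFrame()
sql_df.set_SQL_connection(connection, "cars")
# Internally this now calls set_executor_type("SQL") and then
# self.executor.compute_dataset_metadata(self) on a SQLExecutor() instance,
# replacing the frame-level compute_SQL_* helpers deleted above.
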
@@ -557,8 +477,7 @@ def _repr_html_(self):
)
display(self.display_pandas())
return

if (len(self)<=0):
if (len(self)<=0 and self.executor_type == "Pandas"):
warnings.warn("\nLux can not operate on an empty dataframe.\nPlease check your input again.\n",stacklevel=2)
display(self.display_pandas())
return
@@ -567,7 +486,7 @@ def _repr_html_(self):
display(self.display_pandas())
return
self.maintain_metadata()

if (self._intent!=[] and (not hasattr(self,"_compiled") or not self._compiled)):
from lux.processor.Compiler import Compiler
self.current_vis = Compiler.compile_intent(self, self._intent)
27 changes: 14 additions & 13 deletions lux/executor/Executor.py
@@ -1,16 +1,17 @@
# Copyright 2019-2020 The Lux Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from lux.vis.VisList import VisList
from lux.utils import utils
2 changes: 1 addition & 1 deletion lux/executor/PandasExecutor.py
@@ -18,7 +18,6 @@
from lux.core.frame import LuxDataFrame
from lux.executor.Executor import Executor
from lux.utils import utils
from lux.utils.date_utils import is_datetime_series
from lux.utils.utils import check_import_lux_widget, check_if_id_like
from lux.utils.date_utils import is_datetime_series
import warnings
@@ -388,6 +387,7 @@ def compute_stats(self, ldf:LuxDataFrame):
ldf.unique_values = {}
ldf._min_max = {}
ldf.cardinality = {}
ldf.length = len(ldf)

for attribute in ldf.columns:

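
Storing the row count on the frame during compute_stats gives later checks a cached length field to consult; the Pandas executor fills it from len(ldf), and a SQL-backed executor would presumably fill the same field from a COUNT query. A hedged sketch of that counterpart, not taken from this diff; the function name, query, and column alias are assumptions:

import pandas as pd

def compute_length_from_sql(ldf, connection, table_name):
    # Hypothetical SQL-side analogue of `ldf.length = len(ldf)`:
    # ask the database for the row count instead of the empty local frame.
    count_query = "SELECT COUNT(*) AS length FROM {}".format(table_name)
    ldf.length = int(pd.read_sql(count_query, connection)["length"][0])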