
Commit

Coalesce all data_type attributes of frame into one (#185)
* coalesce data_types into data_type_lookup

* black reformat

* changed to better variable names

* lux not defined error

* fixed

* black format
jinimukh authored Jan 2, 2021
1 parent a06d417 commit 7f7a905
Showing 13 changed files with 133 additions and 128 deletions.
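
At a high level, the commit collapses the frame's two parallel metadata attributes, data_type_lookup (attribute to type) and data_type (type to list of attributes), into a single attribute-to-type dict named data_type, and derives the grouped views through the executor instead of storing them on the frame. A minimal before/after sketch (column names here are hypothetical, not taken from the diff):

# Before: two dicts kept in sync on every LuxDataFrame
data_type_lookup = {"Origin": "nominal", "Horsepower": "quantitative", "Year": "temporal"}
data_type = {"quantitative": ["Horsepower"], "nominal": ["Origin"], "temporal": ["Year"], "id": []}

# After: a single attribute -> type dict stored as df.data_type
data_type = {"Origin": "nominal", "Horsepower": "quantitative", "Year": "temporal"}
# The grouped view is derived on demand via lux.config.executor.invert_data_type(df.data_type),
# and the measure/dimension split via lux.config.executor.compute_data_model(df.data_type).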
4 changes: 2 additions & 2 deletions lux/action/filter.py
@@ -45,7 +45,7 @@ def filter(ldf):
# get unique values for all categorical attributes specified and create corresponding filters
fltr = filters[0]

if ldf.data_type_lookup[fltr.attribute] == "nominal":
if ldf.data_type[fltr.attribute] == "nominal":
recommendation = {
"action": "Filter",
"description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value.",
@@ -60,7 +60,7 @@ def filter(ldf):
new_spec.append(new_filter)
temp_vis = Vis(new_spec)
output.append(temp_vis)
elif ldf.data_type_lookup[fltr.attribute] == "quantitative":
elif ldf.data_type[fltr.attribute] == "quantitative":
recommendation = {
"action": "Filter",
"description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative inequality operation.",
4 changes: 1 addition & 3 deletions lux/action/univariate.py
@@ -48,9 +48,7 @@ def univariate(ldf, *args):
possible_attributes = [
c
for c in ldf.columns
if ldf.data_type_lookup[c] == "quantitative"
and ldf.cardinality[c] > 5
and c != "Number of Records"
if ldf.data_type[c] == "quantitative" and ldf.cardinality[c] > 5 and c != "Number of Records"
]
intent = [lux.Clause(possible_attributes)]
intent.extend(filter_specs)
41 changes: 12 additions & 29 deletions lux/core/frame.py
@@ -21,6 +21,8 @@
from lux.utils.message import Message
from lux.utils.utils import check_import_lux_widget
from typing import Dict, Union, List, Callable

# from lux.executor.Executor import *
import warnings
import traceback
import lux
@@ -35,10 +37,7 @@ class LuxDataFrame(pd.DataFrame):
_metadata = [
"_intent",
"_inferred_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
@@ -77,10 +76,7 @@ def __init__(self, *args, **kw):
self._message = Message()
self._pandas_only = False
# Metadata
self.data_type_lookup = None
self.data_type = None
self.data_model_lookup = None
self.data_model = None
self.unique_values = None
self.cardinality = None
self._min_max = None
@@ -126,10 +122,7 @@ def expire_recs(self):
def expire_metadata(self):
# Set metadata as null
self._metadata_fresh = False
self.data_type_lookup = None
self.data_type = None
self.data_model_lookup = None
self.data_model = None
self.unique_values = None
self.cardinality = None
self._min_max = None
@@ -294,15 +287,11 @@ def compute_SQL_dataset_metadata(self):
self.get_SQL_attributes()
for attr in list(self.columns):
self[attr] = None
self.data_type_lookup = {}
self.data_type = {}
##### NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this
##### in the initialization and do it just once?
self.compute_SQL_data_type()
self.compute_SQL_stats()
self.data_model_lookup = {}
self.data_model = {}
self.compute_data_model()

def compute_SQL_stats(self):
# precompute statistics
@@ -312,7 +301,7 @@ def compute_SQL_stats(self):
self.get_SQL_unique_values()
# self.get_SQL_cardinality()
for attribute in self.columns:
if self.data_type_lookup[attribute] == "quantitative":
if self.data_type[attribute] == "quantitative":
self._min_max[attribute] = (
self[attribute].min(),
self[attribute].max(),
@@ -349,7 +338,7 @@ def get_SQL_unique_values(self):
self.unique_values = unique_vals

def compute_SQL_data_type(self):
data_type_lookup = {}
data_type = {}
sql_dtypes = {}
self.get_SQL_cardinality()
if "." in self.table_name:
@@ -362,20 +351,17 @@
datatype = list(pd.read_sql(query, lux.config.SQLconnection)["data_type"])[0]
sql_dtypes[attr] = datatype

data_type = {"quantitative": [], "nominal": [], "temporal": []}
for attr in list(self.columns):
if str(attr).lower() in ["month", "year"]:
data_type_lookup[attr] = "temporal"
data_type["temporal"].append(attr)
data_type[attr] = "temporal"
elif sql_dtypes[attr] in [
"character",
"character varying",
"boolean",
"uuid",
"text",
]:
data_type_lookup[attr] = "nominal"
data_type["nominal"].append(attr)
data_type[attr] = "nominal"
elif sql_dtypes[attr] in [
"integer",
"real",
@@ -384,15 +370,11 @@
"serial",
]:
if self.cardinality[attr] < 13:
data_type_lookup[attr] = "nominal"
data_type["nominal"].append(attr)
data_type[attr] = "nominal"
else:
data_type_lookup[attr] = "quantitative"
data_type["quantitative"].append(attr)
data_type[attr] = "quantitative"
elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]:
data_type_lookup[attr] = "temporal"
data_type["temporal"].append(attr)
self.data_type_lookup = data_type_lookup
data_type[attr] = "temporal"
self.data_type = data_type

def _append_rec(self, rec_infolist, recommendations: Dict):
@@ -419,8 +401,9 @@ def maintain_recs(self):
rec_df._message = Message()
# Add warning message if there exist ID fields
id_fields_str = ""
if len(rec_df.data_type["id"]) > 0:
for id_field in rec_df.data_type["id"]:
inverted_data_type = lux.config.executor.invert_data_type(rec_df.data_type)
if len(inverted_data_type["id"]) > 0:
for id_field in inverted_data_type["id"]:
id_fields_str += f"<code>{id_field}</code>, "
id_fields_str = id_fields_str[:-2]
rec_df._message.add(f"{id_fields_str} is not visualized since it resembles an ID field.")
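
The compute_SQL_data_type change above now records one attribute-to-type entry per column instead of maintaining a lookup dict and a grouped dict side by side. A standalone sketch of the classification rule it applies (inputs are hypothetical; the dtype list is abbreviated because part of the hunk is collapsed, and the final fallback is an assumption rather than something the diff shows):

def infer_sql_lux_type(attr, sql_dtype, cardinality):
    # month/year column names are treated as temporal regardless of dtype
    if str(attr).lower() in ["month", "year"]:
        return "temporal"
    # textual, boolean, and uuid columns are nominal
    if sql_dtype in ["character", "character varying", "boolean", "uuid", "text"]:
        return "nominal"
    # numeric columns: low-cardinality integers behave like categories
    if sql_dtype in ["integer", "real", "serial"]:  # abbreviated list
        return "nominal" if cardinality < 13 else "quantitative"
    # timestamp and date dtypes are temporal
    if "time" in sql_dtype or "date" in sql_dtype:
        return "temporal"
    return None  # assumption: dtypes not listed above are left unclassified

print(infer_sql_lux_type("cylinders", "integer", 5))    # nominal
print(infer_sql_lux_type("horsepower", "real", 94))     # quantitative
print(infer_sql_lux_type("origin", "text", 3))          # nominal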
3 changes: 0 additions & 3 deletions lux/core/series.py
@@ -21,10 +21,7 @@
class LuxSeries(pd.Series):
_metadata = [
"_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
22 changes: 19 additions & 3 deletions lux/executor/Executor.py
@@ -51,9 +51,9 @@ def compute_stats(self):
def compute_data_type(self):
return NotImplemented

@staticmethod
def compute_data_model(self):
return NotImplemented
# @staticmethod
# def compute_data_model(self):
# return NotImplemented

def mapping(self, rmap):
group_map = {}
@@ -67,3 +67,19 @@ def reverseMapping(self, map):
for val in map[valKey]:
reverse_map[val] = valKey
return reverse_map

def invert_data_type(self, data_type):
return self.mapping(data_type)

def compute_data_model(self, data_type):
data_type_inverted = self.invert_data_type(data_type)
data_model = {
"measure": data_type_inverted["quantitative"],
"dimension": data_type_inverted["nominal"]
+ data_type_inverted["temporal"]
+ data_type_inverted["id"],
}
return data_model

def compute_data_model_lookup(self, data_type):
return self.reverseMapping(self.compute_data_model(data_type))
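
The new Executor helpers make the grouped view and the data model derivable from the single data_type dict. A self-contained sketch of the same idea written as plain functions; the committed methods delegate to mapping/reverseMapping (whose bodies are largely collapsed above), so treat this as an approximation rather than the exact implementation:

def invert_data_type(data_type):
    # attribute -> type  becomes  type -> [attributes]
    inverted = {"quantitative": [], "nominal": [], "temporal": [], "id": []}
    for attr, typ in data_type.items():
        inverted.setdefault(typ, []).append(attr)
    return inverted

def compute_data_model(data_type):
    # quantitative attributes are measures; nominal, temporal, and id are dimensions
    inv = invert_data_type(data_type)
    return {
        "measure": inv["quantitative"],
        "dimension": inv["nominal"] + inv["temporal"] + inv["id"],
    }

def compute_data_model_lookup(data_type):
    # attribute -> "measure" / "dimension"
    model = compute_data_model(data_type)
    return {attr: group for group, attrs in model.items() for attr in attrs}

data_type = {"Origin": "nominal", "Horsepower": "quantitative"}  # hypothetical frame metadata
print(compute_data_model_lookup(data_type))  # {'Horsepower': 'measure', 'Origin': 'dimension'}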
44 changes: 16 additions & 28 deletions lux/executor/PandasExecutor.py
@@ -376,64 +376,59 @@ def execute_2D_binning(vis: Vis):
############ Metadata: data type, model #############
#######################################################
def compute_dataset_metadata(self, ldf: LuxDataFrame):
ldf.data_type_lookup = {}
ldf.data_type = {}
self.compute_data_type(ldf)
ldf.data_model_lookup = {}
ldf.data_model = {}
self.compute_data_model(ldf)

def compute_data_type(self, ldf: LuxDataFrame):
from pandas.api.types import is_datetime64_any_dtype as is_datetime

for attr in list(ldf.columns):
temporal_var_list = ["month", "year", "day", "date", "time"]
if is_datetime(ldf[attr]):
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
elif self._is_datetime_string(ldf[attr]):
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
elif str(attr).lower() in temporal_var_list:
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
# int columns get coerced into floats if they contain NaN
convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20:
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
else:
ldf.data_type_lookup[attr] = "quantitative"
ldf.data_type[attr] = "quantitative"
elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
# See if an integer column is quantitative or nominal by checking if the ratio of cardinality to data size is less than 0.4 and there are fewer than 20 unique values
if ldf.pre_aggregated:
if ldf.cardinality[attr] == len(ldf):
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20:
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
else:
ldf.data_type_lookup[attr] = "quantitative"
ldf.data_type[attr] = "quantitative"
if check_if_id_like(ldf, attr):
ldf.data_type_lookup[attr] = "id"
ldf.data_type[attr] = "id"
# Eliminate this clause because a single NaN value can cause the dtype to be object
elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
if check_if_id_like(ldf, attr):
ldf.data_type_lookup[attr] = "id"
ldf.data_type[attr] = "id"
else:
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
# check if attribute is any type of datetime dtype
elif is_datetime_series(ldf.dtypes[attr]):
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
else:
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
# for attr in list(df.dtypes[df.dtypes=="int64"].keys()):
# if self.cardinality[attr]>50:
if ldf.index.dtype != "int64" and ldf.index.name:
ldf.data_type_lookup[ldf.index.name] = "nominal"
ldf.data_type = self.mapping(ldf.data_type_lookup)
ldf.data_type[ldf.index.name] = "nominal"

non_datetime_attrs = []
for attr in ldf.columns:
if ldf.data_type_lookup[attr] == "temporal" and not is_datetime(ldf[attr]):
if ldf.data_type[attr] == "temporal" and not is_datetime(ldf[attr]):
non_datetime_attrs.append(attr)
warn_msg = ""
if len(non_datetime_attrs) == 1:
@@ -470,13 +465,6 @@ def _is_datetime_string(self, series):
return True
return False

def compute_data_model(self, ldf: LuxDataFrame):
ldf.data_model = {
"measure": ldf.data_type["quantitative"],
"dimension": ldf.data_type["nominal"] + ldf.data_type["temporal"] + ldf.data_type["id"],
}
ldf.data_model_lookup = self.reverseMapping(ldf.data_model)

def compute_stats(self, ldf: LuxDataFrame):
# precompute statistics
ldf.unique_values = {}
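
The compute_data_type rewrite above only changes where results are stored (ldf.data_type instead of ldf.data_type_lookup); the inference heuristics themselves are unchanged. A small standalone illustration of the integer/float rule on a made-up DataFrame (thresholds mirror the diff; this is a simplified excerpt, not the full method):

import pandas as pd

# Hypothetical data, only to exercise the rule; not from the Lux test suite.
df = pd.DataFrame({
    "Cylinders": [4, 4, 6, 8, 4, 6, 4, 8, 6, 4],  # low-cardinality integers
    "Horsepower": [130.5, 165.0, None, 150.2, 98.3, 113.7, 120.1, 88.4, 101.9, 95.0],
})

for attr in df.columns:
    cardinality = df[attr].nunique()
    if pd.api.types.is_float_dtype(df.dtypes[attr]):
        # int columns get coerced to float when they contain NaN; if the values
        # convert back to integers and stay low-cardinality, treat them as nominal
        convertible2int = pd.api.types.is_integer_dtype(df[attr].convert_dtypes())
        if convertible2int and cardinality != len(df) and cardinality < 20:
            print(attr, "-> nominal")
        else:
            print(attr, "-> quantitative")   # Horsepower lands here
    elif pd.api.types.is_integer_dtype(df.dtypes[attr]):
        # unique-to-row ratio below 0.4 and fewer than 20 unique values -> nominal
        if cardinality / len(df) < 0.4 and cardinality < 20:
            print(attr, "-> nominal")        # Cylinders lands here
        else:
            print(attr, "-> quantitative")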
13 changes: 9 additions & 4 deletions lux/processor/Compiler.py
@@ -159,6 +159,8 @@ def populate_data_type_model(ldf, vlist):
# TODO: copy might not be necessary
from lux.utils.date_utils import is_datetime_string

data_model_lookup = lux.config.executor.compute_data_model_lookup(ldf.data_type)

for vis in vlist:
for clause in vis._inferred_intent:
if clause.description == "?":
@@ -167,11 +169,11 @@
# and not is_datetime_string(clause.attribute):
if clause.attribute != "" and clause.attribute != "Record":
if clause.data_type == "":
clause.data_type = ldf.data_type_lookup[clause.attribute]
clause.data_type = ldf.data_type[clause.attribute]
if clause.data_type == "id":
clause.data_type = "nominal"
if clause.data_model == "":
clause.data_model = ldf.data_model_lookup[clause.attribute]
clause.data_model = data_model_lookup[clause.attribute]
if clause.value != "":
# If user provided title for Vis, then don't override.
if vis.title == "":
@@ -439,16 +441,19 @@ def populate_wildcard_options(_inferred_intent: List[Clause], ldf: LuxDataFrame)
import copy
from lux.utils.utils import convert_to_list

inverted_data_type = lux.config.executor.invert_data_type(ldf.data_type)
data_model = lux.config.executor.compute_data_model(ldf.data_type)

intent = {"attributes": [], "filters": []}
for clause in _inferred_intent:
spec_options = []
if clause.value == "": # attribute
if clause.attribute == "?":
options = set(list(ldf.columns)) # all attributes
if clause.data_type != "":
options = options.intersection(set(ldf.data_type[clause.data_type]))
options = options.intersection(set(inverted_data_type[clause.data_type]))
if clause.data_model != "":
options = options.intersection(set(ldf.data_model[clause.data_model]))
options = options.intersection(set(data_model[clause.data_model]))
options = list(options)
else:
options = convert_to_list(clause.attribute)
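
With the single data_type dict, the Compiler no longer reads grouped metadata off the frame; populate_data_type_model and populate_wildcard_options derive it per call through the executor. A short isolated sketch of the wildcard filtering step (the dict contents are hypothetical and the Clause machinery is omitted; in the commit the two derived dicts come from lux.config.executor.invert_data_type and compute_data_model applied to ldf.data_type):

data_type = {"Origin": "nominal", "Horsepower": "quantitative", "Weight": "quantitative"}
columns = list(data_type.keys())

inverted_data_type = {"quantitative": ["Horsepower", "Weight"], "nominal": ["Origin"], "temporal": [], "id": []}
data_model = {"measure": ["Horsepower", "Weight"], "dimension": ["Origin"]}

# A wildcard clause asking for any quantitative measure narrows the candidates:
options = set(columns)                              # start from all attributes
options &= set(inverted_data_type["quantitative"])  # clause.data_type == "quantitative"
options &= set(data_model["measure"])               # clause.data_model == "measure"
print(sorted(options))  # ['Horsepower', 'Weight']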
8 changes: 6 additions & 2 deletions lux/utils/date_utils.py
@@ -13,6 +13,7 @@
# limitations under the License.

import pandas as pd
import lux


def date_formatter(time_stamp, ldf):
@@ -38,10 +39,13 @@ def date_formatter(time_stamp, ldf):
date_str: str
A reformatted version of the time_stamp according to granularity
"""

inverted_data_type = lux.config.executor.invert_data_type(ldf.data_type)
# TODO: method for data_type_lookup to data_type
datetime = pd.to_datetime(time_stamp)
if ldf.data_type["temporal"]:
if inverted_data_type["temporal"]:
# assumes only one temporal column; may need to change this function to receive multiple temporal columns in the future
date_column = ldf[ldf.data_type["temporal"][0]]
date_column = ldf[inverted_data_type["temporal"][0]]

granularity = compute_date_granularity(date_column)
date_str = ""
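
date_formatter still assumes a single temporal column; it now finds it by inverting data_type rather than reading a stored group. A tiny self-contained sketch of that lookup (frame and metadata are hypothetical):

import pandas as pd

data_type = {"date": "temporal", "value": "quantitative"}  # stand-in for ldf.data_type
ldf = pd.DataFrame({"date": pd.to_datetime(["2020-01-01", "2020-02-01"]), "value": [1, 2]})

# equivalent of lux.config.executor.invert_data_type(ldf.data_type)
inverted = {t: [a for a, ty in data_type.items() if ty == t] for t in ["temporal", "quantitative", "nominal", "id"]}
if inverted["temporal"]:
    date_column = ldf[inverted["temporal"][0]]  # first (and assumed only) temporal column
    print(date_column.dt.year.iloc[0])          # 2020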
5 changes: 3 additions & 2 deletions tests/test_dates.py
@@ -96,15 +96,16 @@ def test_refresh_inplace():
)
with pytest.warns(UserWarning, match="Lux detects that the attribute 'date' may be temporal."):
df._repr_html_()
assert df.data_type_lookup["date"] == "temporal"
assert df.data_type["date"] == "temporal"

from lux.vis.Vis import Vis

vis = Vis(["date", "value"], df)

df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df.maintain_metadata()
assert df.data_type["temporal"][0] == "date"
inverted_data_type = lux.config.executor.invert_data_type(df.data_type)
assert inverted_data_type["temporal"][0] == "date"

vis.refresh_source(df)
assert vis.mark == "line"
