Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coalesce all data_type attributes of frame into one #185

Merged
merged 8 commits into from
Jan 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions lux/action/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def filter(ldf):
# get unique values for all categorical values specified and creates corresponding filters
fltr = filters[0]

if ldf.data_type_lookup[fltr.attribute] == "nominal":
if ldf.data_type[fltr.attribute] == "nominal":
recommendation = {
"action": "Filter",
"description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value.",
Expand All @@ -60,7 +60,7 @@ def filter(ldf):
new_spec.append(new_filter)
temp_vis = Vis(new_spec)
output.append(temp_vis)
elif ldf.data_type_lookup[fltr.attribute] == "quantitative":
elif ldf.data_type[fltr.attribute] == "quantitative":
recommendation = {
"action": "Filter",
"description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative inequality operation.",
Expand Down
4 changes: 1 addition & 3 deletions lux/action/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ def univariate(ldf, *args):
possible_attributes = [
c
for c in ldf.columns
if ldf.data_type_lookup[c] == "quantitative"
and ldf.cardinality[c] > 5
and c != "Number of Records"
if ldf.data_type[c] == "quantitative" and ldf.cardinality[c] > 5 and c != "Number of Records"
]
intent = [lux.Clause(possible_attributes)]
intent.extend(filter_specs)
Expand Down
41 changes: 12 additions & 29 deletions lux/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from lux.utils.message import Message
from lux.utils.utils import check_import_lux_widget
from typing import Dict, Union, List, Callable

# from lux.executor.Executor import *
import warnings
import traceback
import lux
Expand All @@ -35,10 +37,7 @@ class LuxDataFrame(pd.DataFrame):
_metadata = [
"_intent",
"_inferred_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
Expand Down Expand Up @@ -77,10 +76,7 @@ def __init__(self, *args, **kw):
self._message = Message()
self._pandas_only = False
# Metadata
self.data_type_lookup = None
self.data_type = None
self.data_model_lookup = None
self.data_model = None
self.unique_values = None
self.cardinality = None
self._min_max = None
Expand Down Expand Up @@ -126,10 +122,7 @@ def expire_recs(self):
def expire_metadata(self):
# Set metadata as null
self._metadata_fresh = False
self.data_type_lookup = None
self.data_type = None
self.data_model_lookup = None
self.data_model = None
self.unique_values = None
self.cardinality = None
self._min_max = None
Expand Down Expand Up @@ -294,15 +287,11 @@ def compute_SQL_dataset_metadata(self):
self.get_SQL_attributes()
for attr in list(self.columns):
self[attr] = None
self.data_type_lookup = {}
self.data_type = {}
#####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this
##### in the initialization and do it just once
self.compute_SQL_data_type()
self.compute_SQL_stats()
self.data_model_lookup = {}
self.data_model = {}
self.compute_data_model()

def compute_SQL_stats(self):
# precompute statistics
Expand All @@ -312,7 +301,7 @@ def compute_SQL_stats(self):
self.get_SQL_unique_values()
# self.get_SQL_cardinality()
for attribute in self.columns:
if self.data_type_lookup[attribute] == "quantitative":
if self.data_type[attribute] == "quantitative":
self._min_max[attribute] = (
self[attribute].min(),
self[attribute].max(),
Expand Down Expand Up @@ -349,7 +338,7 @@ def get_SQL_unique_values(self):
self.unique_values = unique_vals

def compute_SQL_data_type(self):
data_type_lookup = {}
data_type = {}
sql_dtypes = {}
self.get_SQL_cardinality()
if "." in self.table_name:
Expand All @@ -362,20 +351,17 @@ def compute_SQL_data_type(self):
datatype = list(pd.read_sql(query, lux.config.SQLconnection)["data_type"])[0]
sql_dtypes[attr] = datatype

data_type = {"quantitative": [], "nominal": [], "temporal": []}
for attr in list(self.columns):
if str(attr).lower() in ["month", "year"]:
data_type_lookup[attr] = "temporal"
data_type["temporal"].append(attr)
data_type[attr] = "temporal"
elif sql_dtypes[attr] in [
"character",
"character varying",
"boolean",
"uuid",
"text",
]:
data_type_lookup[attr] = "nominal"
data_type["nominal"].append(attr)
data_type[attr] = "nominal"
elif sql_dtypes[attr] in [
"integer",
"real",
Expand All @@ -384,15 +370,11 @@ def compute_SQL_data_type(self):
"serial",
]:
if self.cardinality[attr] < 13:
data_type_lookup[attr] = "nominal"
data_type["nominal"].append(attr)
data_type[attr] = "nominal"
else:
data_type_lookup[attr] = "quantitative"
data_type["quantitative"].append(attr)
data_type[attr] = "quantitative"
elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]:
data_type_lookup[attr] = "temporal"
data_type["temporal"].append(attr)
self.data_type_lookup = data_type_lookup
data_type[attr] = "temporal"
self.data_type = data_type

def _append_rec(self, rec_infolist, recommendations: Dict):
Expand All @@ -419,8 +401,9 @@ def maintain_recs(self):
rec_df._message = Message()
# Add warning message if there exist ID fields
id_fields_str = ""
if len(rec_df.data_type["id"]) > 0:
for id_field in rec_df.data_type["id"]:
inverted_data_type = lux.config.executor.invert_data_type(rec_df.data_type)
if len(inverted_data_type["id"]) > 0:
for id_field in inverted_data_type["id"]:
id_fields_str += f"<code>{id_field}</code>, "
id_fields_str = id_fields_str[:-2]
rec_df._message.add(f"{id_fields_str} is not visualized since it resembles an ID field.")
Expand Down
3 changes: 0 additions & 3 deletions lux/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,7 @@
class LuxSeries(pd.Series):
_metadata = [
"_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
Expand Down
22 changes: 19 additions & 3 deletions lux/executor/Executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ def compute_stats(self):
def compute_data_type(self):
    """Abstract hook: compute and record each attribute's data type.

    Concrete executors (e.g. PandasExecutor) override this; the base
    implementation returns NotImplemented.
    """
    return NotImplemented

@staticmethod
def compute_data_model(self):
return NotImplemented
# @staticmethod
# def compute_data_model(self):
# return NotImplemented

def mapping(self, rmap):
group_map = {}
Expand All @@ -67,3 +67,19 @@ def reverseMapping(self, map):
for val in map[valKey]:
reverse_map[val] = valKey
return reverse_map

def invert_data_type(self, data_type):
    """Group attributes by their data type.

    Delegates to ``self.mapping`` to turn the per-attribute ``data_type``
    dict into a type -> list-of-attributes grouping.
    """
    grouped = self.mapping(data_type)
    return grouped

def compute_data_model(self, data_type):
    """Split attributes into measures and dimensions.

    Quantitative attributes become measures; nominal, temporal, and id
    attributes (in that order) become dimensions.
    """
    inverted = self.invert_data_type(data_type)
    measures = inverted["quantitative"]
    dimensions = []
    for role in ("nominal", "temporal", "id"):
        dimensions = dimensions + inverted[role]
    return {"measure": measures, "dimension": dimensions}

def compute_data_model_lookup(self, data_type):
    """Build a flat attribute -> data-model role lookup.

    Computes the measure/dimension grouping for ``data_type`` and
    reverses it so each attribute maps to its role.
    """
    grouped = self.compute_data_model(data_type)
    return self.reverseMapping(grouped)
44 changes: 16 additions & 28 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,64 +374,59 @@ def execute_2D_binning(vis: Vis):
############ Metadata: data type, model #############
#######################################################
def compute_dataset_metadata(self, ldf: LuxDataFrame):
ldf.data_type_lookup = {}
ldf.data_type = {}
self.compute_data_type(ldf)
ldf.data_model_lookup = {}
ldf.data_model = {}
self.compute_data_model(ldf)

def compute_data_type(self, ldf: LuxDataFrame):
from pandas.api.types import is_datetime64_any_dtype as is_datetime

for attr in list(ldf.columns):
temporal_var_list = ["month", "year", "day", "date", "time"]
if is_datetime(ldf[attr]):
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
elif self._is_datetime_string(ldf[attr]):
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
elif str(attr).lower() in temporal_var_list:
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
# int columns get coerced into floats if they contain NaN
convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20:
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
else:
ldf.data_type_lookup[attr] = "quantitative"
ldf.data_type[attr] = "quantitative"
elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
# See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values
if ldf.pre_aggregated:
if ldf.cardinality[attr] == len(ldf):
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20:
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
else:
ldf.data_type_lookup[attr] = "quantitative"
ldf.data_type[attr] = "quantitative"
if check_if_id_like(ldf, attr):
ldf.data_type_lookup[attr] = "id"
ldf.data_type[attr] = "id"
# Eliminate this clause because a single NaN value can cause the dtype to be object
elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
if check_if_id_like(ldf, attr):
ldf.data_type_lookup[attr] = "id"
ldf.data_type[attr] = "id"
else:
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
# check if attribute is any type of datetime dtype
elif is_datetime_series(ldf.dtypes[attr]):
ldf.data_type_lookup[attr] = "temporal"
ldf.data_type[attr] = "temporal"
else:
ldf.data_type_lookup[attr] = "nominal"
ldf.data_type[attr] = "nominal"
# for attr in list(df.dtypes[df.dtypes=="int64"].keys()):
# if self.cardinality[attr]>50:
if ldf.index.dtype != "int64" and ldf.index.name:
ldf.data_type_lookup[ldf.index.name] = "nominal"
ldf.data_type = self.mapping(ldf.data_type_lookup)
ldf.data_type[ldf.index.name] = "nominal"

non_datetime_attrs = []
for attr in ldf.columns:
if ldf.data_type_lookup[attr] == "temporal" and not is_datetime(ldf[attr]):
if ldf.data_type[attr] == "temporal" and not is_datetime(ldf[attr]):
non_datetime_attrs.append(attr)
warn_msg = ""
if len(non_datetime_attrs) == 1:
Expand Down Expand Up @@ -468,13 +463,6 @@ def _is_datetime_string(self, series):
return True
return False

def compute_data_model(self, ldf: LuxDataFrame):
ldf.data_model = {
"measure": ldf.data_type["quantitative"],
"dimension": ldf.data_type["nominal"] + ldf.data_type["temporal"] + ldf.data_type["id"],
}
ldf.data_model_lookup = self.reverseMapping(ldf.data_model)

def compute_stats(self, ldf: LuxDataFrame):
# precompute statistics
ldf.unique_values = {}
Expand Down
13 changes: 9 additions & 4 deletions lux/processor/Compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ def populate_data_type_model(ldf, vlist):
# TODO: copy might not be neccesary
from lux.utils.date_utils import is_datetime_string

data_model_lookup = lux.config.executor.compute_data_model_lookup(ldf.data_type)

for vis in vlist:
for clause in vis._inferred_intent:
if clause.description == "?":
Expand All @@ -167,11 +169,11 @@ def populate_data_type_model(ldf, vlist):
# and not is_datetime_string(clause.attribute):
if clause.attribute != "" and clause.attribute != "Record":
if clause.data_type == "":
clause.data_type = ldf.data_type_lookup[clause.attribute]
clause.data_type = ldf.data_type[clause.attribute]
if clause.data_type == "id":
clause.data_type = "nominal"
if clause.data_model == "":
clause.data_model = ldf.data_model_lookup[clause.attribute]
clause.data_model = data_model_lookup[clause.attribute]
if clause.value != "":
# If user provided title for Vis, then don't override.
if vis.title == "":
Expand Down Expand Up @@ -439,16 +441,19 @@ def populate_wildcard_options(_inferred_intent: List[Clause], ldf: LuxDataFrame)
import copy
from lux.utils.utils import convert_to_list

inverted_data_type = lux.config.executor.invert_data_type(ldf.data_type)
data_model = lux.config.executor.compute_data_model(ldf.data_type)

intent = {"attributes": [], "filters": []}
for clause in _inferred_intent:
spec_options = []
if clause.value == "": # attribute
if clause.attribute == "?":
options = set(list(ldf.columns)) # all attributes
if clause.data_type != "":
options = options.intersection(set(ldf.data_type[clause.data_type]))
options = options.intersection(set(inverted_data_type[clause.data_type]))
if clause.data_model != "":
options = options.intersection(set(ldf.data_model[clause.data_model]))
options = options.intersection(set(data_model[clause.data_model]))
options = list(options)
else:
options = convert_to_list(clause.attribute)
Expand Down
8 changes: 6 additions & 2 deletions lux/utils/date_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.

import pandas as pd
import lux


def date_formatter(time_stamp, ldf):
Expand All @@ -38,10 +39,13 @@ def date_formatter(time_stamp, ldf):
date_str: str
A reformatted version of the time_stamp according to granularity
"""

inverted_data_type = lux.config.executor.invert_data_type(ldf.data_type)
# TODO: method for data_type_lookup to data_type
datetime = pd.to_datetime(time_stamp)
if ldf.data_type["temporal"]:
if inverted_data_type["temporal"]:
# assumes only one temporal column, may need to change this function to receive multiple temporal columns in the future
date_column = ldf[ldf.data_type["temporal"][0]]
date_column = ldf[inverted_data_type["temporal"][0]]

granularity = compute_date_granularity(date_column)
date_str = ""
Expand Down
5 changes: 3 additions & 2 deletions tests/test_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,16 @@ def test_refresh_inplace():
)
with pytest.warns(UserWarning, match="Lux detects that the attribute 'date' may be temporal."):
df._repr_html_()
assert df.data_type_lookup["date"] == "temporal"
assert df.data_type["date"] == "temporal"

from lux.vis.Vis import Vis

vis = Vis(["date", "value"], df)

df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df.maintain_metadata()
assert df.data_type["temporal"][0] == "date"
inverted_data_type = lux.config.executor.invert_data_type(df.data_type)
assert inverted_data_type["temporal"][0] == "date"

vis.refresh_source(df)
assert vis.mark == "line"
Expand Down
Loading