Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coalesce all data_type attributes of frame into one #185

Merged
merged 8 commits into from
Jan 2, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 18 additions & 22 deletions lux/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from lux.utils.message import Message
from lux.utils.utils import check_import_lux_widget
from typing import Dict, Union, List, Callable
from lux.executor.Executor import *
import warnings
import traceback
import lux
Expand All @@ -36,9 +37,6 @@ class LuxDataFrame(pd.DataFrame):
"_intent",
"_inferred_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
Expand Down Expand Up @@ -78,9 +76,6 @@ def __init__(self, *args, **kw):
self._pandas_only = False
# Metadata
self.data_type_lookup = None
self.data_type = None
self.data_model_lookup = None
self.data_model = None
self.unique_values = None
self.cardinality = None
self._min_max = None
Expand Down Expand Up @@ -127,14 +122,25 @@ def expire_metadata(self):
# Set metadata as null
self._metadata_fresh = False
self.data_type_lookup = None
self.data_type = None
self.data_model_lookup = None
self.data_model = None
self.unique_values = None
self.cardinality = None
self._min_max = None
self.pre_aggregated = None

def compute_data_type_from_lookup(self):
    """Group this frame's attributes by their detected data type.

    Inverts ``self.data_type_lookup`` (attribute name -> type string) into
    a mapping of type name -> list of attributes, delegating the inversion
    to ``Executor.mapping``.
    """
    lookup = self.data_type_lookup
    return Executor.mapping(Executor, lookup)

def compute_data_model(self):
    """Derive the measure/dimension grouping from the frame's data types.

    Quantitative attributes are treated as measures; nominal, temporal,
    and id attributes are treated as dimensions.

    Returns
    -------
    dict
        ``{"measure": [...], "dimension": [...]}`` lists of attribute names.
    """
    grouped = self.compute_data_type_from_lookup()
    dimensions = grouped["nominal"] + grouped["temporal"] + grouped["id"]
    return {"measure": grouped["quantitative"], "dimension": dimensions}

def compute_data_model_lookup(self):
    """Map each attribute to its data model ("measure" or "dimension").

    Flattens the grouping produced by ``compute_data_model`` back into a
    per-attribute lookup via ``Executor.reverseMapping``.
    """
    data_model = self.compute_data_model()
    return Executor.reverseMapping(Executor, data_model)

#####################
## Override Pandas ##
#####################
Expand Down Expand Up @@ -295,14 +301,10 @@ def compute_SQL_dataset_metadata(self):
for attr in list(self.columns):
self[attr] = None
self.data_type_lookup = {}
self.data_type = {}
#####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this
##### in the initialization and do it just once
self.compute_SQL_data_type()
self.compute_SQL_stats()
self.data_model_lookup = {}
self.data_model = {}
self.compute_data_model()

def compute_SQL_stats(self):
# precompute statistics
Expand Down Expand Up @@ -362,11 +364,9 @@ def compute_SQL_data_type(self):
datatype = list(pd.read_sql(query, lux.config.SQLconnection)["data_type"])[0]
sql_dtypes[attr] = datatype

data_type = {"quantitative": [], "nominal": [], "temporal": []}
for attr in list(self.columns):
if str(attr).lower() in ["month", "year"]:
data_type_lookup[attr] = "temporal"
data_type["temporal"].append(attr)
elif sql_dtypes[attr] in [
"character",
"character varying",
Expand All @@ -375,7 +375,6 @@ def compute_SQL_data_type(self):
"text",
]:
data_type_lookup[attr] = "nominal"
data_type["nominal"].append(attr)
elif sql_dtypes[attr] in [
"integer",
"real",
Expand All @@ -385,15 +384,11 @@ def compute_SQL_data_type(self):
]:
if self.cardinality[attr] < 13:
data_type_lookup[attr] = "nominal"
data_type["nominal"].append(attr)
else:
data_type_lookup[attr] = "quantitative"
data_type["quantitative"].append(attr)
elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]:
data_type_lookup[attr] = "temporal"
data_type["temporal"].append(attr)
self.data_type_lookup = data_type_lookup
self.data_type = data_type

def _append_rec(self, rec_infolist, recommendations: Dict):
if recommendations["collection"] is not None and len(recommendations["collection"]) > 0:
Expand All @@ -419,8 +414,9 @@ def maintain_recs(self):
rec_df._message = Message()
# Add warning message if there exist ID fields
id_fields_str = ""
if len(rec_df.data_type["id"]) > 0:
for id_field in rec_df.data_type["id"]:
data_type = rec_df.compute_data_type_from_lookup()
if len(data_type["id"]) > 0:
for id_field in data_type["id"]:
id_fields_str += f"<code>{id_field}</code>, "
id_fields_str = id_fields_str[:-2]
rec_df._message.add(f"{id_fields_str} is not visualized since it resembles an ID field.")
Expand Down
3 changes: 0 additions & 3 deletions lux/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ class LuxSeries(pd.Series):
_metadata = [
"_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
Expand Down
8 changes: 5 additions & 3 deletions lux/executor/Executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,18 @@ def compute_stats(self):
# Abstract hook: concrete executors (e.g. PandasExecutor) override this to
# populate the frame's data-type metadata. The base version deliberately
# returns NotImplemented instead of raising, matching the sibling stubs.
def compute_data_type(self):
    return NotImplemented

@staticmethod
def compute_data_model(self):
return NotImplemented
# @staticmethod
# def compute_data_model(self):
# return NotImplemented

@staticmethod
def mapping(self, rmap):
    """Invert an attribute -> data-type lookup into a type -> attributes map.

    ``rmap`` maps each attribute name to one of "quantitative", "id",
    "nominal", or "temporal". The result contains every one of those four
    keys (possibly with an empty list), preserving ``rmap``'s key order
    within each group.

    NOTE(review): despite the ``@staticmethod`` decorator, callers pass the
    class itself as the unused ``self`` argument; the signature is kept for
    backward compatibility.
    """
    categories = ("quantitative", "id", "nominal", "temporal")
    return {
        category: [attr for attr in rmap if rmap[attr] == category]
        for category in categories
    }

@staticmethod
def reverseMapping(self, map):
reverse_map = {}
for valKey in map:
Expand Down
12 changes: 0 additions & 12 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,11 +375,7 @@ def execute_2D_binning(vis: Vis):
#######################################################
def compute_dataset_metadata(self, ldf: LuxDataFrame):
ldf.data_type_lookup = {}
ldf.data_type = {}
self.compute_data_type(ldf)
ldf.data_model_lookup = {}
ldf.data_model = {}
self.compute_data_model(ldf)

def compute_data_type(self, ldf: LuxDataFrame):
from pandas.api.types import is_datetime64_any_dtype as is_datetime
Expand Down Expand Up @@ -427,7 +423,6 @@ def compute_data_type(self, ldf: LuxDataFrame):
# if self.cardinality[attr]>50:
if ldf.index.dtype != "int64" and ldf.index.name:
ldf.data_type_lookup[ldf.index.name] = "nominal"
ldf.data_type = self.mapping(ldf.data_type_lookup)

non_datetime_attrs = []
for attr in ldf.columns:
Expand Down Expand Up @@ -468,13 +463,6 @@ def _is_datetime_string(self, series):
return True
return False

def compute_data_model(self, ldf: LuxDataFrame):
ldf.data_model = {
"measure": ldf.data_type["quantitative"],
"dimension": ldf.data_type["nominal"] + ldf.data_type["temporal"] + ldf.data_type["id"],
}
ldf.data_model_lookup = self.reverseMapping(ldf.data_model)

def compute_stats(self, ldf: LuxDataFrame):
# precompute statistics
ldf.unique_values = {}
Expand Down
11 changes: 8 additions & 3 deletions lux/processor/Compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ def populate_data_type_model(ldf, vlist):
# TODO: copy might not be necessary
from lux.utils.date_utils import is_datetime_string

data_model_lookup = ldf.compute_data_model_lookup()

for vis in vlist:
for clause in vis._inferred_intent:
if clause.description == "?":
Expand All @@ -170,7 +172,7 @@ def populate_data_type_model(ldf, vlist):
if clause.data_type == "id":
clause.data_type = "nominal"
if clause.data_model == "":
clause.data_model = ldf.data_model_lookup[clause.attribute]
clause.data_model = data_model_lookup[clause.attribute]
if clause.value != "":
# If user provided title for Vis, then don't override.
if vis.title == "":
Expand Down Expand Up @@ -427,16 +429,19 @@ def populate_wildcard_options(_inferred_intent: List[Clause], ldf: LuxDataFrame)
import copy
from lux.utils.utils import convert_to_list

data_type = ldf.compute_data_type_from_lookup()
data_model = ldf.compute_data_model()

intent = {"attributes": [], "filters": []}
for clause in _inferred_intent:
spec_options = []
if clause.value == "": # attribute
if clause.attribute == "?":
options = set(list(ldf.columns)) # all attributes
if clause.data_type != "":
options = options.intersection(set(ldf.data_type[clause.data_type]))
options = options.intersection(set(data_type[clause.data_type]))
if clause.data_model != "":
options = options.intersection(set(ldf.data_model[clause.data_model]))
options = options.intersection(set(data_model[clause.data_model]))
options = list(options)
else:
options = convert_to_list(clause.attribute)
Expand Down
6 changes: 4 additions & 2 deletions lux/utils/date_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,12 @@ def date_formatter(time_stamp, ldf):
date_str: str
A reformatted version of the time_stamp according to granularity
"""
data_type = ldf.compute_data_type_from_lookup()
# TODO: method for data_type_lookup to data_type
datetime = pd.to_datetime(time_stamp)
if ldf.data_type["temporal"]:
if data_type["temporal"]:
# assumes only one temporal column, may need to change this function to receive multiple temporal columns in the future
date_column = ldf[ldf.data_type["temporal"][0]]
date_column = ldf[data_type["temporal"][0]]

granularity = compute_date_granularity(date_column)
date_str = ""
Expand Down
3 changes: 2 additions & 1 deletion tests/test_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def test_refresh_inplace():

df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df.maintain_metadata()
assert df.data_type["temporal"][0] == "date"
data_type = df.compute_data_type_from_lookup()
assert data_type["temporal"][0] == "date"

vis.refresh_source(df)
assert vis.mark == "line"
Expand Down
6 changes: 4 additions & 2 deletions tests/test_nan.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,16 @@ def test_nan_data_type_detection():
]
test = pd.DataFrame(dataset)
test.maintain_metadata()
assert test.data_type["nominal"] == [
data_type = test.compute_data_type_from_lookup()
assert data_type["nominal"] == [
"fully_nan",
"some_nan",
"some_nan2",
], "Categorical columns containing NaNs should be treated as nominal data type"
nona_test = test.dropna(subset=["some_nan"])
nona_test.maintain_metadata()
assert nona_test.data_type["nominal"] == [
data_type = nona_test.compute_data_type_from_lookup()
assert data_type["nominal"] == [
"fully_nan",
"some_nan",
"some_nan2",
Expand Down
59 changes: 34 additions & 25 deletions tests/test_pandas_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,28 @@ def test_rename_inplace(global_var):

assert df.data_type_lookup["Name"] == new_df.data_type_lookup["Car Name"]

assert df.data_type != new_df.data_type
data_type = df.compute_data_type_from_lookup()
new_data_type = new_df.compute_data_type_from_lookup()

assert df.data_type["nominal"][0] == "Name"
assert new_df.data_type["nominal"][0] == "Car Name"
assert data_type != new_data_type

assert df.data_model_lookup != new_df.data_model_lookup
assert data_type["nominal"][0] == "Name"
assert new_data_type["nominal"][0] == "Car Name"

assert df.data_model_lookup["Name"] == new_df.data_model_lookup["Car Name"]
data_model_lookup = df.compute_data_model_lookup()
new_data_model_lookup = new_df.compute_data_model_lookup()

assert df.data_model != new_df.data_model
assert data_model_lookup != new_data_model_lookup

assert df.data_model["dimension"][0] == "Name"
assert new_df.data_model["dimension"][0] == "Car Name"
assert data_model_lookup["Name"] == new_data_model_lookup["Car Name"]

data_model = df.compute_data_model()
new_data_model = new_df.compute_data_model()

assert data_model != new_data_model

assert data_model["dimension"][0] == "Name"
assert new_data_model["dimension"][0] == "Car Name"

assert list(df.unique_values.values()) == list(new_df.unique_values.values())
assert list(df.cardinality.values()) == list(new_df.cardinality.values())
Expand All @@ -75,19 +84,28 @@ def test_rename(global_var):

assert df.data_type_lookup["Name"] == new_df.data_type_lookup["Car Name"]

assert df.data_type != new_df.data_type
data_type = df.compute_data_type_from_lookup()
new_data_type = new_df.compute_data_type_from_lookup()

assert data_type != new_data_type

assert data_type["nominal"][0] == "Name"
assert new_data_type["nominal"][0] == "Car Name"

data_model_lookup = df.compute_data_model_lookup()
new_data_model_lookup = new_df.compute_data_model_lookup()

assert df.data_type["nominal"][0] == "Name"
assert new_df.data_type["nominal"][0] == "Car Name"
assert data_model_lookup != new_data_model_lookup

assert df.data_model_lookup != new_df.data_model_lookup
assert data_model_lookup["Name"] == new_data_model_lookup["Car Name"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here maybe we could just compute the equality between the data_models

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, do you want me to get rid of lines 98-100?


assert df.data_model_lookup["Name"] == new_df.data_model_lookup["Car Name"]
data_model = df.compute_data_model()
new_data_model = new_df.compute_data_model()

assert df.data_model != new_df.data_model
assert data_model != new_data_model

assert df.data_model["dimension"][0] == "Name"
assert new_df.data_model["dimension"][0] == "Car Name"
assert data_model["dimension"][0] == "Name"
assert new_data_model["dimension"][0] == "Car Name"

assert list(df.unique_values.values()) == list(new_df.unique_values.values())
assert list(df.cardinality.values()) == list(new_df.cardinality.values())
Expand Down Expand Up @@ -503,9 +521,6 @@ def test_df_to_series(global_var):
assert df["Weight"]._metadata == [
"_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
Expand Down Expand Up @@ -534,9 +549,6 @@ def test_value_counts(global_var):
assert df["Weight"]._metadata == [
"_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
Expand Down Expand Up @@ -564,9 +576,6 @@ def test_str_replace(global_var):
assert df["Brand"]._metadata == [
"_intent",
"data_type_lookup",
"data_type",
"data_model_lookup",
"data_model",
"unique_values",
"cardinality",
"_rec_info",
Expand Down
7 changes: 5 additions & 2 deletions tests/test_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def test_check_int_id():
"https://github.com/lux-org/lux-datasets/blob/master/data/instacart_sample.csv?raw=true"
)
df._repr_html_()
assert len(df.data_type["id"]) == 3
data_type = df.compute_data_type_from_lookup()
assert len(data_type["id"]) == 3
# assert len(df.data_type["id"]) == 3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove comment?

assert (
"<code>order_id</code>, <code>product_id</code>, <code>user_id</code> is not visualized since it resembles an ID field."
in df._message.to_html()
Expand Down Expand Up @@ -176,7 +178,8 @@ def test_float_categorical():
]
df = pd.DataFrame(values)
df.maintain_metadata()
assert df.data_type["nominal"] == [
data_type = df.compute_data_type_from_lookup()
assert data_type["nominal"] == [
"A",
"B",
"C",
Expand Down