Skip to content

Commit

Permalink
Colored bar interestingness bug (#189)
Browse files Browse the repository at this point in the history
* rewrote chi2 contingency with pd.crosstab
* catching KeyError issue with chi2 contingency
* padding interestingness with warning instead of error
* interestingness now reuses ndim and nmsr computed in Compiler
* bug fix for parser with int values
* improve Vis repr to better display the inferred intent when data is absent but the intent is fully compiled (all clauses)
  • Loading branch information
dorisjlee committed Dec 28, 2020
1 parent 3c190a5 commit 42b89af
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 165 deletions.
238 changes: 114 additions & 124 deletions lux/interestingness/interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from lux.utils.utils import get_filter_specs
from lux.interestingness.similarity import preprocess, euclidean_dist
from lux.vis.VisList import VisList
import warnings


def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
Expand All @@ -46,134 +47,123 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
if vis.data is None or len(vis.data) == 0:
return -1
# raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")

n_dim = 0
n_msr = 0

filter_specs = utils.get_filter_specs(vis._inferred_intent)
vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

record_attrs = list(
filter(
lambda x: x.attribute == "Record" and x.data_model == "measure",
vis_attrs_specs,
)
)
n_record = len(record_attrs)
for clause in vis_attrs_specs:
if clause.attribute != "Record":
if clause.data_model == "dimension":
n_dim += 1
if clause.data_model == "measure":
n_msr += 1
n_filter = len(filter_specs)
attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
dimension_lst = vis.get_attr_by_data_model("dimension")
measure_lst = vis.get_attr_by_data_model("measure")
v_size = len(vis.data)

if (
n_dim == 1
and (n_msr == 0 or n_msr == 1)
and ldf.current_vis is not None
and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
and len(ldf.current_vis) == 1
and ldf.current_vis[0].mark == "line"
and len(get_filter_specs(ldf.intent)) > 0
):
query_vc = VisList(ldf.current_vis, ldf)
query_vis = query_vc[0]
preprocess(query_vis)
preprocess(vis)
return 1 - euclidean_dist(query_vis, vis)

# Line/Bar Chart
# print("r:", n_record, "m:", n_msr, "d:",n_dim)
if n_dim == 1 and (n_msr == 0 or n_msr == 1):
if v_size < 2:
return -1

if n_filter == 0:
return unevenness(vis, ldf, measure_lst, dimension_lst)
elif n_filter == 1:
return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
# Histogram
elif n_dim == 0 and n_msr == 1:
if v_size < 2:
return -1
if n_filter == 0 and "Number of Records" in vis.data:
if "Number of Records" in vis.data:
v = vis.data["Number of Records"]
return skewness(v)
elif n_filter == 1 and "Number of Records" in vis.data:
return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
return -1
# Scatter Plot
elif n_dim == 0 and n_msr == 2:
if v_size < 10:
return -1
if vis.mark == "heatmap":
return weighted_correlation(vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"])
if n_filter == 1:
v_filter_size = get_filtered_size(filter_specs, vis.data)
sig = v_filter_size / v_size
else:
sig = 1
return sig * monotonicity(vis, attr_specs)
# Scatterplot colored by Dimension
elif n_dim == 1 and n_msr == 2:
if v_size < 10:
try:
filter_specs = utils.get_filter_specs(vis._inferred_intent)
vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
n_dim = vis._ndim
n_msr = vis._nmsr
n_filter = len(filter_specs)
attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
dimension_lst = vis.get_attr_by_data_model("dimension")
measure_lst = vis.get_attr_by_data_model("measure")
v_size = len(vis.data)

if (
n_dim == 1
and (n_msr == 0 or n_msr == 1)
and ldf.current_vis is not None
and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
and len(ldf.current_vis) == 1
and ldf.current_vis[0].mark == "line"
and len(get_filter_specs(ldf.intent)) > 0
):
query_vc = VisList(ldf.current_vis, ldf)
query_vis = query_vc[0]
preprocess(query_vis)
preprocess(vis)
return 1 - euclidean_dist(query_vis, vis)

# Line/Bar Chart
# print("r:", n_record, "m:", n_msr, "d:",n_dim)
if n_dim == 1 and (n_msr == 0 or n_msr == 1):
if v_size < 2:
return -1

if n_filter == 0:
return unevenness(vis, ldf, measure_lst, dimension_lst)
elif n_filter == 1:
return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
# Histogram
elif n_dim == 0 and n_msr == 1:
if v_size < 2:
return -1
if n_filter == 0 and "Number of Records" in vis.data:
if "Number of Records" in vis.data:
v = vis.data["Number of Records"]
return skewness(v)
elif n_filter == 1 and "Number of Records" in vis.data:
return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
return -1
color_attr = vis.get_attr_by_channel("color")[0].attribute
# Scatter Plot
elif n_dim == 0 and n_msr == 2:
if v_size < 10:
return -1
if vis.mark == "heatmap":
return weighted_correlation(
vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
)
if n_filter == 1:
v_filter_size = get_filtered_size(filter_specs, vis.data)
sig = v_filter_size / v_size
else:
sig = 1
return sig * monotonicity(vis, attr_specs)
# Scatterplot colored by Dimension
elif n_dim == 1 and n_msr == 2:
if v_size < 10:
return -1
color_attr = vis.get_attr_by_channel("color")[0].attribute

C = ldf.cardinality[color_attr]
if C < 40:
return 1 / C
else:
return -1
# Scatterplot colored by dimension
elif n_dim == 1 and n_msr == 2:
return 0.2
# Scatterplot colored by measure
elif n_msr == 3:
return 0.1
# colored line and barchart cases
elif vis.mark == "line" and n_dim == 2:
return 0.15
# for colored bar chart, scoring based on Chi-square test for independence score.
# gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
elif vis.mark == "bar" and n_dim == 2:
from scipy.stats import chi2_contingency

measure_column = vis.get_attr_by_data_model("measure")[0].attribute
dimension_columns = vis.get_attr_by_data_model("dimension")

groupby_column = dimension_columns[0].attribute
color_column = dimension_columns[1].attribute

contingency_tbl = pd.crosstab(
vis.data[groupby_column],
vis.data[color_column],
values=vis.data[measure_column],
aggfunc=sum,
)

C = ldf.cardinality[color_attr]
if C < 40:
return 1 / C
try:
color_cardinality = ldf.cardinality[color_column]
groupby_cardinality = ldf.cardinality[groupby_column]
# scale down score based on number of categories
chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
color_cardinality + groupby_cardinality
)
score = min(0.10, chi2_score)
except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0, which can happen if an applied filter results in a category having no counts
score = -1
return score
# Default
else:
return -1
# Scatterplot colored by dimension
elif n_dim == 1 and n_msr == 2:
return 0.2
# Scatterplot colored by measure
elif n_msr == 3:
return 0.1
# colored line and barchart cases
elif vis.mark == "line" and n_dim == 2:
return 0.15
# for colored bar chart, scoring based on Chi-square test for independence score.
# gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
elif vis.mark == "bar" and n_dim == 2:
from scipy.stats import chi2_contingency

measure_column = vis.get_attr_by_data_model("measure")[0].attribute
dimension_columns = vis.get_attr_by_data_model("dimension")

groupby_column = dimension_columns[0].attribute
color_column = dimension_columns[1].attribute

contingency_table = []
groupby_cardinality = ldf.cardinality[groupby_column]
groupby_unique_vals = ldf.unique_values[groupby_column]
for c in range(0, groupby_cardinality):
contingency_table.append(
vis.data[vis.data[groupby_column] == groupby_unique_vals[c]][measure_column]
)
score = 0.12
# ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in
# a category having no counts

try:
color_cardinality = ldf.cardinality[color_column]
# scale down score based on number of categories
chi2_score = chi2_contingency(contingency_table)[0] * 0.9 ** (
color_cardinality + groupby_cardinality
)
score = min(0.10, chi2_score)
except ValueError:
pass
return score
# Default
else:
except:
        # Suppress interestingness-related issues
warnings.warn(f"An error occurred when computing interestingness for: {vis}")
return -1


Expand Down
8 changes: 5 additions & 3 deletions lux/processor/Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,9 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]:
# TODO: Move validation check to Validator
# if ((clause.description in list(ldf.columns)) or clause.description == "?"):# if clause.description in the list of attributes
# clause.description contain ">","<". or "="
if any(ext in [">", "<", "=", "!="] for ext in clause.description):
if type(clause.description) == str and any(
ext in [">", "<", "=", "!="] for ext in clause.description
):
# then parse it and assign to clause.attribute, clause.filter_op, clause.values
clause.filter_op = re.findall(r"/.*/|>|=|<|>=|<=|!=", clause.description)[0]
split_description = clause.description.split(clause.filter_op)
Expand All @@ -107,7 +109,7 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]:
clause.attribute = clause.description
elif type(clause.description) == list:
clause.attribute = clause.description
# else: # then it is probably a value
# clause.values = clause.description
else: # then it is probably a value
clause.value = clause.description
return intent
# ldf._intent = intent
77 changes: 39 additions & 38 deletions lux/vis/Vis.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,10 @@ class Vis:
"""

def __init__(self, intent, source=None, title="", score=0.0):
self._intent = intent # This is the user's original intent to Vis
self._inferred_intent = intent # This is the re-written, expanded version of user's original intent (include inferred vis info)
self._source = source # This is the original data that is attached to the Vis
self._vis_data = (
None # This is the data that represents the Vis (e.g., selected, aggregated, binned)
)
self._intent = intent # user's original intent to Vis
self._inferred_intent = intent # re-written, expanded version of user's original intent
self._source = source # original data attached to the Vis
self._vis_data = None # processed data for Vis (e.g., selected, aggregated, binned)
self._code = None
self._mark = ""
self._min_max = {}
Expand All @@ -39,39 +37,42 @@ def __init__(self, intent, source=None, title="", score=0.0):
self.refresh_source(self._source)

def __repr__(self):
if self._source is None:
return f"<Vis ({str(self._intent)}) mark: {self._mark}, score: {self.score} >"
filter_intents = None
channels, additional_channels = [], []
for clause in self._inferred_intent:

if hasattr(clause, "value"):
if clause.value != "":
filter_intents = clause
if hasattr(clause, "attribute"):
if clause.attribute != "":
if clause.aggregation != "" and clause.aggregation is not None:
attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")"
elif clause.bin_size > 0:
attribute = "BIN(" + clause.attribute + ")"
else:
attribute = clause.attribute
if clause.channel == "x":
channels.insert(0, [clause.channel, attribute])
elif clause.channel == "y":
channels.insert(1, [clause.channel, attribute])
elif clause.channel != "":
additional_channels.append([clause.channel, attribute])

channels.extend(additional_channels)
str_channels = ""
for channel in channels:
str_channels += channel[0] + ": " + channel[1] + ", "

if filter_intents:
return f"<Vis ({str_channels[:-2]} -- [{filter_intents.attribute}{filter_intents.filter_op}{filter_intents.value}]) mark: {self._mark}, score: {self.score} >"
all_clause = all([isinstance(unit, lux.Clause) for unit in self._inferred_intent])
if all_clause:
filter_intents = None
channels, additional_channels = [], []
for clause in self._inferred_intent:

if hasattr(clause, "value"):
if clause.value != "":
filter_intents = clause
if hasattr(clause, "attribute"):
if clause.attribute != "":
if clause.aggregation != "" and clause.aggregation is not None:
attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")"
elif clause.bin_size > 0:
attribute = "BIN(" + clause.attribute + ")"
else:
attribute = clause.attribute
if clause.channel == "x":
channels.insert(0, [clause.channel, attribute])
elif clause.channel == "y":
channels.insert(1, [clause.channel, attribute])
elif clause.channel != "":
additional_channels.append([clause.channel, attribute])

channels.extend(additional_channels)
str_channels = ""
for channel in channels:
str_channels += channel[0] + ": " + channel[1] + ", "

if filter_intents:
return f"<Vis ({str_channels[:-2]} -- [{filter_intents.attribute}{filter_intents.filter_op}{filter_intents.value}]) mark: {self._mark}, score: {self.score} >"
else:
return f"<Vis ({str_channels[:-2]}) mark: {self._mark}, score: {self.score} >"
else:
return f"<Vis ({str_channels[:-2]}) mark: {self._mark}, score: {self.score} >"
# When Vis not compiled (e.g., when self._source not populated), print original intent
return f"<Vis ({str(self._intent)}) mark: {self._mark}, score: {self.score} >"

@property
def data(self):
Expand Down

0 comments on commit 42b89af

Please sign in to comment.