Skip to content

Commit

Permalink
Colored bar interestingness bug (#189)
Browse files Browse the repository at this point in the history
* rewrote chi2 contingency with pd.crosstab
* catching KeyError issue with chi2 contingency
* padding interestingness with warning instead of error
* interestingness now reuses ndim and nmsr computed in Compiler
* bug fix for parser with int values
* improve Vis repr to better display the inferred intent when data is absent but the intent is fully compiled (all clauses)
  • Loading branch information
dorisjlee committed Dec 28, 2020
1 parent 3c190a5 commit 42b89af
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 165 deletions.
238 changes: 114 additions & 124 deletions lux/interestingness/interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from lux.utils.utils import get_filter_specs
from lux.interestingness.similarity import preprocess, euclidean_dist
from lux.vis.VisList import VisList
import warnings


def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
Expand All @@ -46,134 +47,123 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
if vis.data is None or len(vis.data) == 0:
return -1
# raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")

n_dim = 0
n_msr = 0

filter_specs = utils.get_filter_specs(vis._inferred_intent)
vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

record_attrs = list(
filter(
lambda x: x.attribute == "Record" and x.data_model == "measure",
vis_attrs_specs,
)
)
n_record = len(record_attrs)
for clause in vis_attrs_specs:
if clause.attribute != "Record":
if clause.data_model == "dimension":
n_dim += 1
if clause.data_model == "measure":
n_msr += 1
n_filter = len(filter_specs)
attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
dimension_lst = vis.get_attr_by_data_model("dimension")
measure_lst = vis.get_attr_by_data_model("measure")
v_size = len(vis.data)

if (
n_dim == 1
and (n_msr == 0 or n_msr == 1)
and ldf.current_vis is not None
and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
and len(ldf.current_vis) == 1
and ldf.current_vis[0].mark == "line"
and len(get_filter_specs(ldf.intent)) > 0
):
query_vc = VisList(ldf.current_vis, ldf)
query_vis = query_vc[0]
preprocess(query_vis)
preprocess(vis)
return 1 - euclidean_dist(query_vis, vis)

# Line/Bar Chart
# print("r:", n_record, "m:", n_msr, "d:",n_dim)
if n_dim == 1 and (n_msr == 0 or n_msr == 1):
if v_size < 2:
return -1

if n_filter == 0:
return unevenness(vis, ldf, measure_lst, dimension_lst)
elif n_filter == 1:
return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
# Histogram
elif n_dim == 0 and n_msr == 1:
if v_size < 2:
return -1
if n_filter == 0 and "Number of Records" in vis.data:
if "Number of Records" in vis.data:
v = vis.data["Number of Records"]
return skewness(v)
elif n_filter == 1 and "Number of Records" in vis.data:
return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
return -1
# Scatter Plot
elif n_dim == 0 and n_msr == 2:
if v_size < 10:
return -1
if vis.mark == "heatmap":
return weighted_correlation(vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"])
if n_filter == 1:
v_filter_size = get_filtered_size(filter_specs, vis.data)
sig = v_filter_size / v_size
else:
sig = 1
return sig * monotonicity(vis, attr_specs)
# Scatterplot colored by Dimension
elif n_dim == 1 and n_msr == 2:
if v_size < 10:
try:
filter_specs = utils.get_filter_specs(vis._inferred_intent)
vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
n_dim = vis._ndim
n_msr = vis._nmsr
n_filter = len(filter_specs)
attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
dimension_lst = vis.get_attr_by_data_model("dimension")
measure_lst = vis.get_attr_by_data_model("measure")
v_size = len(vis.data)

if (
n_dim == 1
and (n_msr == 0 or n_msr == 1)
and ldf.current_vis is not None
and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
and len(ldf.current_vis) == 1
and ldf.current_vis[0].mark == "line"
and len(get_filter_specs(ldf.intent)) > 0
):
query_vc = VisList(ldf.current_vis, ldf)
query_vis = query_vc[0]
preprocess(query_vis)
preprocess(vis)
return 1 - euclidean_dist(query_vis, vis)

# Line/Bar Chart
# print("r:", n_record, "m:", n_msr, "d:",n_dim)
if n_dim == 1 and (n_msr == 0 or n_msr == 1):
if v_size < 2:
return -1

if n_filter == 0:
return unevenness(vis, ldf, measure_lst, dimension_lst)
elif n_filter == 1:
return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
# Histogram
elif n_dim == 0 and n_msr == 1:
if v_size < 2:
return -1
if n_filter == 0 and "Number of Records" in vis.data:
if "Number of Records" in vis.data:
v = vis.data["Number of Records"]
return skewness(v)
elif n_filter == 1 and "Number of Records" in vis.data:
return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
return -1
color_attr = vis.get_attr_by_channel("color")[0].attribute
# Scatter Plot
elif n_dim == 0 and n_msr == 2:
if v_size < 10:
return -1
if vis.mark == "heatmap":
return weighted_correlation(
vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
)
if n_filter == 1:
v_filter_size = get_filtered_size(filter_specs, vis.data)
sig = v_filter_size / v_size
else:
sig = 1
return sig * monotonicity(vis, attr_specs)
# Scatterplot colored by Dimension
elif n_dim == 1 and n_msr == 2:
if v_size < 10:
return -1
color_attr = vis.get_attr_by_channel("color")[0].attribute

C = ldf.cardinality[color_attr]
if C < 40:
return 1 / C
else:
return -1
# Scatterplot colored by dimension
elif n_dim == 1 and n_msr == 2:
return 0.2
# Scatterplot colored by measure
elif n_msr == 3:
return 0.1
# colored line and barchart cases
elif vis.mark == "line" and n_dim == 2:
return 0.15
# for colored bar chart, scoring based on Chi-square test for independence score.
# gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
elif vis.mark == "bar" and n_dim == 2:
from scipy.stats import chi2_contingency

measure_column = vis.get_attr_by_data_model("measure")[0].attribute
dimension_columns = vis.get_attr_by_data_model("dimension")

groupby_column = dimension_columns[0].attribute
color_column = dimension_columns[1].attribute

contingency_tbl = pd.crosstab(
vis.data[groupby_column],
vis.data[color_column],
values=vis.data[measure_column],
aggfunc=sum,
)

C = ldf.cardinality[color_attr]
if C < 40:
return 1 / C
try:
color_cardinality = ldf.cardinality[color_column]
groupby_cardinality = ldf.cardinality[groupby_column]
# scale down score based on number of categories
chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
color_cardinality + groupby_cardinality
)
score = min(0.10, chi2_score)
except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0, which can happen if an applied filter results in a category having no counts
score = -1
return score
# Default
else:
return -1
# Scatterplot colored by dimension
elif n_dim == 1 and n_msr == 2:
return 0.2
# Scatterplot colored by measure
elif n_msr == 3:
return 0.1
# colored line and barchart cases
elif vis.mark == "line" and n_dim == 2:
return 0.15
# for colored bar chart, scoring based on Chi-square test for independence score.
# gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
elif vis.mark == "bar" and n_dim == 2:
from scipy.stats import chi2_contingency

measure_column = vis.get_attr_by_data_model("measure")[0].attribute
dimension_columns = vis.get_attr_by_data_model("dimension")

groupby_column = dimension_columns[0].attribute
color_column = dimension_columns[1].attribute

contingency_table = []
groupby_cardinality = ldf.cardinality[groupby_column]
groupby_unique_vals = ldf.unique_values[groupby_column]
for c in range(0, groupby_cardinality):
contingency_table.append(
vis.data[vis.data[groupby_column] == groupby_unique_vals[c]][measure_column]
)
score = 0.12
# ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in
# a category having no counts

try:
color_cardinality = ldf.cardinality[color_column]
# scale down score based on number of categories
chi2_score = chi2_contingency(contingency_table)[0] * 0.9 ** (
color_cardinality + groupby_cardinality
)
score = min(0.10, chi2_score)
except ValueError:
pass
return score
# Default
else:
except:
        # Suppress interestingness-related issues
warnings.warn(f"An error occurred when computing interestingness for: {vis}")
return -1


Expand Down
8 changes: 5 additions & 3 deletions lux/processor/Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,9 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]:
# TODO: Move validation check to Validator
# if ((clause.description in list(ldf.columns)) or clause.description == "?"):# if clause.description in the list of attributes
# clause.description contain ">","<". or "="
if any(ext in [">", "<", "=", "!="] for ext in clause.description):
if type(clause.description) == str and any(
ext in [">", "<", "=", "!="] for ext in clause.description
):
# then parse it and assign to clause.attribute, clause.filter_op, clause.values
clause.filter_op = re.findall(r"/.*/|>|=|<|>=|<=|!=", clause.description)[0]
split_description = clause.description.split(clause.filter_op)
Expand All @@ -107,7 +109,7 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]:
clause.attribute = clause.description
elif type(clause.description) == list:
clause.attribute = clause.description
# else: # then it is probably a value
# clause.values = clause.description
else: # then it is probably a value
clause.value = clause.description
return intent
# ldf._intent = intent
77 changes: 39 additions & 38 deletions lux/vis/Vis.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,10 @@ class Vis:
"""

def __init__(self, intent, source=None, title="", score=0.0):
self._intent = intent # This is the user's original intent to Vis
self._inferred_intent = intent # This is the re-written, expanded version of user's original intent (include inferred vis info)
self._source = source # This is the original data that is attached to the Vis
self._vis_data = (
None # This is the data that represents the Vis (e.g., selected, aggregated, binned)
)
self._intent = intent # user's original intent to Vis
self._inferred_intent = intent # re-written, expanded version of user's original intent
self._source = source # original data attached to the Vis
self._vis_data = None # processed data for Vis (e.g., selected, aggregated, binned)
self._code = None
self._mark = ""
self._min_max = {}
Expand All @@ -39,39 +37,42 @@ def __init__(self, intent, source=None, title="", score=0.0):
self.refresh_source(self._source)

def __repr__(self):
if self._source is None:
return f"<Vis ({str(self._intent)}) mark: {self._mark}, score: {self.score} >"
filter_intents = None
channels, additional_channels = [], []
for clause in self._inferred_intent:

if hasattr(clause, "value"):
if clause.value != "":
filter_intents = clause
if hasattr(clause, "attribute"):
if clause.attribute != "":
if clause.aggregation != "" and clause.aggregation is not None:
attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")"
elif clause.bin_size > 0:
attribute = "BIN(" + clause.attribute + ")"
else:
attribute = clause.attribute
if clause.channel == "x":
channels.insert(0, [clause.channel, attribute])
elif clause.channel == "y":
channels.insert(1, [clause.channel, attribute])
elif clause.channel != "":
additional_channels.append([clause.channel, attribute])

channels.extend(additional_channels)
str_channels = ""
for channel in channels:
str_channels += channel[0] + ": " + channel[1] + ", "

if filter_intents:
return f"<Vis ({str_channels[:-2]} -- [{filter_intents.attribute}{filter_intents.filter_op}{filter_intents.value}]) mark: {self._mark}, score: {self.score} >"
all_clause = all([isinstance(unit, lux.Clause) for unit in self._inferred_intent])
if all_clause:
filter_intents = None
channels, additional_channels = [], []
for clause in self._inferred_intent:

if hasattr(clause, "value"):
if clause.value != "":
filter_intents = clause
if hasattr(clause, "attribute"):
if clause.attribute != "":
if clause.aggregation != "" and clause.aggregation is not None:
attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")"
elif clause.bin_size > 0:
attribute = "BIN(" + clause.attribute + ")"
else:
attribute = clause.attribute
if clause.channel == "x":
channels.insert(0, [clause.channel, attribute])
elif clause.channel == "y":
channels.insert(1, [clause.channel, attribute])
elif clause.channel != "":
additional_channels.append([clause.channel, attribute])

channels.extend(additional_channels)
str_channels = ""
for channel in channels:
str_channels += channel[0] + ": " + channel[1] + ", "

if filter_intents:
return f"<Vis ({str_channels[:-2]} -- [{filter_intents.attribute}{filter_intents.filter_op}{filter_intents.value}]) mark: {self._mark}, score: {self.score} >"
else:
return f"<Vis ({str_channels[:-2]}) mark: {self._mark}, score: {self.score} >"
else:
return f"<Vis ({str_channels[:-2]}) mark: {self._mark}, score: {self.score} >"
# When Vis not compiled (e.g., when self._source not populated), print original intent
return f"<Vis ({str(self._intent)}) mark: {self._mark}, score: {self.score} >"

@property
def data(self):
Expand Down

0 comments on commit 42b89af

Please sign in to comment.