Skip to content

Commit

Permalink
Merge pull request #4231 from PrimozGodec/fix-box-plot
Browse files Browse the repository at this point in the history
[FIX] Various fixes of box plot
  • Loading branch information
lanzagar authored Nov 29, 2019
2 parents c58eaa4 + 7f79121 commit 89f31fc
Show file tree
Hide file tree
Showing 8 changed files with 547 additions and 505 deletions.
938 changes: 448 additions & 490 deletions Orange/data/_contingency.c

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions Orange/data/_contingency.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,16 @@ def contingency_floatarray(np.ndarray[np.float64_t, ndim=1] col_data, np.ndarray
i = ranks[i]
v = col_data[i]
tc = classes[i]
if v != last and not npy_isnan(v):
j += 1
V[j] = v
last = v
if npy_isnan(v) and npy_isnan(tc):
unknowns += W[i] if weights else 1.
elif npy_isnan(tc):
row_unknowns[j] += W[i] if weights else 1.
elif npy_isnan(v):
col_unknowns[int(tc)] += W[i] if weights else 1.
elif v != last:
j += 1
V[j] = v
last = v
C[int(tc),j] += W[i] if weights else 1.
else:
C[int(tc),j] += W[i] if weights else 1.

Expand Down
6 changes: 6 additions & 0 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import bottleneck as bn
import numpy as np
from scipy import sparse as sp
from scipy.sparse import issparse

import Orange.data # import for io.py
from Orange.data import (
Expand Down Expand Up @@ -1437,6 +1438,11 @@ def _compute_contingency(self, col_vars=None, row_var=None):
raise ValueError("contingency can be computed only for discrete "
"and continuous values")

# A single column selected from a sparse matrix is still two-dimensional
# and sparse. Since it is just one column, we can afford to convert it
# to a dense 1D array.
if issparse(row_data):
row_data = row_data.toarray().ravel()
if row_data.dtype.kind != "f": #meta attributes can be stored as type object
row_data = row_data.astype(float)

Expand Down
13 changes: 13 additions & 0 deletions Orange/statistics/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,19 @@ def from_data(cls, data, variable):
self.variable = variable
return self

@property
def array_with_unknowns(self):
"""
Return the distribution as a plain array with the unknowns appended.

The distribution values are copied into a new numpy array and
`self.unknowns` is appended after the last value.

Returns
-------
np.ndarray
Flattened copy of the distribution followed by `self.unknowns`.
"""
return np.append(np.array(self), self.unknowns)

def __getitem__(self, index):
if isinstance(index, str):
index = self.variable.to_val(index)
Expand Down
41 changes: 38 additions & 3 deletions Orange/tests/test_contingency.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix, csc_matrix

from Orange.data import DiscreteVariable, Table, Domain
from Orange.statistics import contingency
Expand All @@ -25,6 +26,7 @@ class TestDiscrete(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.zoo = data.Table("zoo")
cls.test9 = data.Table(test_filename("datasets/test9.tab"))

def test_discrete(self):
cont = contingency.Discrete(self.zoo, 0)
Expand Down Expand Up @@ -153,6 +155,16 @@ def test_continuous_missing(self):
0., 0., 0., 0., 0., 0., 0.])
self.assertEqual(cont.unknowns, 1)

# this case used to fail because of a bug in _contingency.pyx
d.Y[:50] = np.zeros(50) * float("nan")
cont = contingency.Continuous(d, "sepal width")
np.testing.assert_almost_equal(cont.col_unknowns, [0, 0, 0])
np.testing.assert_almost_equal(
cont.row_unknowns,
[0., 0., 1., 0., 0., 0., 0., 0., 1., 5., 5., 5., 2., 9., 6., 2.,
3., 4., 2., 1., 1., 1., 1.])
self.assertEqual(cont.unknowns, 1)

def test_mixedtype_metas(self):
import Orange
zoo = Orange.data.Table("zoo")
Expand Down Expand Up @@ -286,12 +298,35 @@ def test_get_contingencies(self):
assert_dist_equal(cont[2], [1, 0, 0])

def test_compute_contingency_metas(self):
d = data.Table(test_filename("datasets/test9.tab"))
var1, var2 = d.domain[-2], d.domain[-4]
cont = d._compute_contingency([var1], var2)[0][0]
var1, var2 = self.test9.domain[-2], self.test9.domain[-4]
cont = contingency.Discrete(self.test9, var1, var2)
assert_dist_equal(cont, [[3, 0, 0], [0, 2, 0],
[0, 0, 2], [0, 1, 0]])

def test_compute_contingency_row_attribute_sparse(self):
"""
Contingencies must be computed correctly when the variables are stored
in a sparse X, including the case where the row variable is sparse.
Both CSR and CSC storage are checked.
"""
d = self.test9
# NOTE(review): self.test9 appears to be the table created once in
# setUpClass, so replacing d.X here also changes it for tests that run
# afterwards - consider working on a copy.
# make X sparse (CSR)
d.X = csr_matrix(d.X)
var1, var2 = d.domain[0], d.domain[1]
# contingency of var1 against var2 ...
cont = contingency.Discrete(d, var1, var2)
assert_dist_equal(cont, [[1, 0], [1, 0], [1, 0], [1, 0],
[0, 1], [0, 1], [0, 1], [0, 1]])
# ... and with the two variables swapped
cont = contingency.Discrete(d, var2, var1)
assert_dist_equal(cont, [[1, 1, 1, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 1, 1, 1]])

# repeat the same checks with X stored as CSC
d.X = csc_matrix(d.X)
cont = contingency.Discrete(d, var1, var2)
assert_dist_equal(cont, [[1, 0], [1, 0], [1, 0], [1, 0],
[0, 1], [0, 1], [0, 1], [0, 1]])
cont = contingency.Discrete(d, var2, var1)
assert_dist_equal(cont, [[1, 1, 1, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 1, 1, 1]])

def test_compute_contingency_invalid(self):
rstate = np.random.RandomState(0xFFFF)
X = data.ContinuousVariable("X")
Expand Down
11 changes: 11 additions & 0 deletions Orange/tests/test_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,17 @@ def test_min_max(self):
self.assertEqual(self.num.min(), '1')
self.assertEqual(self.num.max(), '3')

def test_array_with_unknowns(self):
# Check that `array_with_unknowns` returns the frequencies of "type"
# followed by the number of unknown values.
d = data.Table("zoo")
# make the Y value of the first row unknown
d.Y[0] = np.nan
disc = distribution.Discrete(d, "type")
self.assertIsInstance(disc, np.ndarray)
self.assertEqual(disc.unknowns, 1)
# expected frequencies of the known values
true_freq = [4., 20., 13., 8., 10., 40., 5.]
assert_dist_equal(disc, true_freq)
# the single unknown is expected as the last element
np.testing.assert_array_equal(disc.array_with_unknowns,
np.append(true_freq, 1))


class TestContinuousDistribution(unittest.TestCase):
@classmethod
Expand Down
11 changes: 5 additions & 6 deletions Orange/widgets/visualize/owboxplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import Orange.data
from Orange.data.filter import FilterDiscrete, FilterContinuous, Values
from Orange.statistics import contingency, distribution
from Orange.statistics.contingency import Discrete

from Orange.widgets import widget, gui
from Orange.widgets.settings import (Setting, DomainContextHandler,
Expand Down Expand Up @@ -647,15 +646,14 @@ def _display_changed_disc(self):
[self.strudel(cont, i)
for i, cont in enumerate(self.conts.array_with_unknowns)
if np.sum(cont) > 0]
self.conts = self.conts[np.sum(np.array(self.conts), axis=1) > 0]

if self.sort_freqs:
# pylint: disable=invalid-unary-operand-type
self.order = sorted(
self.order, key=(-np.sum(
self.conts.array_with_unknowns, axis=1)).__getitem__)
else:
self.boxes = [self.strudel(self.dist, self.dist.unknowns)]
self.boxes = [self.strudel(self.dist.array_with_unknowns)]

for row, box_index in enumerate(self.order):
y = (-len(self.boxes) + row) * 40 + 10
Expand Down Expand Up @@ -921,9 +919,10 @@ def draw_axis_disc(self):
step = steps = 10
else:
if self.group_var:
max_box = max(float(np.sum(dist)) for dist in self.conts)
max_box = max(float(np.sum(dist))
for dist in self.conts.array_with_unknowns)
else:
max_box = float(np.sum(self.dist))
max_box = float(np.sum(self.dist.array_with_unknowns))
if max_box == 0:
self.scale_x = 1
return
Expand All @@ -944,7 +943,7 @@ def draw_axis_disc(self):
right_offset = 0 # offset for the right label
if not self.show_stretched and self.labels:
if self.group_var:
rows = list(zip(self.conts, self.labels))
rows = list(zip(self.conts.array_with_unknowns, self.labels))
else:
rows = [(self.dist, self.labels[0])]
# available space left of the 'group labels'
Expand Down
23 changes: 22 additions & 1 deletion Orange/widgets/visualize/tests/test_owboxplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import numpy as np
from AnyQt.QtCore import QItemSelectionModel

from Orange.data import Table, ContinuousVariable, StringVariable, Domain
from Orange.data import Table, ContinuousVariable, StringVariable, Domain, \
DiscreteVariable
from Orange.widgets.visualize.owboxplot import (
OWBoxPlot, FilterGraphicsRectItem, _quantiles
)
Expand Down Expand Up @@ -299,6 +300,26 @@ def test_stretching(self):
self.__select_group("chest pain")
self.assertTrue(enabled())

def test_value_all_missing_for_group(self):
"""
Edge case: for one value of the group variable, every value of the
selected variable is missing. The box plot should handle this.
"""
# rows with a == 1 ("v2") have only missing values of "b"
data = Table(Domain([DiscreteVariable("a", values=["v1", "v2", "v3"]),
DiscreteVariable("b", values=["v3", "v4"])]),
[[0., 0.],
[0., 1.],
[1., np.nan],
[1., np.nan],
[2., 0.],
[2., 0.]])
self.send_signal(self.widget.Inputs.data, data)

self.__select_variable("b")
self.__select_group("a")
# one row per value of "a" (including the all-missing "v2") and one
# column per value of "b"
self.assertTupleEqual(self.widget.conts.shape, (3, 2))


class TestUtils(unittest.TestCase):
def test(self):
Expand Down

0 comments on commit 89f31fc

Please sign in to comment.