Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Data Info: Show statistics about missing values #6623

Merged
merged 2 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1551,18 +1551,47 @@ def has_missing_class(self):
return bn.anynan(self._Y)

@staticmethod
def __get_nan_frequency(data):
def __get_nan_count(data):
if data.size == 0:
return 0
dense = data if not sp.issparse(data) else data.data
return np.isnan(dense).sum() / np.prod(data.shape)
return np.isnan(dense).sum()

@classmethod
def __get_nan_frequency(cls, data):
return cls.__get_nan_count(data) / (np.prod(data.shape) or 1)

def get_nan_count_attribute(self):
    """Return the number of NaN entries in the attribute matrix ``X``."""
    attribute_values = self.X
    return self.__get_nan_count(attribute_values)

def get_nan_count_class(self):
    """Return the number of NaN entries in the class matrix ``Y``."""
    class_values = self.Y
    return self.__get_nan_count(class_values)

def get_nan_count_metas(self):
    """
    Return the number of missing values among the meta attributes.

    If ``metas`` is not of dtype ``object``, missing values are the NaN
    entries, counted by the same helper that is used for ``X`` and ``Y``.

    If ``metas`` has dtype ``object``, every column is checked according
    to the corresponding variable in ``self.domain.metas``: for a
    `StringVariable`, an empty string counts as missing; for any other
    variable, the column is cast to float and NaNs count as missing.

    Returns:
        the total number of missing meta values
    """
    if self.metas.dtype != object:
        return self.__get_nan_count(self.metas)

    data = self.metas
    if sp.issparse(data):
        # Columns are sliced one by one below; CSC is presumably chosen
        # because it is the column-oriented sparse format.
        data = data.tocsc()

    count = 0
    for i, attr in enumerate(self.domain.metas):
        col = data[:, i]
        if isinstance(attr, StringVariable):
            # Compare only this column: comparing the entire `data` array
            # would count the empty strings of *all* columns once per
            # string variable.
            missing = col == ""
        else:
            missing = np.isnan(col.astype(float))
        count += np.sum(missing)
    return count

def get_nan_frequency_attribute(self):
    """Return the fraction of entries in the attribute matrix ``X`` that are NaN."""
    attribute_values = self.X
    return self.__get_nan_frequency(attribute_values)

def get_nan_frequency_class(self):
    """Return the fraction of entries in the class matrix ``Y`` that are NaN."""
    class_values = self.Y
    return self.__get_nan_frequency(class_values)

def get_nan_frequency_metas(self):
    """
    Return the fraction of meta values that are missing.

    The count from `get_nan_count_metas` is divided by the number of
    cells in ``metas``; a cell count of zero is replaced by 1 so that an
    empty ``metas`` does not cause a division by zero.
    """
    n_missing = self.get_nan_count_metas()
    n_cells = np.prod(self.metas.shape) or 1
    return n_missing / n_cells

def checksum(self, include_metas=True):
# TODO: zlib.adler32 does not work for numpy arrays with dtype object
# (after pickling and unpickling such arrays, checksum changes)
Expand Down
45 changes: 36 additions & 9 deletions Orange/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2366,31 +2366,58 @@ def test_value_assignment(self):

class TestTableStats(TableTests):
def test_get_nan_frequency(self):
metas = [DiscreteVariable("x", values=tuple("abc")), StringVariable("s")]
meta_data = np.array([list(range(self.nrows)), ["x"] * self.nrows]).T
domain = self.create_domain(self.attributes, self.class_vars)
table = data.Table(domain, self.data, self.class_data)
self.assertEqual(table.get_nan_frequency_attribute(), 0)
self.assertEqual(table.get_nan_frequency_class(), 0)
domain = Domain(domain.attributes, domain.class_vars, metas)
table = data.Table(domain, self.data, self.class_data, meta_data)

def test_counts(at, cl, me):
x, y, metas = table.X, table.Y, table.metas
for _ in range(2):
self.assertEqual(table.get_nan_count_attribute(), at)
self.assertEqual(table.get_nan_count_class(), cl)
self.assertEqual(table.get_nan_count_metas(), me)
self.assertEqual(table.get_nan_frequency_attribute(), at / np.prod(x.shape))
self.assertEqual(table.get_nan_frequency_class(), cl / np.prod(y.shape))
self.assertEqual(table.get_nan_frequency_metas(), me / np.prod(metas.shape))
with table.unlocked():
table.X = sp.csr_matrix(x)
table.Y = sp.csr_matrix(y)
with table.unlocked():
table.X, table.Y = x, y

test_counts(0, 0, 0)

with table.unlocked():
table.X[1, 2] = table.X[4, 5] = np.nan
self.assertEqual(table.get_nan_frequency_attribute(), 2 / table.X.size)
self.assertEqual(table.get_nan_frequency_class(), 0)
test_counts(2, 0, 0)

with table.unlocked():
table.Y[3:6] = np.nan
self.assertEqual(table.get_nan_frequency_attribute(), 2 / table.X.size)
self.assertEqual(table.get_nan_frequency_class(), 3 / table.Y.size)
test_counts(2, 3, 0)

with table.unlocked():
table.X[1, 2] = table.X[4, 5] = 0
self.assertEqual(table.get_nan_frequency_attribute(), 0)
self.assertEqual(table.get_nan_frequency_class(), 3 / table.Y.size)
test_counts(0, 3, 0)

with table.unlocked():
table.metas[1, 0] = table.metas[3, 0] = np.nan
test_counts(0, 3, 2)

with table.unlocked():
table.metas[5, 1] = ""
test_counts(0, 3, 3)

def test_get_nan_frequency_empty_table(self):
    """
    All missing-value counts and frequencies are 0 for a table created
    by `Table.from_domain` (expected to be empty, as the test name says).
    """
    empty = data.Table.from_domain(
        self.create_domain(self.attributes, self.class_vars))
    # Counts first, then frequencies; each for attributes, class and metas.
    statistics = (
        empty.get_nan_count_attribute,
        empty.get_nan_count_class,
        empty.get_nan_count_metas,
        empty.get_nan_frequency_attribute,
        empty.get_nan_frequency_class,
        empty.get_nan_frequency_metas,
    )
    for statistic in statistics:
        self.assertEqual(statistic(), 0)


class TestRowInstance(unittest.TestCase):
Expand Down
59 changes: 44 additions & 15 deletions Orange/widgets/data/owdatainfo.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
import threading
import textwrap

from Orange.data import \
Table, StringVariable, DiscreteVariable, ContinuousVariable
try:
from Orange.data.sql.table import SqlTable
except ImportError:
SqlTable = None
import numpy as np

from Orange.widgets import widget, gui
from Orange.widgets.utils.localization import pl
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Input

from Orange.data import \
Table, StringVariable, DiscreteVariable, ContinuousVariable

# `is_sql` is defined in one of two ways, depending on whether the SQL
# table class can be imported.
try:
    from Orange.data.sql.table import SqlTable
except ImportError:
    def is_sql(_):
        """Return False: SqlTable is unavailable, so nothing can be one."""
        return False
else:
    def is_sql(data):
        """Return True if `data` is an instance of SqlTable."""
        return isinstance(data, SqlTable)


class OWDataInfo(widget.OWWidget):
name = "Data Info"
Expand Down Expand Up @@ -53,12 +60,13 @@ def data(self, data):
("Size", self._p_size),
("Features", self._p_features),
("Targets", self._p_targets),
("Metas", self._p_metas))
("Metas", self._p_metas),
("Missing data", self._p_missing))
if bool(value := func(data))}
self.data_attrs = data.attributes
self.update_info()

if SqlTable is not None and isinstance(data, SqlTable):
if is_sql(data):
def set_exact_length():
self.data_desc["Size"] = self._p_size(data, exact=True)
self.update_info()
Expand Down Expand Up @@ -101,16 +109,18 @@ def _p_name(data):

@staticmethod
def _p_location(data):
if SqlTable is not None and isinstance(data, SqlTable):
connection_string = ' '.join(
f'{key}={value}'
for key, value in data.connection_params.items()
if value is not None and key != 'password')
return f"SQL Table using connection:<br/>{connection_string}"
if not is_sql(data):
return None

connection_string = ' '.join(
f'{key}={value}'
for key, value in data.connection_params.items()
if value is not None and key != 'password')
return f"SQL Table using connection:<br/>{connection_string}"

@staticmethod
def _p_size(data, exact=False):
exact = exact or SqlTable is None or not isinstance(data, SqlTable)
exact = exact or is_sql(data)
if exact:
n = len(data)
desc = f"{n} {pl(n, 'row')}"
Expand Down Expand Up @@ -152,6 +162,25 @@ def _p_targets(self, data):
def _p_metas(cls, data):
return cls._pack_var_counts(data.domain.metas)

@staticmethod
def _p_missing(data: Table):
    """
    Describe the missing values in `data`.

    For each part of the table (attributes, class variables, metas) that
    contains missing values, the description gives their number and their
    share of all cells of that part.

    Args:
        data: the table to describe

    Returns:
        str: ``"(not checked for SQL data)"`` if `is_sql(data)` is true,
        ``"none"`` if no part has missing values, otherwise a
        comma-separated list with one entry per part with missing values
    """
    if is_sql(data):
        return "(not checked for SQL data)"

    domain = data.domain
    # `pl` is given the singular form, as in its other uses in this file
    # (e.g. 'row'); passing "targets" would presumably yield "targetss".
    parts = (
        (pl(len(domain.attributes), "feature"),
         data.X, data.get_nan_count_attribute()),
        (pl(len(domain.class_vars), "target"),
         data.Y, data.get_nan_count_class()),
        (pl(len(domain.metas), "meta variable"),
         data.metas, data.get_nan_count_metas()),
    )
    # n_miss > 0 implies the part has at least one cell, so the division
    # below cannot be by zero.
    counts = [
        f"{n_miss} ({n_miss / np.prod(part.shape):.1%}) in {name}"
        for name, part, n_miss in parts
        if n_miss
    ]
    if not counts:
        return "none"
    return ", ".join(counts)

@staticmethod
def _count(s, tpe):
    """Return how many items of the iterable `s` are instances of `tpe`."""
    return len([item for item in s if isinstance(item, tpe)])
Expand Down
22 changes: 13 additions & 9 deletions Orange/widgets/data/tests/test_owdatainfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,24 @@ def test_data(self):
# combinations that must not crash
a, b, c = (DiscreteVariable(n) for n in "abc")
x, y, z = (ContinuousVariable(n) for n in "xyz")
m, n = (StringVariable(n) for n in "nm")
m, n = (StringVariable(n) for n in "mn")
meta_s = np.array([["foo", "bar", ""]]).T
meta_c = np.array([[3.14, np.nan, np.nan]]).T
metadata = np.hstack((meta_s, meta_c))
self.widget.send_report()
for attrs, classes, metas in (((a, b, c), (), ()),
((a, b, c, x), (y,), ()),
((a, b, c), (y, x), (m, )),
((a, b), (y, x, c), (m, )),
((a, ), (b, c), (m, )),
((a, b, x), (c, ), (m, y)),
((), (c, ), (m, y))):
for attrs, classes, metas, metad in (((a, b, c), (), (), None),
((a, b, c, x), (y,), (), None),
((a, b, c), (y, x), (m, ), meta_s),
((a, b, c), (y, ), (x, ), meta_c),
((a, b), (y, x, c), (m, ), meta_s),
((a, ), (b, c), (m, ), meta_s),
((a, b, x), (c, ), (m, y), metadata),
((), (c, ), (m, y), metadata)):
data = Table.from_numpy(
Domain(attrs, classes, metas),
np.zeros((3, len(attrs))),
np.zeros((3, len(classes))),
np.full((3, len(metas)), object()))
metad)
data.attributes = {"att 1": 1, "att 2": True, "att 3": 3}
if metas:
data.name = "name"
Expand Down