Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle kdims referencing multi-index in dask #2789

Merged
merged 1 commit into from
Jun 11, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions holoviews/core/data/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from dask.dataframe import DataFrame

from .. import util
from ..dimension import Dimension
from ..element import Element
from ..ndmapping import NdMapping, item_check, OrderedDict
from .interface import Interface
Expand Down Expand Up @@ -44,10 +45,18 @@ class DaskInterface(PandasInterface):

@classmethod
def init(cls, eltype, data, kdims, vdims):
    """
    Initialize the data for a dask-backed dataset.

    Delegates to ``PandasInterface.init`` and converts the resulting
    data to a dask DataFrame if it is not one already. When one or more
    key dimensions are not columns of the data, the index is reset
    speculatively; the reset frame is only adopted if it makes every
    key dimension available as a column.

    Returns the ``(data, dims, extra)`` triple produced by
    ``PandasInterface.init``, with ``data`` replaced by the (possibly
    index-reset) dask DataFrame.
    """
    data, dims, extra = PandasInterface.init(eltype, data, kdims, vdims)
    if not isinstance(data, DataFrame):
        data = dd.from_pandas(data, npartitions=cls.default_partitions, sort=False)
    # Key dimensions may be Dimension objects or plain names; compare by name.
    kdims = [d.name if isinstance(d, Dimension) else d for d in dims['kdims']]

    # If any key dimension cannot be found among the columns,
    # speculatively reset the index to work around lacking dask
    # support for MultiIndex
    if any(d not in data.columns for d in kdims):
        reset = data.reset_index()
        # Only keep the reset frame if it actually exposes every key
        # dimension; otherwise leave the data untouched.
        if all(d in reset.columns for d in kdims):
            data = reset
    return data, dims, extra

@classmethod
def shape(cls, dataset):
Expand Down
3 changes: 1 addition & 2 deletions tests/core/data/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def init_column_data(self):
# all interfaces.

def test_dataset_array_init_hm(self):
"Tests support for arrays (homogeneous)"
dataset = Dataset(np.column_stack([self.xs, self.xs_2]),
kdims=['x'], vdims=['x2'])
self.assertTrue(isinstance(dataset.data, self.data_type))
Expand All @@ -106,7 +105,7 @@ def test_dataset_dataframe_init_hm(self):
if pd is None:
raise SkipTest("Pandas not available")
dataset = Dataset(pd.DataFrame({'x':self.xs, 'x2':self.xs_2}),
kdims=['x'], vdims=[ 'x2'])
kdims=['x'], vdims=['x2'])
self.assertTrue(isinstance(dataset.data, self.data_type))

def test_dataset_dataframe_init_hm_alias(self):
Expand Down
21 changes: 20 additions & 1 deletion tests/core/data/testdaskinterface.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
from nose.plugins.attrib import attr
from unittest import SkipTest

import numpy as np

try:
import pandas as pd
import dask.dataframe as dd
except:
dd = None
raise SkipTest("Could not import dask, skipping DaskInterface tests.")

from holoviews.core.data import Dataset

from .testpandasinterface import PandasInterfaceTests


@attr(optional=1)
class DaskDatasetTest(PandasInterfaceTests):
"""
Test of the pandas DaskDataset interface.
Expand Down Expand Up @@ -46,3 +53,15 @@ def test_dataset_sort_string_ht(self):

def test_dataset_boolean_index(self):
raise SkipTest("Not supported")

def test_dataset_from_multi_index(self):
    # Grouping the dask frame by 'x' and 'y' yields a frame whose key
    # dimensions are no longer plain columns; a Dataset declared with
    # those kdims should equal one built from the flat pandas frame.
    frame = pd.DataFrame({
        'x': np.arange(10),
        'y': np.arange(10),
        'z': np.random.rand(10),
    })
    lazy_frame = dd.from_pandas(frame, 1)
    aggregated = lazy_frame.groupby(['x', 'y']).mean()
    from_index = Dataset(aggregated, ['x', 'y'])
    from_columns = Dataset(frame, ['x', 'y'])
    self.assertEqual(from_index, from_columns)

def test_dataset_from_multi_index_tuple_dims(self):
    # Same check as the plain-name variant, but with the key dimensions
    # declared as (name, label) tuples.
    frame = pd.DataFrame({
        'x': np.arange(10),
        'y': np.arange(10),
        'z': np.random.rand(10),
    })
    lazy_frame = dd.from_pandas(frame, 1)
    aggregated = lazy_frame.groupby(['x', 'y']).mean()
    from_index = Dataset(aggregated, [('x', 'X'), ('y', 'Y')])
    from_columns = Dataset(frame, [('x', 'X'), ('y', 'Y')])
    self.assertEqual(from_index, from_columns)
10 changes: 10 additions & 0 deletions tests/core/data/testpandasinterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,13 @@ def test_dataset_conversion_groupby_with_index(self):
hmap = HoloMap({0: Scatter(([0, 1], [1, 2]), 'index', 'y'),
1: Scatter([(2, 3)], 'index', 'y')}, 'x')
self.assertEqual(scatters, hmap)

def test_dataset_from_multi_index(self):
    # Grouping by 'x' and 'y' moves both into the index of the
    # aggregated frame; a Dataset declared with those kdims should
    # equal one built from the flat frame.
    frame = pd.DataFrame({
        'x': np.arange(10),
        'y': np.arange(10),
        'z': np.random.rand(10),
    })
    aggregated = frame.groupby(['x', 'y']).mean()
    from_index = Dataset(aggregated, ['x', 'y'])
    from_columns = Dataset(frame, ['x', 'y'])
    self.assertEqual(from_index, from_columns)

def test_dataset_from_multi_index_tuple_dims(self):
    # Same check as the plain-name variant, but with the key dimensions
    # declared as (name, label) tuples.
    frame = pd.DataFrame({
        'x': np.arange(10),
        'y': np.arange(10),
        'z': np.random.rand(10),
    })
    aggregated = frame.groupby(['x', 'y']).mean()
    from_index = Dataset(aggregated, [('x', 'X'), ('y', 'Y')])
    from_columns = Dataset(frame, [('x', 'X'), ('y', 'Y')])
    self.assertEqual(from_index, from_columns)