
Implement grid concatenation and standardize datatype casting #2762

Merged
14 commits merged on Jun 22, 2018
27 changes: 25 additions & 2 deletions holoviews/core/data/__init__.py
@@ -17,12 +17,14 @@
from .multipath import MultiInterface # noqa (API import)
from .image import ImageInterface # noqa (API import)

default_datatype = 'dictionary'
datatypes = ['dictionary', 'grid']

try:
import pandas as pd # noqa (Availability import)
from .pandas import PandasInterface
datatypes = ['dataframe', 'dictionary', 'grid', 'array']
default_datatype = 'dataframe'
Member Author:
When converting from gridded to columnar data, the code usually has to cast the data to a specific datatype. Various places hardcoded ['pandas', 'dictionary'] for this; defining a default_datatype avoids having to hardcode it all over the place.

Member:
Shouldn't this be 'default_columnar_datatype', then? Or are there no cases where columnar data needs to be cast into some gridded data type?

Member Author:
Columnar data cannot be cast to gridded data without some kind of aggregation occurring, so that's correct. I'd still be okay with changing it to default_columnar_datatype.
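To make that concrete, here is a small illustration of why the columnar-to-gridded direction needs aggregation (a sketch using plain pandas rather than the HoloViews interfaces; the column names and values are made up):

    import pandas as pd

    # A columnar table may contain several samples for the same (x, y) location,
    # so there is no unique grid value until the duplicates are combined somehow.
    df = pd.DataFrame({'x': [0, 0, 1, 1, 1],
                       'y': [0, 1, 0, 1, 1],
                       'z': [1.0, 2.0, 3.0, 4.0, 6.0]})

    # pivot_table only produces a grid once an aggregation function is chosen.
    grid = df.pivot_table(index='y', columns='x', values='z', aggfunc='mean')
    print(grid)  # the (x=1, y=1) cell is the mean of 4.0 and 6.0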

datatypes = ['dataframe', 'dictionary', 'grid']
DFColumns = PandasInterface
except ImportError:
pd = None
@@ -64,6 +66,27 @@
from .. import util


def concat(datasets, datatype=None):
"""
Concatenates multiple datasets wrapped in an NdMapping type
along all of its dimensions. Before concatenation all datasets
are cast to the same datatype. For columnar data concatenation
Contributor:
'same datatype' determined how?

Member Author:
Either explicitly defined or the type of the first dataset that was passed in.

Contributor:
Would be good to state that bit about it being chosen from the first one if not explicitly set.

adds the columns for the dimensions being concatenated along
and then concatenates all the old and new columns. For gridded
data a new axis is created for each dimension being concatenated
along, and the data is then concatenated hierarchically along each dimension.

Signature
---------

datasets: NdMapping of Datasets defining dimensions to concatenate on
datatype: Datatype to cast data to before concatenation

Returns: Dataset
"""
return Interface.concatenate(datasets, datatype)
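A rough usage sketch for the new utility (assumes pandas is installed so the datasets default to the dataframe interface; the 'iteration' key dimension and the values are made up):

    import holoviews as hv
    from holoviews.core.data import concat

    # Two columnar datasets keyed by a hypothetical 'iteration' dimension.
    ds1 = hv.Dataset({'x': [0, 1], 'y': [1.0, 2.0]}, kdims=['x'], vdims=['y'])
    ds2 = hv.Dataset({'x': [0, 1], 'y': [3.0, 4.0]}, kdims=['x'], vdims=['y'])
    keyed = hv.HoloMap({0: ds1, 1: ds2}, kdims=['iteration'])

    # Concatenation adds an 'iteration' column and appends the rows.
    combined = concat(keyed)
    print(combined.dimension_values('iteration'))  # [0 0 1 1]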


class DataConversion(object):
"""
DataConversion is a very simple container object which can be
@@ -291,7 +314,7 @@ def add_dimension(self, dimension, dim_pos, dim_val, vdim=False, **kwargs):
dimensions = dict(kdims=dims)

if issubclass(self.interface, ArrayInterface) and np.asarray(dim_val).dtype != self.data.dtype:
element = self.clone(datatype=['pandas', 'dictionary'])
element = self.clone(datatype=[default_datatype])
data = element.interface.add_dimension(element, dimension, dim_pos, dim_val, vdim)
else:
data = self.interface.add_dimension(self, dimension, dim_pos, dim_val, vdim)
6 changes: 0 additions & 6 deletions holoviews/core/data/array.py
@@ -104,12 +104,6 @@ def add_dimension(cls, dataset, dimension, dim_pos, values, vdim):
return np.insert(data, dim_pos, values, axis=1)


@classmethod
def concat(cls, dataset_objs):
cast_objs = cls.cast(dataset_objs)
return np.concatenate([col.data for col in cast_objs])


@classmethod
def sort(cls, dataset, by=[], reverse=False):
data = dataset.data
11 changes: 8 additions & 3 deletions holoviews/core/data/dask.py
@@ -244,9 +244,14 @@ def add_dimension(cls, columns, dimension, dim_pos, values, vdim):
return data

@classmethod
def concat(cls, columns_objs):
cast_objs = cls.cast(columns_objs)
return dd.concat([col.data for col in cast_objs])
def concat(cls, datasets, dimensions, vdims):
dataframes = []
for key, ds in datasets:
data = ds.data.copy()
for d, k in zip(dimensions, key):
data[d.name] = k
dataframes.append(data)
return dd.concat(dataframes)

@classmethod
def dframe(cls, columns, dimensions):
30 changes: 17 additions & 13 deletions holoviews/core/data/dictionary.py
@@ -1,5 +1,4 @@
from collections import OrderedDict

from collections import OrderedDict, defaultdict
try:
import itertools.izip as zip
except ImportError:
@@ -139,7 +138,10 @@ def unpack_scalar(cls, dataset, data):
key = list(data.keys())[0]

if len(data[key]) == 1 and key in dataset.vdims:
return data[key][0]
scalar = data[key][0]
return scalar.compute() if hasattr(scalar, 'compute') else scalar
return data


@classmethod
def isscalar(cls, dataset, dim):
@@ -185,17 +187,19 @@ def redim(cls, dataset, dimensions):
renamed.append((k, v))
return OrderedDict(renamed)


@classmethod
def concat(cls, dataset_objs):
cast_objs = cls.cast(dataset_objs)
cols = set(tuple(c.data.keys()) for c in cast_objs)
if len(cols) != 1:
raise Exception("In order to concatenate, all Dataset objects "
"should have matching set of columns.")
concatenated = OrderedDict()
for column in cols.pop():
concatenated[column] = np.concatenate([obj[column] for obj in cast_objs])
return concatenated
def concat(cls, datasets, dimensions, vdims):
columns = defaultdict(list)
for key, ds in datasets:
for k, vals in ds.data.items():
columns[k].append(vals)
for d, k in zip(dimensions, key):
columns[d.name].append(np.full(len(ds), k))

template = datasets[0][1]
dims = dimensions+template.dimensions()
return OrderedDict([(d.name, np.concatenate(columns[d.name])) for d in dims])


@classmethod
45 changes: 40 additions & 5 deletions holoviews/core/data/grid.py
@@ -17,13 +17,15 @@
except ImportError:
da = None

def is_dask(array):
return da and isinstance(array, da.Array)

from .dictionary import DictInterface
from .interface import Interface, DataError
from ..dimension import Dimension
from ..element import Element
from ..dimension import OrderedDict as cyODict
from ..ndmapping import NdMapping, item_check
from ..ndmapping import NdMapping, item_check, sorted_context
from .. import util


@@ -116,14 +118,47 @@ def init(cls, eltype, data, kdims, vdims):
return data, {'kdims':kdims, 'vdims':vdims}, {}


@classmethod
def concat(cls, datasets, dimensions, vdims):
from . import Dataset
with sorted_context(False):
datasets = NdMapping(datasets, kdims=dimensions)
datasets = datasets.clone([(k, v.data if isinstance(v, Dataset) else v)
for k, v in datasets.data.items()])
if len(datasets.kdims) > 1:
items = datasets.groupby(datasets.kdims[:-1]).data.items()
return cls.concat([(k, cls.concat(v, v.kdims, vdims=vdims)) for k, v in items],
datasets.kdims[:-1], vdims)
return cls.concat_dim(datasets, datasets.kdims[0], vdims)


@classmethod
def concat_dim(cls, datasets, dim, vdims):
values, grids = zip(*datasets.items())
new_data = {k: v for k, v in grids[0].items() if k not in vdims}
new_data[dim.name] = np.array(values)
for vdim in vdims:
arrays = [grid[vdim.name] for grid in grids]
shapes = set(arr.shape for arr in arrays)
if len(shapes) > 1:
raise DataError('When concatenating gridded data the shape '
'of arrays must match. %s found that arrays '
'along the %s dimension do not match.' %
(cls.__name__, vdim.name))
stack = da.stack if any(is_dask(arr) for arr in arrays) else np.stack
new_data[vdim.name] = stack(arrays, -1)
return new_data
Member Author:
Since arrays cannot be concatenated along multiple axes at once, the implementation of concat on gridded interfaces has two components: a general concat method coordinates hierarchical concatenation along each dimension, and it uses the interface-specific concat_dim implementations to concatenate along one particular axis or dimension.
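A rough numpy illustration of that hierarchical scheme (not the interface code itself; shapes and dimension names are made up):

    import numpy as np

    # Four 2D value arrays laid out on a 2x2 grid of (time, realization) keys.
    grids = {(t, r): np.random.rand(10, 20) for t in (0, 1) for r in (0, 1)}

    # concat_dim stacks along one new axis at a time...
    per_time = {t: np.stack([grids[(t, r)] for r in (0, 1)], axis=-1) for t in (0, 1)}
    # ...and concat repeats the process for the remaining dimension.
    full = np.stack([per_time[t] for t in (0, 1)], axis=-1)
    print(full.shape)  # (10, 20, 2, 2)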



@classmethod
def irregular(cls, dataset, dim):
return dataset.data[dim.name if isinstance(dim, Dimension) else dim].ndim > 1


@classmethod
def isscalar(cls, dataset, dim):
return np.unique(cls.values(dataset, dim, expanded=False)) == 1
values = cls.values(dataset, dim, expanded=False)
return values.shape in ((), (1,)) or len(np.unique(values)) == 1


@classmethod
@@ -541,9 +576,9 @@ def aggregate(cls, dataset, kdims, function, **kwargs):
axes = tuple(dataset.ndims-dataset.get_dimension_index(kdim)-1
for kdim in dataset.kdims if kdim not in kdims)
for vdim in dataset.vdims:
data[vdim.name] = np.atleast_1d(function(dataset.data[vdim.name],
axis=axes, **kwargs))

values = dataset.data[vdim.name]
atleast_1d = da.atleast_1d if is_dask(values) else np.atleast_1d
data[vdim.name] = atleast_1d(function(values, axis=axes, **kwargs))
return data


74 changes: 37 additions & 37 deletions holoviews/core/data/interface.py
@@ -4,7 +4,7 @@
import numpy as np

from ..element import Element
from ..ndmapping import OrderedDict
from ..ndmapping import OrderedDict, NdMapping
from .. import util


@@ -107,30 +107,20 @@ class Interface(param.Parameterized):
def register(cls, interface):
cls.interfaces[interface.datatype] = interface


@classmethod
def cast(cls, dataset, datatype=None, cast_type=None):
def cast(cls, datasets, datatype=None, cast_type=None):
"""
Given a list of Dataset objects, cast them to the specified
datatype (by default the format matching the current interface)
with the given cast_type (if specified).
"""
if len({type(c) for c in dataset}) > 1 and cast_type is None:
raise Exception("Please supply the common cast type")

if datatype is None:
datatype = cls.datatype

unchanged = all({c.interface==cls for c in dataset})
if unchanged and cast_type is None:
return dataset
elif unchanged:
return [cast_type(co, **dict(util.get_param_values(co)))
for co in dataset]

return [co.clone(co.columns(), datatype=[datatype], new_type=cast_type)
for co in dataset]

datatype = datatype or cls.datatype
cast = []
for ds in datasets:
if cast_type is not None or ds.interface.datatype != datatype:
ds = ds.clone(ds, datatype=[datatype], new_type=cast_type)
Member Author:
Casting works quite simply: if Interface.initialize is passed another dataset and finds a mismatch between the supplied datatype and the requested datatype, it deconstructs the original dataset into the columnar or gridded tuple format, which is supported by all interfaces. In this way a dataset can easily be cast to any other datatype, except for columnar -> gridded conversions.
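A minimal sketch of that casting path (assumes pandas is installed so the dataset starts on the dataframe interface; data values are made up):

    import holoviews as hv

    ds = hv.Dataset({'x': [0, 1, 2], 'y': [1.0, 2.0, 3.0]}, kdims=['x'], vdims=['y'])
    print(ds.interface.datatype)  # 'dataframe'

    # Passing the dataset itself with a different datatype triggers the
    # deconstruct-and-rebuild path described above.
    dict_ds = ds.clone(ds, datatype=['dictionary'])
    print(dict_ds.interface.datatype)  # 'dictionary'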

cast.append(ds)
return cast

@classmethod
def error(cls):
@@ -167,7 +157,7 @@ def initialize(cls, eltype, data, kdims, vdims, datatype=None):

if data.interface.datatype in datatype and data.interface.datatype in eltype.datatype:
data = data.data
elif data.interface.gridded:
elif data.interface.gridded and any(cls.interfaces[dt].gridded for dt in datatype):
gridded = OrderedDict([(kd.name, data.dimension_values(kd.name, expanded=False))
for kd in data.kdims])
for vd in data.vdims:
@@ -307,25 +297,35 @@ def range(cls, dataset, dimension):
return column[0], column[-1]

@classmethod
def concatenate(cls, dataset, datatype=None):
def concatenate(cls, datasets, datatype=None, new_type=None):
"""
Utility function to concatenate a list of Column objects,
returning a new Dataset object. Note that this is unlike the
.concat method which only concatenates the data.
Utility function to concatenate an NdMapping of Dataset objects.
"""
if len(set(type(c) for c in dataset)) != 1:
raise Exception("All inputs must be same type in order to concatenate")

interfaces = set(c.interface for c in dataset)
if len(interfaces)!=1 and datatype is None:
raise Exception("Please specify the concatenated datatype")
elif len(interfaces)!=1:
interface = cls.interfaces[datatype]
else:
interface = interfaces.pop()

concat_data = interface.concat(dataset)
return dataset[0].clone(concat_data)
from . import Dataset, default_datatype
new_type = new_type or Dataset
if isinstance(datasets, NdMapping):
dimensions = datasets.kdims
datasets = datasets.data
if isinstance(datasets, (dict, OrderedDict)):
datasets = datasets.items()
keys, datasets = zip(*datasets)
elif isinstance(datasets, list) and not any(isinstance(v, tuple) for v in datasets):
keys = [()]*len(datasets)
Contributor:
What are all these empty tuple keys for? Just to get things in the right format?

Member Author:
Right, concatenate is usually meant for concatenating along some dimension, but you can also concatenate a simple list of datasets without concatenating along any dimension. For that case we generate empty tuple keys. Happy to add a comment. Separately, I also need to assert that this only happens for tabular data, since gridded data must be concatenated along some dimension.
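A small sketch of the list form (tabular data only, as noted; values are made up):

    import holoviews as hv
    from holoviews.core.data.interface import Interface

    ds1 = hv.Dataset({'x': [0, 1], 'y': [1.0, 2.0]}, kdims=['x'], vdims=['y'])
    ds2 = hv.Dataset({'x': [2, 3], 'y': [3.0, 4.0]}, kdims=['x'], vdims=['y'])

    # A plain list carries no key dimensions, so each dataset gets an empty
    # tuple key and the rows are simply appended.
    combined = Interface.concatenate([ds1, ds2])
    print(len(combined))  # 4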

dimensions = []
template = datasets[0]
datatype = datatype or template.interface.datatype

# Handle non-general datatypes by casting to general type
if datatype == 'array':
datatype = default_datatype
elif datatype == 'image':
datatype = 'grid'

datasets = template.interface.cast(datasets, datatype)
template = datasets[0]
data = list(zip(keys, datasets)) if keys else datasets
concat_data = template.interface.concat(data, dimensions, vdims=template.vdims)
return template.clone(concat_data, kdims=dimensions+template.kdims, new_type=new_type)

@classmethod
def reduce(cls, dataset, reduce_dims, function, **kwargs):
19 changes: 18 additions & 1 deletion holoviews/core/data/iris.py
@@ -4,6 +4,9 @@
from itertools import product

import iris
from iris.coords import DimCoord
from iris.cube import CubeList
from iris.experimental.equalise_cubes import equalise_attributes
Contributor:
It would be good to have the iris interface moved to geoviews. Could this be done for 1.10.6?

Member Author:
Tests need to be moved into the holoviews package first.

Contributor:
You mean 'geoviews' package?

Member Author (@philippjfr, Jun 20, 2018):
No, I mean the /tests need to move to /holoviews/tests.

Member Author:
The interface tests are defined as mix-in classes, so if I want to run them in geoviews I have to be able to import them from holoviews. We also promised this to the bokeh folks so they can run our bokeh unit tests easily.

from iris.util import guess_coord_axis

import numpy as np
@@ -230,6 +233,20 @@ def groupby(cls, dataset, dims, container_type=HoloMap, group_type=None, **kwarg
else:
return container_type(data)

@classmethod
def concat_dim(cls, datasets, dim, vdims):
"""
Concatenates datasets along one dimension
"""
cubes = []
for c, cube in datasets.items():
cube = cube.copy()
cube.add_aux_coord(DimCoord([c], var_name=dim.name))
cubes.append(cube)
cubes = CubeList(cubes)
equalise_attributes(cubes)
return cubes.merge_cube()


@classmethod
def range(cls, dataset, dimension):
@@ -261,7 +278,7 @@ def length(cls, dataset):
"""
Returns the total number of samples in the dataset.
"""
return np.product([len(d.points) for d in dataset.data.coords()])
return np.product([len(d.points) for d in dataset.data.coords(dim_coords=True)])


@classmethod
11 changes: 8 additions & 3 deletions holoviews/core/data/pandas.py
@@ -169,9 +169,14 @@ def range(cls, columns, dimension):


@classmethod
def concat(cls, columns_objs):
cast_objs = cls.cast(columns_objs)
return pd.concat([col.data for col in cast_objs])
def concat(cls, datasets, dimensions, vdims):
dataframes = []
for key, ds in datasets:
data = ds.data.copy()
for d, k in zip(dimensions, key):
data[d.name] = k
dataframes.append(data)
return pd.concat(dataframes)


@classmethod
9 changes: 3 additions & 6 deletions holoviews/core/data/xarray.py
@@ -360,13 +360,10 @@ def ndloc(cls, dataset, indices):
else:
return dataset.data.isel(**isel)


@classmethod
def concat(cls, dataset_objs):
#cast_objs = cls.cast(dataset_objs)
# Reimplement concat to automatically add dimensions
# once multi-dimensional concat has been added to xarray.
return xr.concat([col.data for col in dataset_objs], dim='concat_dim')
def concat_dim(cls, datasets, dim, vdims):
return xr.concat([ds.assign_coords(**{dim.name: c}) for c, ds in datasets.items()],
dim=dim.name)

@classmethod
def redim(cls, dataset, dimensions):