Implement grid concatenation and standardize datatype casting #2762
holoviews/core/data/__init__.py

@@ -17,12 +17,14 @@
 from .multipath import MultiInterface # noqa (API import)
 from .image import ImageInterface # noqa (API import)

+default_datatype = 'dictionary'
 datatypes = ['dictionary', 'grid']

 try:
     import pandas as pd # noqa (Availability import)
     from .pandas import PandasInterface
-    datatypes = ['dataframe', 'dictionary', 'grid', 'array']
+    default_datatype = 'dataframe'
+    datatypes = ['dataframe', 'dictionary', 'grid']
     DFColumns = PandasInterface
 except ImportError:
     pd = None
@@ -64,6 +66,27 @@
 from .. import util


+def concat(datasets, datatype=None):
+    """
+    Concatenates multiple datasets wrapped in an NdMapping type
+    along all of its dimensions. Before concatenation all datasets
+    are cast to the same datatype. For columnar data concatenation
+    adds the columns for the dimensions being concatenated along
+    and then concatenates all the old and new columns. For gridded
+    data a new axis is created for each dimension being concatenated
+    along and then hierarchically concatenates along each dimension.
+
+    Signature
+    ---------
+
+    datasets: NdMapping of Datasets defining dimensions to concatenate on
+    datatype: Datatype to cast data to before concatenation
+
+    Returns: Dataset
+    """
+    return Interface.concatenate(datasets, datatype)
+
+
 class DataConversion(object):
     """
     DataConversion is a very simple container object which can be

Review comment (on the docstring): 'same datatype' determined how?
Reply: Either explicitly defined or the type of the first dataset that was passed in.
Review comment: Would be good to state that bit about it being chosen from the first one if not explicitly set.
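For context, a minimal usage sketch of the new utility (the HoloMap, its 'Time' dimension and the random data are illustrative assumptions, not part of the PR):

import numpy as np
import holoviews as hv
from holoviews.core.data import concat

# Two tabular datasets keyed by 'Time' values in a HoloMap (an NdMapping type)
datasets = hv.HoloMap({t: hv.Dataset((np.arange(3), np.random.rand(3)),
                                     kdims=['x'], vdims=['y'])
                       for t in [0, 1]}, kdims=['Time'])

# Casts all datasets to the datatype of the first (or the explicit datatype
# argument), adds a 'Time' column from the HoloMap keys and appends the rows
combined = concat(datasets)
print(combined.kdims)  # [Dimension('Time'), Dimension('x')]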
@@ -291,7 +314,7 @@ def add_dimension(self, dimension, dim_pos, dim_val, vdim=False, **kwargs): | |
dimensions = dict(kdims=dims) | ||
|
||
if issubclass(self.interface, ArrayInterface) and np.asarray(dim_val).dtype != self.data.dtype: | ||
element = self.clone(datatype=['pandas', 'dictionary']) | ||
element = self.clone(datatype=[default_datatype]) | ||
data = element.interface.add_dimension(element, dimension, dim_pos, dim_val, vdim) | ||
else: | ||
data = self.interface.add_dimension(self, dimension, dim_pos, dim_val, vdim) | ||
|
holoviews/core/data/grid.py

@@ -17,13 +17,15 @@
 except ImportError:
     da = None

+def is_dask(array):
+    return da and isinstance(array, da.Array)
+
 from .dictionary import DictInterface
 from .interface import Interface, DataError
 from ..dimension import Dimension
 from ..element import Element
 from ..dimension import OrderedDict as cyODict
-from ..ndmapping import NdMapping, item_check
+from ..ndmapping import NdMapping, item_check, sorted_context
 from .. import util
@@ -116,14 +118,47 @@ def init(cls, eltype, data, kdims, vdims): | |
return data, {'kdims':kdims, 'vdims':vdims}, {} | ||
|
||
|
||
@classmethod | ||
def concat(cls, datasets, dimensions, vdims): | ||
from . import Dataset | ||
with sorted_context(False): | ||
datasets = NdMapping(datasets, kdims=dimensions) | ||
datasets = datasets.clone([(k, v.data if isinstance(v, Dataset) else v) | ||
for k, v in datasets.data.items()]) | ||
if len(datasets.kdims) > 1: | ||
items = datasets.groupby(datasets.kdims[:-1]).data.items() | ||
return cls.concat([(k, cls.concat(v, v.kdims, vdims=vdims)) for k, v in items], | ||
datasets.kdims[:-1], vdims) | ||
return cls.concat_dim(datasets, datasets.kdims[0], vdims) | ||
|
||
|
||
@classmethod | ||
def concat_dim(cls, datasets, dim, vdims): | ||
values, grids = zip(*datasets.items()) | ||
new_data = {k: v for k, v in grids[0].items() if k not in vdims} | ||
new_data[dim.name] = np.array(values) | ||
for vdim in vdims: | ||
arrays = [grid[vdim.name] for grid in grids] | ||
shapes = set(arr.shape for arr in arrays) | ||
if len(shapes) > 1: | ||
raise DataError('When concatenating gridded data the shape ' | ||
'of arrays must match. %s found that arrays ' | ||
'along the %s dimension do not match.' % | ||
(cls.__name__, vdim.name)) | ||
stack = np.stack if any(is_dask(arr) for arr in arrays) else da.stack | ||
new_data[vdim.name] = stack(arrays, -1) | ||
return new_data | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since arrays cannot be concatenated along multiple axes at once the implementation of |
||
|
||
|
||
@classmethod | ||
def irregular(cls, dataset, dim): | ||
return dataset.data[dim.name if isinstance(dim, Dimension) else dim].ndim > 1 | ||
|
||
|
||
@classmethod | ||
def isscalar(cls, dataset, dim): | ||
return np.unique(cls.values(dataset, dim, expanded=False)) == 1 | ||
values = cls.values(dataset, dim, expanded=False) | ||
return values.shape in ((), (1,)) or len(np.unique(values)) == 1 | ||
|
||
|
||
@classmethod | ||
|
@@ -541,9 +576,9 @@ def aggregate(cls, dataset, kdims, function, **kwargs):
         axes = tuple(dataset.ndims-dataset.get_dimension_index(kdim)-1
                      for kdim in dataset.kdims if kdim not in kdims)
         for vdim in dataset.vdims:
-            data[vdim.name] = np.atleast_1d(function(dataset.data[vdim.name],
-                                                     axis=axes, **kwargs))
-
+            values = dataset.data[vdim.name]
+            atleast_1d = da.atleast_1d if is_dask(values) else np.atleast_1d
+            data[vdim.name] = atleast_1d(function(values, axis=axes, **kwargs))
         return data
holoviews/core/data/interface.py

@@ -4,7 +4,7 @@
 import numpy as np

 from ..element import Element
-from ..ndmapping import OrderedDict
+from ..ndmapping import OrderedDict, NdMapping
 from .. import util
@@ -107,30 +107,20 @@ class Interface(param.Parameterized):
     def register(cls, interface):
         cls.interfaces[interface.datatype] = interface

     @classmethod
-    def cast(cls, dataset, datatype=None, cast_type=None):
+    def cast(cls, datasets, datatype=None, cast_type=None):
         """
         Given a list of Dataset objects, cast them to the specified
         datatype (by default the format matching the current interface)
         with the given cast_type (if specified).
         """
-        if len({type(c) for c in dataset}) > 1 and cast_type is None:
-            raise Exception("Please supply the common cast type")
-
-        if datatype is None:
-            datatype = cls.datatype
-
-        unchanged = all({c.interface==cls for c in dataset})
-        if unchanged and cast_type is None:
-            return dataset
-        elif unchanged:
-            return [cast_type(co, **dict(util.get_param_values(co)))
-                    for co in dataset]
-
-        return [co.clone(co.columns(), datatype=[datatype], new_type=cast_type)
-                for co in dataset]
+        datatype = datatype or cls.datatype
+        cast = []
+        for ds in datasets:
+            if cast_type is not None or ds.interface.datatype != datatype:
+                ds = ds.clone(ds, datatype=[datatype], new_type=cast_type)
+            cast.append(ds)
+        return cast

     @classmethod
     def error(cls):

Review comment: Casting works quite simply: if the dataset's datatype does not match the requested one, or an explicit cast_type is supplied, the dataset is cloned with the new datatype and type; otherwise it is passed through unchanged.
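A short sketch of this simplified behaviour (the example datasets are assumptions; the datatype names follow the interfaces registered in holoviews.core.data):

import pandas as pd
import holoviews as hv
from holoviews.core.data.interface import Interface

dict_ds = hv.Dataset({'x': [0, 1], 'y': [1.0, 2.0]}, kdims=['x'], vdims=['y'])
df_ds = hv.Dataset(pd.DataFrame({'x': [2, 3], 'y': [3.0, 4.0]}),
                   kdims=['x'], vdims=['y'])

# dict_ds already matches the requested datatype and is returned as-is,
# while the dataframe-backed dataset is cloned to the dictionary format
cast = Interface.cast([dict_ds, df_ds], datatype='dictionary')
print([ds.interface.datatype for ds in cast])  # ['dictionary', 'dictionary']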
@@ -167,7 +157,7 @@ def initialize(cls, eltype, data, kdims, vdims, datatype=None):

         if data.interface.datatype in datatype and data.interface.datatype in eltype.datatype:
             data = data.data
-        elif data.interface.gridded:
+        elif data.interface.gridded and any(cls.interfaces[dt].gridded for dt in datatype):
             gridded = OrderedDict([(kd.name, data.dimension_values(kd.name, expanded=False))
                                    for kd in data.kdims])
             for vd in data.vdims:
@@ -307,25 +297,35 @@ def range(cls, dataset, dimension):
         return column[0], column[-1]

     @classmethod
-    def concatenate(cls, dataset, datatype=None):
+    def concatenate(cls, datasets, datatype=None, new_type=None):
         """
-        Utility function to concatenate a list of Column objects,
-        returning a new Dataset object. Note that this is unlike the
-        .concat method which only concatenates the data.
+        Utility function to concatenate an NdMapping of Dataset objects.
         """
-        if len(set(type(c) for c in dataset)) != 1:
-            raise Exception("All inputs must be same type in order to concatenate")
-
-        interfaces = set(c.interface for c in dataset)
-        if len(interfaces)!=1 and datatype is None:
-            raise Exception("Please specify the concatenated datatype")
-        elif len(interfaces)!=1:
-            interface = cls.interfaces[datatype]
-        else:
-            interface = interfaces.pop()
-
-        concat_data = interface.concat(dataset)
-        return dataset[0].clone(concat_data)
+        from . import Dataset, default_datatype
+        new_type = new_type or Dataset
+        if isinstance(datasets, NdMapping):
+            dimensions = datasets.kdims
+            datasets = datasets.data
+        if isinstance(datasets, (dict, OrderedDict)):
+            datasets = datasets.items()
+            keys, datasets = zip(*datasets)
+        elif isinstance(datasets, list) and not any(isinstance(v, tuple) for v in datasets):
+            keys = [()]*len(datasets)
+            dimensions = []
+        template = datasets[0]
+        datatype = datatype or template.interface.datatype
+
+        # Handle non-general datatypes by casting to general type
+        if datatype == 'array':
+            datatype = default_datatype
+        elif datatype == 'image':
+            datatype = 'grid'
+
+        datasets = template.interface.cast(datasets, datatype)
+        template = datasets[0]
+        data = list(zip(keys, datasets)) if keys else datasets
+        concat_data = template.interface.concat(data, dimensions, vdims=template.vdims)
+        return template.clone(concat_data, kdims=dimensions+template.kdims, new_type=new_type)

     @classmethod
     def reduce(cls, dataset, reduce_dims, function, **kwargs):

Review comment (on the empty-tuple keys): What are all these empty tuple keys for? Just to get things in the right format?
Reply: Right, concatenate is usually meant for concatenating along some dimension, but you can also concatenate a simple list of datasets without concatenating along any dimension. For that case we generate empty tuple keys. Happy to add a comment. Separately I also need to assert that this only happens for tabular data, since gridded data must be concatenated along some dimension.
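A sketch of the plain-list case discussed above (tabular data only; the datasets are illustrative):

import holoviews as hv
from holoviews.core.data.interface import Interface

ds1 = hv.Dataset({'x': [0, 1], 'y': [1.0, 2.0]}, kdims=['x'], vdims=['y'])
ds2 = hv.Dataset({'x': [2, 3], 'y': [3.0, 4.0]}, kdims=['x'], vdims=['y'])

# No NdMapping and no keys are supplied, so empty-tuple keys are
# generated, no new dimension is added and the rows are simply appended
combined = Interface.concatenate([ds1, ds2])
print(len(combined))  # 4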
holoviews/core/data/iris.py

@@ -4,6 +4,9 @@
 from itertools import product

 import iris
+from iris.coords import DimCoord
+from iris.cube import CubeList
+from iris.experimental.equalise_cubes import equalise_attributes
 from iris.util import guess_coord_axis

 import numpy as np

Review comment: Will be good to have the iris interface moved to geoviews. Could this be done for 1.10.6?
Reply: Tests need to be moved into the holoviews package first.
Review comment: You mean the 'geoviews' package?
Reply: No, I mean the holoviews package. The interface tests are defined as mix-in classes, so if I want to run them in geoviews I have to be able to import them from holoviews. We also promised this to the bokeh folks so they can run our bokeh unit tests easily.
@@ -230,6 +233,20 @@ def groupby(cls, dataset, dims, container_type=HoloMap, group_type=None, **kwargs):
         else:
             return container_type(data)

+    @classmethod
+    def concat_dim(cls, datasets, dim, vdims):
+        """
+        Concatenates datasets along one dimension
+        """
+        cubes = []
+        for c, cube in datasets.items():
+            cube = cube.copy()
+            cube.add_aux_coord(DimCoord([c], var_name=dim.name))
+            cubes.append(cube)
+        cubes = CubeList(cubes)
+        equalise_attributes(cubes)
+        return cubes.merge_cube()

     @classmethod
     def range(cls, dataset, dimension):
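A standalone sketch of the iris merge mechanics used above (the cube name, coordinate names and values are made-up assumptions): each cube receives a scalar auxiliary coordinate holding its key, and after equalising attributes iris merges those scalar coordinates into a new data dimension.

import numpy as np
import iris
from iris.coords import DimCoord
from iris.cube import CubeList
from iris.experimental.equalise_cubes import equalise_attributes

x = DimCoord(np.arange(3, dtype=float), var_name='x')
cubes = []
for t in [0, 1]:
    cube = iris.cube.Cube(np.random.rand(3), var_name='z',
                          dim_coords_and_dims=[(x.copy(), 0)])
    # Scalar coordinate distinguishing the cubes, as in concat_dim above
    cube.add_aux_coord(DimCoord([t], var_name='time'))
    cubes.append(cube)

cubes = CubeList(cubes)
equalise_attributes(cubes)
merged = cubes.merge_cube()  # gains a 'time' dimension of length 2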
@@ -261,7 +278,7 @@ def length(cls, dataset):
         """
         Returns the total number of samples in the dataset.
         """
-        return np.product([len(d.points) for d in dataset.data.coords()])
+        return np.product([len(d.points) for d in dataset.data.coords(dim_coords=True)])

     @classmethod
Review comment (on default_datatype): When converting from gridded to columnar data, the code usually has to cast the data to a specific datatype. Various places hardcoded ['pandas', 'dictionary'] for this; defining a default_datatype avoids having to hardcode it all over the place.
Review comment: Shouldn't this be 'default_columnar_datatype', then? Or are there no cases where columnar data needs to be cast into some gridded datatype?
Reply: Columnar data cannot be cast to gridded data without some kind of aggregation occurring, so that's correct. Would still be okay with changing it to default_columnar_datatype.
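A sketch of the resulting pattern (the gridded dataset here is an illustrative assumption): code paths that need a columnar copy of gridded data can clone with the shared default instead of a hardcoded datatype list.

import numpy as np
import holoviews as hv
from holoviews.core.data import default_datatype

grid = hv.Dataset((np.arange(3), np.arange(2), np.random.rand(2, 3)),
                  kdims=['x', 'y'], vdims=['z'])        # gridded data
columnar = grid.clone(datatype=[default_datatype])      # 'dataframe' if pandas is installed
print(columnar.interface.datatype)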