Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add .iloc and .ndloc integer indexing methods for Datasets #1435

Merged
merged 20 commits into from
Jun 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/Tutorials/Introduction.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@
"source": [
"print(rgb_parrot)\n",
"print(rgb_parrot[0,0])\n",
"print(rgb_parrot[0,0][0])"
"print(rgb_parrot[0,0].iloc[0, 0])"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was left over from when indexing an RGB returned a tuple, so I ended up updating it. I suppose rgb_parrot[0, 0, 'R'] would have been clearer, but we're probably throwing this notebook out anyway, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right.

]
},
{
Expand Down
6 changes: 3 additions & 3 deletions examples/streams/bokeh/point_selection1D.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@
"\n",
"# Write function that uses the selection indices to slice points and compute stats\n",
"def selected_info(index):\n",
" arr = points.array()[index]\n",
" selected = points.iloc[index]\n",
" if index:\n",
" label = 'Mean x, y: %.3f, %.3f' % tuple(arr.mean(axis=0))\n",
" label = 'Mean x, y: %.3f, %.3f' % tuple(selected.array().mean(axis=0))\n",
" else:\n",
" label = 'No selection'\n",
" return points.clone(arr, label=label)(style=dict(color='red'))\n",
" return selected.relabel(label)(style=dict(color='red'))\n",
"\n",
"# Combine points and DynamicMap\n",
"points + hv.DynamicMap(selected_info, streams=[selection])"
Expand Down
59 changes: 57 additions & 2 deletions holoviews/core/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import param

from ..dimension import redim
from .interface import Interface
from .interface import Interface, iloc, ndloc
from .array import ArrayInterface
from .dictionary import DictInterface
from .grid import GridInterface
Expand Down Expand Up @@ -433,7 +433,9 @@ def sample(self, samples=[], closest=True, **kwargs):
else:
selection = tuple(selection.columns(kdims+self.vdims).values())

return self.clone(selection, kdims=kdims, new_type=new_type)
datatype = list(util.unique_iterator(self.datatype+['dataframe', 'dict']))
return self.clone(selection, kdims=kdims, new_type=new_type,
datatype=datatype)

lens = set(len(util.wrap_tuple(s)) for s in samples)
if len(lens) > 1:
Expand Down Expand Up @@ -624,6 +626,59 @@ def to(self):
return self._conversion_interface(self)


@property
def iloc(self):
    """
    Integer-location indexer for the Dataset. The returned object
    is indexed with row and column positions and accepts single
    integers, slices, lists of integers and boolean arrays.

    Examples:

    * Value in the first row of the first column:

        dataset.iloc[0, 0]

    * Rows 1 and 2 (slice end is exclusive) across all columns:

        dataset.iloc[1:3, :]

    * Rows picked out by a list of integer positions:

        dataset.iloc[[0, 2, 3]]
    """
    indexer = iloc(self)
    return indexer


@property
def ndloc(self):
    """
    N-dimensional integer indexer for gridded datasets. The
    returned object is indexed like a NumPy array, supporting
    scalar indices, slices and lists of indices. Indices are
    given in the reverse order of the key dimensions, so an Image
    with key dimensions 'x' and 'y' is indexed as
    ``image.ndloc[iy, ix]`` with ``iy`` and ``ix`` being integer
    positions along the y- and x-axes respectively.

    Examples:

    * Single value of a 2D array:

        dataset.ndloc[3, 1]

    * Slice along the y-axis of a 2D array:

        dataset.ndloc[2:5, :]

    * Vectorized (non-orthogonal) indexing along both axes:

        dataset.ndloc[[1, 2, 3], [0, 2, 3]]
    """
    indexer = ndloc(self)
    return indexer


# Aliases for pickle backward compatibility
Columns = Dataset
Expand Down
30 changes: 25 additions & 5 deletions holoviews/core/data/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,19 @@ def init(cls, eltype, data, kdims, vdims):
except:
data = None

if kdims is None:
kdims = eltype.kdims
if vdims is None:
vdims = eltype.vdims

if data is None or data.ndim > 2 or data.dtype.kind in ['S', 'U', 'O']:
raise ValueError("ArrayInterface interface could not handle input type.")
elif data.ndim == 1:
if eltype._auto_indexable_1d:
if eltype._auto_indexable_1d and len(kdims)+len(vdims)>1:
data = np.column_stack([np.arange(len(data)), data])
else:
data = np.atleast_2d(data).T

if kdims is None:
kdims = eltype.kdims
if vdims is None:
vdims = eltype.vdims
return data, {'kdims':kdims, 'vdims':vdims}, {}

@classmethod
Expand Down Expand Up @@ -232,4 +233,23 @@ def aggregate(cls, dataset, dimensions, function, **kwargs):
return np.atleast_2d(rows)


@classmethod
def iloc(cls, dataset, index):
    """
    Integer-location selection on the 2D array held in
    ``dataset.data``. ``index`` is a ``(rows, cols)`` pair; columns
    may be given as a slice, a single dimension (index or name) or
    an iterable of dimensions.

    A scalar row combined with a scalar column returns the single
    array element, every other combination returns a 2D array.
    """
    row_sel, col_sel = index
    single_row = np.isscalar(row_sel)

    if isinstance(col_sel, slice):
        col_idx = col_sel
    elif np.isscalar(col_sel):
        # Dimension names are resolved to their column position
        if isinstance(col_sel, util.basestring):
            col_sel = dataset.get_dimension_index(col_sel)
        if single_row:
            return dataset.data[row_sel, col_sel]
        col_idx = [dataset.get_dimension_index(col_sel)]
    else:
        col_idx = [dataset.get_dimension_index(d) for d in col_sel]

    # Wrapping a scalar row keeps the row axis in the result
    row_idx = [row_sel] if single_row else row_sel
    subset = dataset.data[row_idx, :][:, col_idx]
    return np.atleast_2d(subset).T if subset.ndim == 1 else subset

Interface.register(ArrayInterface)
26 changes: 25 additions & 1 deletion holoviews/core/data/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .. import util
from ..element import Element
from ..ndmapping import NdMapping, item_check
from ..ndmapping import NdMapping, item_check, OrderedDict
from .interface import Interface
from .pandas import PandasInterface

Expand Down Expand Up @@ -241,6 +241,30 @@ def dframe(cls, columns, dimensions):
def nonzero(cls, dataset):
return True

@classmethod
def iloc(cls, dataset, index):
    """
    Dask does not support iloc, therefore iloc will execute
    the call graph and lose the laziness of the operation.

    ``index`` is a ``(rows, cols)`` pair. Columns may be a slice
    over the dataset dimensions, a single dimension or an iterable
    of dimensions; they are looked up with ``strict=True`` in the
    same way as the other interfaces so that an unknown column is
    reported by the dimension lookup itself.

    Returns a single value when both rows and cols are scalar,
    otherwise a tuple holding one array of values per selected
    column.
    """
    # NOTE(review): a reviewer suggested optionally issuing a
    # performance warning when laziness is lost; not done here.
    rows, cols = index
    scalar = False
    if isinstance(cols, slice):
        cols = [d.name for d in dataset.dimensions()][cols]
    elif np.isscalar(cols):
        scalar = np.isscalar(rows)
        cols = [dataset.get_dimension(cols, strict=True).name]
    else:
        cols = [dataset.get_dimension(d, strict=True).name for d in cols]
    if np.isscalar(rows):
        # A list index keeps the result array-like for a single row
        rows = [rows]

    data = OrderedDict()
    for c in cols:
        # compute() materializes the column so pandas iloc can be used
        data[c] = dataset.data[c].compute().iloc[rows].values
    if scalar:
        return data[cols[0]][0]
    return tuple(data.values())


Interface.register(DaskInterface)
27 changes: 26 additions & 1 deletion holoviews/core/data/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def init(cls, eltype, data, kdims, vdims):
data = {d: data[d] for d in dimensions}
elif isinstance(data, np.ndarray):
if data.ndim == 1:
if eltype._auto_indexable_1d:
if eltype._auto_indexable_1d and len(kdims)+len(vdims)>1:
data = np.column_stack([np.arange(len(data)), data])
else:
data = np.atleast_2d(data).T
Expand Down Expand Up @@ -261,4 +261,29 @@ def aggregate(cls, dataset, kdims, function, **kwargs):
return aggregated


@classmethod
def iloc(cls, dataset, index):
    """
    Integer-location selection on dictionary based data. ``index``
    is a ``(rows, cols)`` pair where the columns may be a single
    dimension, a slice over the dataset dimensions or an iterable
    of dimensions.

    Returns a single value when both rows and cols are scalar,
    otherwise an OrderedDict of the selected columns indexed by
    the requested rows.
    """
    row_sel, col_sel = index
    single_row = np.isscalar(row_sel)
    single_value = False

    if np.isscalar(col_sel):
        single_value = single_row
        dims = [dataset.get_dimension(col_sel, strict=True)]
    elif isinstance(col_sel, slice):
        dims = dataset.dimensions()[col_sel]
    else:
        dims = [dataset.get_dimension(d, strict=True) for d in col_sel]

    # A scalar row is wrapped so each column stays array-like
    row_idx = [row_sel] if single_row else row_sel

    # Columns are emitted in the order they are stored in the data
    subset = OrderedDict(
        (key, column[row_idx])
        for key, column in dataset.data.items() if key in dims
    )

    if single_value:
        return subset[dims[0].name][0]
    return subset


Interface.register(DictInterface)
72 changes: 71 additions & 1 deletion holoviews/core/data/grid.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from collections import OrderedDict, defaultdict
from collections import OrderedDict, defaultdict, Iterable

try:
import itertools.izip as zip
Expand Down Expand Up @@ -167,6 +167,53 @@ def canonicalize(cls, dataset, data, coord_dims=None):
return data


@classmethod
def invert_index(cls, index, length):
    """
    Mirrors an integer index so that it addresses the same
    elements once an axis with ``length`` entries is reversed,
    i.e. position ``i`` maps to ``length - 1 - i``.

    Supports scalars, slices and iterables of integers; negative
    positions count from the end as usual. A scalar maps to a
    scalar and an iterable to a list in the same order. A slice
    maps to an ascending slice covering the mirrored positions, so
    the selected elements are returned in storage order; open-ended
    bounds and steps are honoured. Any other index type yields
    None.
    """
    def mirror(position):
        # Negative positions already count from the end
        if position < 0:
            return -1 - position
        return length - 1 - position

    if np.isscalar(index):
        return mirror(index)
    elif isinstance(index, slice):
        # slice.indices resolves None and negative bounds
        start, stop, step = index.indices(length)
        positions = range(start, stop, step)
        if len(positions) == 0:
            return slice(0, 0)
        low, high = sorted((positions[0], positions[-1]))
        return slice(length - 1 - high, length - low, abs(step))
    elif isinstance(index, Iterable):
        return [mirror(ind) for ind in index]


@classmethod
def ndloc(cls, dataset, indices):
    """
    Selects from a gridded dataset using array-style integer
    indices. The supplied indices are paired with the key
    dimensions in reverse order (the first index applies to the
    last key dimension).

    Returns a single value when every supplied index is a scalar,
    all key dimensions are indexed and there is exactly one value
    dimension; otherwise a tuple with one array per dimension in
    the order of ``dataset.dimensions()``.
    """
    selected = {}
    adjusted_inds = []
    all_scalar = True
    for kd, ind in zip(dataset.kdims[::-1], indices):
        coords = cls.coords(dataset, kd.name)
        # Strictly descending coordinates: mirror the index
        # NOTE(review): the adjusted index is applied both to these
        # coords and to dimension_values(flat=False) below; confirm
        # that both are in the same orientation.
        if np.all(coords[1:] < coords[:-1]):
            ind = cls.invert_index(ind, len(coords))
        # Scalars are wrapped so the indexed axis is not dropped
        if np.isscalar(ind):
            ind = [ind]
        else:
            all_scalar = False
        selected[kd.name] = coords[ind]
        adjusted_inds.append(ind)
    # Key dimensions without a matching index keep all coordinates
    for kd in dataset.kdims:
        if kd.name not in selected:
            coords = cls.coords(dataset, kd.name)
            selected[kd.name] = coords
            all_scalar = False
    for vd in dataset.vdims:
        arr = dataset.dimension_values(vd, flat=False)
        # Fully scalar index on a single value dimension: unwrap
        # the one-element lists and return the value itself
        if all_scalar and len(dataset.vdims) == 1:
            return arr[tuple(ind[0] for ind in adjusted_inds)]
        selected[vd.name] = arr[tuple(adjusted_inds)]
    return tuple(selected[d.name] for d in dataset.dimensions())


@classmethod
def values(cls, dataset, dim, expanded=True, flat=True):
dim = dataset.get_dimension(dim, strict=True)
Expand Down Expand Up @@ -391,5 +438,28 @@ def sort(cls, dataset, by=[]):
raise Exception('Compressed format cannot be sorted, either instantiate '
'in the desired order or use the expanded format.')

@classmethod
def iloc(cls, dataset, index):
    """
    Integer-location selection on the values returned by
    ``dataset.dimension_values``. ``index`` is a ``(rows, cols)``
    pair where the columns may be a single dimension, a slice over
    the dataset dimensions or an iterable of dimensions.

    Returns a single value when both rows and cols are scalar,
    otherwise a tuple with one array per selected column.
    """
    row_sel, col_sel = index
    single_row = np.isscalar(row_sel)
    single_value = False

    if np.isscalar(col_sel):
        single_value = single_row
        dims = [dataset.get_dimension(col_sel, strict=True)]
    elif isinstance(col_sel, slice):
        dims = dataset.dimensions()[col_sel]
    else:
        dims = [dataset.get_dimension(d, strict=True) for d in col_sel]

    # A scalar row is wrapped so each column stays array-like
    row_idx = [row_sel] if single_row else row_sel

    columns = tuple(dataset.dimension_values(d)[row_idx] for d in dims)
    return columns[0][0] if single_value else columns


Interface.register(GridInterface)
7 changes: 7 additions & 0 deletions holoviews/core/data/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ def reindex(cls, dataset, kdims=None, vdims=None):
return data[..., inds] if len(inds) > 1 else data[..., inds[0]]
return data

@classmethod
def coords(cls, dataset, dim, ordered=False, expanded=False):
    """
    Returns the coordinates along a dimension. With ``expanded``
    the coordinates are expanded via ``util.expand_grid_coords``,
    otherwise the non-expanded values of the dimension are
    returned. The ``ordered`` argument is accepted but not used
    here.
    """
    dimension = dataset.get_dimension(dim, strict=True)
    if not expanded:
        return cls.values(dataset, dimension, expanded=False)
    return util.expand_grid_coords(dataset, dimension)

@classmethod
def range(cls, obj, dim):
dim_idx = obj.get_dimension_index(dim)
Expand Down
Loading