diff --git a/datashader/__init__.py b/datashader/__init__.py index 017672043..da620d0a9 100644 --- a/datashader/__init__.py +++ b/datashader/__init__.py @@ -1,5 +1,7 @@ from __future__ import absolute_import +from distutils.version import LooseVersion + import param __version__ = str(param.version.Version(fpath=__file__, archive_commit="$Format:%h$",reponame="datashader")) @@ -15,6 +17,12 @@ except ImportError: pass +# Make RaggedArray pandas extension array available for +# pandas >= 0.24.0 is installed +from pandas import __version__ as pandas_version +if LooseVersion(pandas_version) >= LooseVersion('0.24.0'): + from . import datatypes # noqa (API import) + # make pyct's example/data commands available if possible from functools import partial try: diff --git a/datashader/core.py b/datashader/core.py index fa1f2f8c3..337d0e62c 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -210,6 +210,7 @@ def line(self, source, x, y, agg=None, axis=0): Define a canvas and a pandas DataFrame with 6 rows >>> import pandas as pd # doctest: +SKIP ... import numpy as np + ... import datashader as ds ... from datashader import Canvas ... import datashader.transfer_functions as tf ... cvs = Canvas() @@ -222,23 +223,23 @@ def line(self, source, x, y, agg=None, axis=0): Aggregate one line across all rows, with coordinates df.A1 by df.B1 >>> agg = cvs.line(df, x='A1', y='B1', axis=0) # doctest: +SKIP - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) Aggregate two lines across all rows. The first with coordinates df.A1 by df.B1 and the second with coordinates df.A2 by df.B2 >>> agg = cvs.line(df, x=['A1', 'A2'], y=['B1', 'B2'], axis=0) # doctest: +SKIP - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) Aggregate two lines across all rows where the lines share the same x coordinates. The first line will have coordinates df.A1 by df.B1 and the second will have coordinates df.A1 by df.B2 >>> agg = cvs.line(df, x='A1', y=['B1', 'B2'], axis=0) # doctest: +SKIP - ... 
tf.shade(agg) + ... tf.spread(tf.shade(agg)) Aggregate 6 length-2 lines, one per row, where the ith line has coordinates [df.A1[i], df.A2[i]] by [df.B1[i], df.B2[i]] >>> agg = cvs.line(df, x=['A1', 'A2'], y=['B1', 'B2'], axis=1) # doctest: +SKIP - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) Aggregate 6 length-4 lines, one per row, where the x coordinates of every line are [0, 1, 2, 3] and the y coordinates of the ith line @@ -247,10 +248,32 @@ def line(self, source, x, y, agg=None, axis=0): ... x=np.arange(4), ... y=['A1', 'A2', 'B1', 'B2'], ... axis=1) - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) + + Aggregate RaggedArrays of variable length lines, one per row + (requires pandas >= 0.24.0) + >>> df_ragged = pd.DataFrame({ # doctest: +SKIP + ... 'A1': pd.array([ + ... [1, 1.5], [2, 2.5, 3], [1.5, 2, 3, 4], [3.2, 4, 5]], + ... dtype='Ragged[float32]'), + ... 'B1': pd.array([ + ... [10, 12], [11, 14, 13], [10, 7, 9, 10], [7, 8, 12]], + ... dtype='Ragged[float32]'), + ... 'group': pd.Categorical([0, 1, 2, 1]) + ... }) + ... + ... agg = cvs.line(df_ragged, x='A1', y='B1', axis=1) + ... tf.spread(tf.shade(agg)) + + Aggregate RaggedArrays of variable length lines by group column, + one per row (requires pandas >= 0.24.0) + >>> agg = cvs.line(df_ragged, x='A1', y='B1', # doctest: +SKIP + ... agg=ds.count_cat('group'), axis=1) + ... 
def _validate_ragged_properties(start_indices, flat_array):
    """
    Validate that start_indices and flat_array are arrays that may be used
    to represent a valid RaggedArray.

    Parameters
    ----------
    flat_array: numpy array containing concatenation
                of all nested arrays to be represented
                by this ragged array
    start_indices: unsigned integer numpy array the same
                   length as the ragged array where values
                   represent the index into flat_array where
                   the corresponding ragged array element
                   begins

    Raises
    ------
    ValueError:
        if input arguments are invalid or incompatible properties
    """

    # Validate start_indices
    if (not isinstance(start_indices, np.ndarray) or
            start_indices.dtype.kind != 'u' or
            start_indices.ndim != 1):
        raise ValueError("""
The start_indices property of a RaggedArray must be a 1D numpy array of
unsigned integers (start_indices.dtype.kind == 'u')
    Received value of type {typ}: {v}""".format(
            typ=type(start_indices), v=repr(start_indices)))

    # Validate flat_array
    if (not isinstance(flat_array, np.ndarray) or
            flat_array.ndim != 1):
        raise ValueError("""
The flat_array property of a RaggedArray must be a 1D numpy array
    Received value of type {typ}: {v}""".format(
            typ=type(flat_array), v=repr(flat_array)))

    # Validate start_indices values
    # We don't need to check start_indices < 0 because we already know that it
    # has an unsigned integer datatype
    #
    # Note that start_indices[i] == len(flat_array) is valid as it represents
    # an empty array element at the end of the ragged array.
    invalid_inds = start_indices > len(flat_array)

    if invalid_inds.any():
        # Select the offending values first, then truncate. Truncating the
        # boolean mask itself would make its length disagree with
        # start_indices and raise IndexError instead of the intended error.
        some_invalid_vals = start_indices[invalid_inds][:10]

        raise ValueError("""
Elements of start_indices must be less than or equal to the length of flat_array ({m})
    Invalid values include: {vals}""".format(
            m=len(flat_array), vals=repr(some_invalid_vals)))
@register_extension_dtype
class RaggedDtype(ExtensionDtype):
    """
    Pandas ExtensionDtype to represent a ragged array datatype

    Methods not otherwise documented here are inherited from ExtensionDtype;
    please see the corresponding method on that class for the docstring
    """
    type = np.ndarray
    base = np.dtype('O')
    # The named group is required: _parse_subtype reads it back by name.
    _subtype_re = re.compile(r"^ragged\[(?P<subtype>\w+)\]$")
    _metadata = ('_dtype',)

    @property
    def name(self):
        return 'Ragged[{subtype}]'.format(subtype=self.subtype)

    def __repr__(self):
        return self.name

    @classmethod
    def construct_array_type(cls):
        return RaggedArray

    @classmethod
    def construct_from_string(cls, string):
        """
        Build a RaggedDtype from a string such as 'Ragged[float32]' or
        'ragged' (matching is case insensitive).

        Raises
        ------
        TypeError
            If `string` is not a string, does not start with 'ragged', or
            its subtype cannot be parsed / converted to a numpy dtype
        """
        msg = "Cannot construct a 'RaggedDtype' from '{}'"

        # Non-string input must surface as TypeError, not AttributeError
        try:
            lowered = string.lower()
        except AttributeError:
            raise TypeError(msg.format(string))

        if not lowered.startswith('ragged'):
            raise TypeError(msg.format(lowered))

        try:
            return cls(dtype=cls._parse_subtype(lowered))
        except Exception:
            # Any parse or dtype-construction failure is reported uniformly
            raise TypeError(msg.format(lowered))

    def __init__(self, dtype=np.float64):
        """
        Parameters
        ----------
        dtype: RaggedDtype or anything accepted by np.dtype
            Element type of the ragged array (default float64)
        """
        if isinstance(dtype, RaggedDtype):
            self._dtype = dtype.subtype
        else:
            self._dtype = np.dtype(dtype)

    @property
    def subtype(self):
        """numpy dtype of the values stored in each ragged element"""
        return self._dtype

    @classmethod
    def _parse_subtype(cls, dtype_string):
        """
        Parse a datatype string to get the subtype

        Parameters
        ----------
        dtype_string: str
            A string like Ragged[subtype]

        Returns
        -------
        subtype: str

        Raises
        ------
        ValueError
            When the subtype cannot be extracted
        """
        # Be case insensitive
        dtype_string = dtype_string.lower()

        match = cls._subtype_re.match(dtype_string)
        if match:
            return match.group('subtype')
        if dtype_string == 'ragged':
            # A bare 'ragged' defaults to float64
            return 'float64'
        raise ValueError("Cannot parse {dtype_string}".format(
            dtype_string=dtype_string))
+ copy : bool (default False) + Whether to deep copy the input arrays. Only relevant when `data` + has type `dict` or `RaggedArray`. When data is a `list` or + `array`, input arrays are always copied. + """ + if (isinstance(data, dict) and + all(k in data for k in + ['start_indices', 'flat_array'])): + + _validate_ragged_properties( + start_indices=data['start_indices'], + flat_array=data['flat_array']) + + self._start_indices = data['start_indices'] + self._flat_array = data['flat_array'] + dtype = self._flat_array.dtype + + if copy: + self._start_indices = self._start_indices.copy() + self._flat_array = self._flat_array.copy() + + elif isinstance(data, RaggedArray): + self._flat_array = data.flat_array + self._start_indices = data.start_indices + dtype = self._flat_array.dtype + + if copy: + self._start_indices = self._start_indices.copy() + self._flat_array = self._flat_array.copy() + else: + # Compute lengths + index_len = len(data) + buffer_len = sum(len(datum) + if not missing(datum) + else 0 for datum in data) + + # Compute necessary precision of start_indices array + for nbits in [8, 16, 32, 64]: + start_indices_dtype = 'uint' + str(nbits) + max_supported = np.iinfo(start_indices_dtype).max + if buffer_len <= max_supported: + break + + # infer dtype if not provided + if dtype is None: + non_missing = [np.atleast_1d(v) + for v in data if not missing(v)] + if non_missing: + dtype = np.result_type(*non_missing) + else: + dtype = 'float64' + elif isinstance(dtype, RaggedDtype): + dtype = dtype.subtype + + # Initialize representation arrays + self._start_indices = np.zeros(index_len, dtype=start_indices_dtype) + self._flat_array = np.zeros(buffer_len, dtype=dtype) + + # Populate arrays + next_start_ind = 0 + for i, array_el in enumerate(data): + # Compute element length + n = len(array_el) if not missing(array_el) else 0 + + # Update start indices + self._start_indices[i] = next_start_ind + + # Update flat array + 
def __eq__(self, other):
    """
    Element-wise equality against another RaggedArray, a single 1D array
    (treated as one ragged element), a 1D object array of elements, or a
    2D array whose rows are elements.

    Returns
    -------
    np.ndarray
        1D bool mask, one entry per element of this RaggedArray

    Raises
    ------
    ValueError
        If `other` has an incompatible length or shape
    """
    starts = self.start_indices
    flat = self.flat_array

    # RaggedArray vs RaggedArray
    if isinstance(other, RaggedArray):
        if len(other) != len(self):
            raise ValueError("""
Cannot check equality of RaggedArray values of unequal length
    len(ra1) == {len_ra1}
    len(ra2) == {len_ra2}""".format(
                len_ra1=len(self),
                len_ra2=len(other)))
        return _eq_ragged_ragged(
            starts, flat, other.start_indices, other.flat_array)

    # Everything else is compared through a numpy array view of `other`
    other_array = other if isinstance(other, np.ndarray) else np.asarray(other)
    holds_objects = other_array.dtype.kind == 'O'

    if other_array.ndim == 1:
        if not holds_objects:
            # Treat as ragged scalar
            return _eq_ragged_scalar(starts, flat, other_array)
        if len(other_array) == len(self):
            # Treat as vector
            return _eq_ragged_ndarray1d(starts, flat, other_array)
    elif (other_array.ndim == 2 and
            not holds_objects and
            other_array.shape[0] == len(self)):
        # Treat rows as ragged elements
        return _eq_ragged_ndarray2d(starts, flat, other_array)

    raise ValueError("""
Cannot check equality of RaggedArray of length {ra_len} with:
    {other}""".format(ra_len=len(self), other=repr(other)))
def __getitem__(self, item):
    """
    Select a single element or a subset of elements.

    Parameters
    ----------
    item: int, slice, 1D boolean ndarray, or list/ndarray of positions

    Returns
    -------
    np.ndarray or float or RaggedArray
        For an integer: the matching slice of flat_array, or np.nan when
        the element has zero length. For every other supported selector:
        a new RaggedArray holding the selected elements.

    Raises
    ------
    IndexError
        If an integer is out of bounds, a boolean mask does not have one
        entry per element, or the selector type is unsupported
    """
    if isinstance(item, Integral):
        n = len(self)
        if item < -n or item >= n:
            raise IndexError("{item} is out of bounds".format(item=item))

        # Convert negative item index
        if item < 0:
            item += n

        slice_start = self.start_indices[item]
        # The last element runs to the end of flat_array
        slice_end = (self.start_indices[item + 1]
                     if item + 1 < n
                     else len(self.flat_array))

        if slice_end == slice_start:
            # Zero-length elements are reported as missing
            return np.nan
        return self.flat_array[slice_start:slice_end]

    if isinstance(item, slice):
        data = [self[i] for i in range(*item.indices(len(self)))]
        return RaggedArray(data, dtype=self.flat_array.dtype)

    if isinstance(item, np.ndarray) and item.dtype == 'bool':
        # A mask of the wrong length would otherwise be silently accepted
        if item.ndim != 1 or len(item) != len(self):
            raise IndexError(
                "boolean index of shape {shape} does not match "
                "RaggedArray of length {n}".format(
                    shape=item.shape, n=len(self)))
        data = [self[i] for i, keep in enumerate(item) if keep]
        return RaggedArray(data, dtype=self.flat_array.dtype)

    if isinstance(item, (list, np.ndarray)):
        return self.take(item, allow_fill=False)

    raise IndexError(item)
ndarray fill values + from pandas.util._validators import validate_fillna_kwargs + from pandas.core.missing import pad_1d, backfill_1d + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if isinstance(value, RaggedArray): + if len(value) != len(self): + raise ValueError("Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self))) + value = value[mask] + + if mask.any(): + if method is not None: + func = pad_1d if method == 'pad' else backfill_1d + new_values = func(self.astype(object), limit=limit, + mask=mask) + new_values = self._from_sequence(new_values, dtype=self.dtype) + else: + # fill with value + new_values = list(self) + mask_indices, = np.where(mask) + for ind in mask_indices: + new_values[ind] = value + + new_values = self._from_sequence(new_values, dtype=self.dtype) + else: + new_values = self.copy() + return new_values + + def shift(self, periods=1, fill_value=None): + # Override in RaggedArray to handle ndarray fill values + + # Note: this implementation assumes that `self.dtype.na_value` can be + # stored in an instance of your ExtensionArray with `self.dtype`. 
@classmethod
def _concat_same_type(cls, to_concat):
    """
    Concatenate several RaggedArrays into one.

    Parameters
    ----------
    to_concat: sequence of RaggedArray

    Returns
    -------
    RaggedArray
        Array whose flat_array is the concatenation of the inputs'
        flat_arrays and whose start_indices are the inputs' start_indices
        shifted by the total flat length of the preceding inputs
    """
    # np.hstack needs a real sequence; a generator is not accepted
    flat_arrays = [ra.flat_array for ra in to_concat]
    flat_array = np.hstack(flat_arrays)

    # Shift each start_indices array by the flat length that precedes it.
    # Do the arithmetic in uint64 so the result can never become a float
    # or signed array, which RaggedArray validation rejects.
    shifted = []
    offset = 0
    for ra, flat in zip(to_concat, flat_arrays):
        shifted.append(ra.start_indices.astype(np.uint64) + np.uint64(offset))
        offset += len(flat)

    # Narrow to the smallest unsigned dtype able to index the whole buffer
    index_dtype = np.min_scalar_type(len(flat_array))
    start_indices = np.hstack(shifted).astype(index_dtype)

    return cls(dict(
        flat_array=flat_array, start_indices=start_indices),
        copy=False)
@jit(nopython=True, nogil=True)
def _eq_ragged_ragged(start_indices1,
                      flat_array1,
                      start_indices2,
                      flat_array2):
    """
    Compare elements of two ragged arrays of the same length

    Parameters
    ----------
    start_indices1: ndarray
        start indices of a RaggedArray 1
    flat_array1: ndarray
        flat_array property of a RaggedArray 1
    start_indices2: ndarray
        start indices of a RaggedArray 2
    flat_array2: ndarray
        flat_array property of a RaggedArray 2

    Returns
    -------
    mask: ndarray
        1D bool array of same length as inputs with elements True when
        corresponding elements are equal, False otherwise
    """
    n = len(start_indices1)
    m1 = len(flat_array1)
    m2 = len(flat_array2)

    result = np.zeros(n, dtype=np.bool_)

    for i in range(n):
        # The final element of each array extends to the end of its buffer
        lo1 = start_indices1[i]
        hi1 = start_indices1[i + 1] if i < n - 1 else m1

        lo2 = start_indices2[i]
        hi2 = start_indices2[i + 1] if i < n - 1 else m2

        # Elements of different length can never be equal
        same = (hi1 - lo1) == (hi2 - lo2)
        if same:
            for k1, k2 in zip(range(lo1, hi1), range(lo2, hi2)):
                if not (flat_array1[k1] == flat_array2[k2]):
                    same = False
                    break

        result[i] = same

    return result
def _eq_ragged_ndarray1d(start_indices, flat_array, a):
    """
    Compare a RaggedArray with a 1D numpy object array of the same length

    Parameters
    ----------
    start_indices: ndarray
        start indices of a RaggedArray
    flat_array: ndarray
        flat_array property of a RaggedArray
    a: ndarray
        1D numpy array of same length as ra

    Returns
    -------
    mask: ndarray
        1D bool array of same length as input with elements True when
        corresponding elements are equal, False otherwise

    Notes
    -----
    This function is not numba accelerated because it, by design, inputs
    a numpy object array
    """
    num_rows = len(start_indices)
    flat_len = len(flat_array)
    mask = np.zeros(num_rows, dtype=np.bool_)

    for row, lo in enumerate(start_indices):
        # The last element runs to the end of flat_array
        hi = start_indices[row + 1] if row < num_rows - 1 else flat_len
        candidate = a[row]

        # None, NaN and zero-length sequences all stand for an empty element
        candidate_is_empty = (
            candidate is None or
            (np.isscalar(candidate) and np.isnan(candidate)) or
            len(candidate) == 0)

        if candidate_is_empty:
            mask[row] = lo == hi
        else:
            mask[row] = np.array_equal(flat_array[lo:hi], candidate)

    return mask
class LinesAxis1Ragged(_PointLike):
    """
    Glyph drawing one variable-length line per row, where the x and y
    columns each hold a RaggedArray.
    """

    def validate(self, in_dshape):
        """
        Check that both the x and the y column have a RaggedDtype.

        Raises
        ------
        ValueError
            If either column is not a RaggedArray
        """
        try:
            from datashader.datatypes import RaggedDtype
        except ImportError:
            # Without the datatypes module nothing can pass the checks below
            RaggedDtype = type(None)

        if not isinstance(in_dshape[str(self.x)], RaggedDtype):
            raise ValueError('x must be a RaggedArray')
        elif not isinstance(in_dshape[str(self.y)], RaggedDtype):
            raise ValueError('y must be a RaggedArray')

    def required_columns(self):
        """Return the names of the two columns this glyph reads."""
        # x and y are single column names here, so return them as two
        # entries rather than concatenating them into one string
        return [self.x, self.y]

    def compute_x_bounds(self, df):
        """Bounds of every x value across all rows (from the flat array)."""
        bounds = self._compute_x_bounds(df[self.x].array.flat_array)
        return self.maybe_expand_bounds(bounds)

    def compute_y_bounds(self, df):
        """Bounds of every y value across all rows (from the flat array)."""
        bounds = self._compute_y_bounds(df[self.y].array.flat_array)
        return self.maybe_expand_bounds(bounds)

    @memoize
    def compute_bounds_dask(self, ddf):
        """
        Compute (x_bounds, y_bounds) for a dask DataFrame by reducing the
        per-partition min/max of the flat arrays.
        """
        # One row [xmin, xmax, ymin, ymax] per partition
        r = ddf.map_partitions(lambda df: np.array([[
            np.nanmin(df[self.x].array.flat_array),
            np.nanmax(df[self.x].array.flat_array),
            np.nanmin(df[self.y].array.flat_array),
            np.nanmax(df[self.y].array.flat_array)]]
        )).compute()

        x_extents = np.nanmin(r[:, 0]), np.nanmax(r[:, 1])
        y_extents = np.nanmin(r[:, 2]), np.nanmax(r[:, 3])

        return (self.maybe_expand_bounds(x_extents),
                self.maybe_expand_bounds(y_extents))

    @memoize
    def _build_extend(self, x_mapper, y_mapper, info, append):
        """Build the `extend` function that draws this glyph's lines."""
        draw_line = _build_draw_line(append)
        map_onto_pixel = _build_map_onto_pixel_for_line(x_mapper, y_mapper)
        extend_lines_ragged = _build_extend_line_axis1_ragged(
            draw_line, map_onto_pixel)
        x_name = self.x
        y_name = self.y

        def extend(aggs, df, vt, bounds, plot_start=True):
            xs = df[x_name].array
            ys = df[y_name].array

            cols = aggs + info(df)
            # line may be clipped, then mapped to pixels
            extend_lines_ragged(vt, bounds, xs, ys, plot_start, *cols)

        return extend
def _build_extend_line_axis1_ragged(draw_line, map_onto_pixel):
    """Specialize a line-extension kernel for ragged, one-line-per-row input.

    The returned ``extend_line(vt, bounds, xs, ys, plot_start,
    *aggs_and_cols)`` reads ``start_indices`` and ``flat_array`` from the
    ragged ``xs`` and ``ys`` arguments and draws every row as its own line.
    """

    # NOTE(review): the original carries a commented-out ``@ngjit`` on this
    # helper, so it currently runs as plain Python - confirm whether it is
    # meant to be compiled.
    def _draw_rows(vt, bounds,
                   x_offsets, x_values,
                   y_offsets, y_values,
                   plot_start, *aggs_and_cols):
        n_lines = len(x_offsets)
        n_x_values = len(x_values)
        n_y_values = len(y_values)
        final_row = n_lines - 1

        for i in range(n_lines):
            # Every row begins a fresh line, whatever was passed in.
            plot_start = True

            # A row's values run from its own offset up to the next row's
            # offset; the final row runs to the end of the flat buffer.
            x_begin = x_offsets[i]
            y_begin = y_offsets[i]
            if i < final_row:
                x_end = x_offsets[i + 1]
                y_end = y_offsets[i + 1]
            else:
                x_end = n_x_values
                y_end = n_y_values

            # Only as many vertices as the shorter of the two rows provides
            n_vertices = min(x_end - x_begin, y_end - y_begin)

            for j in range(n_vertices - 1):
                x0 = x_values[x_begin + j]
                y0 = y_values[y_begin + j]
                x1 = x_values[x_begin + j + 1]
                y1 = y_values[y_begin + j + 1]

                x0, x1, y0, y1, skip, clipped, plot_start = _skip_or_clip(
                    x0, x1, y0, y1, bounds, plot_start)

                if skip:
                    continue

                x0i, y0i = map_onto_pixel(vt, bounds, x0, y0)
                x1i, y1i = map_onto_pixel(vt, bounds, x1, y1)
                draw_line(x0i, y0i, x1i, y1i, i, plot_start, clipped,
                          *aggs_and_cols)
                plot_start = False

    def extend_line(vt, bounds, xs, ys, plot_start, *aggs_and_cols):
        # Unpack the ragged containers into their plain offset/value
        # buffers before handing them to the row-drawing helper.
        _draw_rows(vt, bounds,
                   xs.start_indices, xs.flat_array,
                   ys.start_indices, ys.flat_array,
                   plot_start, *aggs_and_cols)

    return extend_line
dtype.name == 'Ragged[{subtype}]'.format(subtype=dtype.subtype) + assert dtype.kind == 'O' + + +def test_construct_ragged_array(): + rarray = RaggedArray([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]], + dtype='int32') + + # Check flat array + assert rarray.flat_array.dtype == 'int32' + assert np.array_equal( + rarray.flat_array, + np.array([1, 2, 10, 20, 30, 11, 22, 33, 44], dtype='int32')) + + # Check start indices + assert rarray.start_indices.dtype == 'uint8' + assert np.array_equal( + rarray.start_indices, + np.array([0, 2, 2, 5, 5], dtype='uint64')) + + # Check len + assert len(rarray) == 5 + + # Check isna + assert rarray.isna().dtype == 'bool' + assert np.array_equal( + rarray.isna(), [False, True, False, True, False]) + + # Check nbytes + expected = ( + 9 * np.int32().nbytes + # flat_array + 5 * np.uint8().nbytes # start_indices + ) + assert rarray.nbytes == expected + + # Check dtype + assert type(rarray.dtype) == RaggedDtype + + +def test_construct_ragged_array_from_ragged_array(): + rarray = RaggedArray([[1, 2], [], [10, 20, 30], np.nan, [11, 22, 33, 44]], + dtype='int32') + + result = RaggedArray(rarray) + assert_ragged_arrays_equal(result, rarray) + + +def test_construct_ragged_array_fastpath(): + + start_indices = np.array([0, 2, 5, 6, 6, 11], dtype='uint16') + flat_array = np.array( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='float32') + + rarray = RaggedArray( + dict(start_indices=start_indices, flat_array=flat_array)) + + # Check that arrays were accepted unchanged + assert np.array_equal(rarray.start_indices, start_indices) + assert np.array_equal(rarray.flat_array, flat_array) + + # Check interpretation as ragged array + object_array = np.asarray(rarray) + expected_lists = [[0, 1], [2, 3, 4], [5], [], [6, 7, 8, 9, 10], []] + expected_array = np.array([np.array(v, dtype='float32') + for v in expected_lists], dtype='object') + + assert len(object_array) == len(expected_array) + for a1, a2 in zip(object_array, expected_array): + 
def test_validate_ragged_array_fastpath():
    """Invalid ``start_indices``/``flat_array`` dicts raise ValueError.

    Each case overrides one entry of a valid fastpath dict and checks both
    the exception type and its message. The message pattern is passed via
    ``match=`` so that it is always evaluated, regardless of where the
    exception is raised inside the ``with`` block.
    """
    start_indices = np.array([0, 2, 5, 6, 6, 11], dtype='uint16')
    flat_array = np.array(
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='float32')

    valid_dict = dict(start_indices=start_indices, flat_array=flat_array)

    # Valid args
    RaggedArray(valid_dict)

    start_indices_msg = 'start_indices property of a RaggedArray'
    flat_array_msg = 'flat_array property of a RaggedArray'

    # ## start_indices validation ##
    #
    # not ndarray
    with pytest.raises(ValueError, match=start_indices_msg):
        RaggedArray(dict(valid_dict, start_indices=25))

    # not unsigned int
    with pytest.raises(ValueError, match=start_indices_msg):
        RaggedArray(dict(valid_dict,
                         start_indices=start_indices.astype('float32')))

    # not 1d
    with pytest.raises(ValueError, match=start_indices_msg):
        RaggedArray(dict(valid_dict, start_indices=np.array([start_indices])))

    # ## flat_array validation ##
    #
    # not ndarray
    with pytest.raises(ValueError, match=flat_array_msg):
        RaggedArray(dict(valid_dict, flat_array='foo'))

    # not 1d
    with pytest.raises(ValueError, match=flat_array_msg):
        RaggedArray(dict(valid_dict, flat_array=np.array([flat_array])))

    # ## start_indices out of bounds validation ##
    #
    bad_start_indices = start_indices.copy()
    bad_start_indices[-1] = 99
    with pytest.raises(ValueError, match='start_indices must be less than'):
        RaggedArray(dict(valid_dict, start_indices=bad_start_indices))
rarray.start_indices.dtype == np.dtype('uint8') + np.testing.assert_array_equal(rarray.start_indices, [0]) + + # Max uint8 + max_uint8 = np.iinfo('uint8').max + rarray = RaggedArray([np.zeros(max_uint8), []], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint8') + np.testing.assert_array_equal(rarray.start_indices, [0, max_uint8]) + + # Min uint16 + rarray = RaggedArray([np.zeros(max_uint8 + 1), []], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint16') + np.testing.assert_array_equal(rarray.start_indices, [0, max_uint8 + 1]) + + # Max uint16 + max_uint16 = np.iinfo('uint16').max + rarray = RaggedArray([np.zeros(max_uint16), []], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint16') + np.testing.assert_array_equal(rarray.start_indices, [0, max_uint16]) + + # Min uint32 + rarray = RaggedArray([np.zeros(max_uint16 + 1), []], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint32') + np.testing.assert_array_equal(rarray.start_indices, [0, max_uint16 + 1]) + + +@pytest.mark.parametrize('arg,expected', [ + ([np.array([1, 2], dtype='int64')], 'int64'), + ([[True], [False, True]], 'bool'), + (np.array([np.array([1, 2], dtype='int8'), + np.array([1, 2], dtype='int32')]), 'int32'), + ([[3.2], [2]], 'float64'), + ([np.array([3.2], dtype='float16'), + np.array([2], dtype='float32')], 'float32') +]) +def test_flat_array_type_inference(arg, expected): + rarray = RaggedArray(arg) + assert rarray.flat_array.dtype == np.dtype(expected) + + +# isna +# ----- +def test_isna(): + rarray = RaggedArray([[], [1, 3], [10, 20, 30], + None, [11, 22, 33, 44], []], dtype='int32') + + np.testing.assert_array_equal(rarray.isna(), + np.array([True, False, False, True, False, True])) + + +# __getitem__ +# ----------- +def test_get_item_scalar(): + arg = [[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]] + rarray = RaggedArray(arg, dtype='float16') + + # Forward + for i, expected in enumerate(arg): + result = rarray[i] + if 
expected is None: + expected = np.array([], dtype='float16') + + if isinstance(result, np.ndarray): + assert result.dtype == 'float16' + else: + assert np.isnan(result) + + np.testing.assert_array_equal(result, expected) + + # Reversed + for i, expected in enumerate(arg): + result = rarray[i - 5] + if expected is None: + expected = np.array([], dtype='float16') + + if isinstance(result, np.ndarray): + assert result.dtype == 'float16' + else: + assert np.isnan(result) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize('index', [-1000, -6, 5, 1000]) +def test_get_item_scalar_out_of_bounds(index): + rarray = RaggedArray([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]]) + with pytest.raises(IndexError): + rarray[index] + + +def test_get_item_slice(): + arg = [[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]] + rarray = RaggedArray(arg, dtype='int16') + + # Slice everything + assert_ragged_arrays_equal(rarray[:], rarray) + + # Slice all but the first + assert_ragged_arrays_equal( + rarray[1:], RaggedArray(arg[1:], dtype='int16')) + + # Slice all but the last + assert_ragged_arrays_equal( + rarray[:-1], RaggedArray(arg[:-1], dtype='int16')) + + # Slice middle + assert_ragged_arrays_equal( + rarray[2:-1], RaggedArray(arg[2:-1], dtype='int16')) + + # Empty slice + assert_ragged_arrays_equal( + rarray[2:1], RaggedArray(arg[2:1], dtype='int16')) + + +@pytest.mark.parametrize('mask', [ + [1, 1, 1, 1, 1], + [0, 1, 0, 1, 1], + [0, 0, 0, 0, 0] +]) +def test_get_item_mask(mask): + arg = np.array([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]]) + rarray = RaggedArray(arg, dtype='int16') + mask = np.array(mask, dtype='bool') + + assert_ragged_arrays_equal( + rarray[mask], + RaggedArray(arg[mask], dtype='int16')) + + +@pytest.mark.parametrize('inds', [ + [1, 2, 1, 4], + np.array([1, 2, 1, 4]), + [], + np.array([], dtype='int32'), + [4, 3, 2, 1, 0] +]) +def test_get_item_list(inds): + arg = np.array([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]]) 
+ rarray = RaggedArray(arg, dtype='int16') + + assert_ragged_arrays_equal( + rarray[inds], + RaggedArray(arg[inds], dtype='int16')) + + +# _from_factorized +# ---------------- +def test_factorization(): + arg = np.array([[1, 2], [], [1, 2], None, [11, 22, 33, 44]]) + rarray = RaggedArray(arg, dtype='int16') + labels, uniques = rarray.factorize() + + np.testing.assert_array_equal(labels, [0, -1, 0, -1, 1]) + assert_ragged_arrays_equal( + uniques, RaggedArray([[1, 2], [11, 22, 33, 44]], dtype='int16')) + + +# _from_sequence +# -------------- +def test_from_sequence(): + sequence = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + rarray = RaggedArray._from_sequence(sequence) + + assert_ragged_arrays_equal( + rarray, RaggedArray(sequence)) + + +# copy +# ---- +def test_copy(): + # Create reference ragged array + original = RaggedArray._from_sequence( + [[1, 2], [], [1, 2], None, [11, 22, 33, 44]]) + + # Copy reference array + copied = original.copy(deep=True) + + # Make sure arrays are equal + assert_ragged_arrays_equal(original, copied) + + # Modify buffer in original + original.flat_array[0] = 99 + assert original.flat_array[0] == 99 + + # Make sure copy was not modified + assert copied.flat_array[0] == 1 + + +# take +# ---- +def test_take(): + # + rarray = RaggedArray._from_sequence( + [[1, 2], [], [10, 20], None, [11, 22, 33, 44]]) + + # allow_fill False + result = rarray.take([0, 2, 1, -1, -2, 0], allow_fill=False) + expected = RaggedArray( + [[1, 2], [10, 20], [], [11, 22, 33, 44], None, [1, 2]]) + assert_ragged_arrays_equal(result, expected) + + # allow fill True + result = rarray.take([0, 2, 1, -1, -1, 0], allow_fill=True) + expected = RaggedArray( + [[1, 2], [10, 20], [], None, None, [1, 2]]) + assert_ragged_arrays_equal(result, expected) + + +# _concat_same_type +# ----------------- +def test_concat_same_type(): + arg1 = [[1, 2], [], [10, 20], None, [11, 22, 33, 44]] + rarray1 = RaggedArray(arg1, dtype='float32') + + arg2 = [[100, 200], None, [99, 100, 101]] + 
rarray2 = RaggedArray(arg2, dtype='float32') + + arg3 = [None, [27, 28]] + rarray3 = RaggedArray(arg3, dtype='float32') + + result = RaggedArray._concat_same_type([rarray1, rarray2, rarray3]) + expected = RaggedArray(arg1 + arg2 + arg3, dtype='float32') + + assert_ragged_arrays_equal(result, expected) + + +# Test pandas operations +# ---------------------- +def test_pandas_array_construction(): + arg = [[0, 1], [1, 2, 3, 4], None, [-1, -2]] * 2 + ra = pd.array(arg, dtype='ragged[int64]') + + expected = RaggedArray(arg, dtype='int64') + assert_ragged_arrays_equal(ra, expected) + + +def test_series_construction(): + arg = [[0, 1], [1.0, 2, 3.0, 4], None, [-1, -2]] * 2 + rs = pd.Series(arg, dtype='Ragged[int64]') + ra = rs.array + + expected = RaggedArray(arg, dtype='int64') + assert_ragged_arrays_equal(ra, expected) + + +def test_concat_series(): + arg1 = [[1, 2], [], [10, 20], None, [11, 22, 33, 44]] + s1 = pd.Series(arg1, dtype='ragged[int16]') + + arg2 = [[100, 200], None, [99, 100, 101]] + s2 = pd.Series(arg2, dtype='ragged[int16]') + + arg3 = [None, [27, 28]] + s3 = pd.Series(arg3, dtype='ragged[int16]') + + s_concat = pd.concat([s1, s2, s3]) + + expected = pd.Series(arg1+arg2+arg3, + dtype='ragged[int16]', + index=[0, 1, 2, 3, 4, 0, 1, 2, 0, 1]) + + pd.testing.assert_series_equal(s_concat, expected) + + +# Array equality +# -------------- +@pytest.mark.parametrize('scalar', [ + np.array([1, 2]), [1, 2] +]) +def test_array_eq_scalar(scalar): + # Build RaggedArray + arg1 = [[1, 2], [], [1, 2], [1, 3], [11, 22, 33, 44]] + ra = RaggedArray(arg1, dtype='int32') + + # Check equality + result = ra == scalar + expected = np.array([1, 0, 1, 0, 0], dtype='bool') + np.testing.assert_array_equal(result, expected) + + # Check non-equality + result_negated = ra != scalar + expected_negated = ~expected + np.testing.assert_array_equal(result_negated, expected_negated) + + +def test_array_eq_numpy1(): + # Build RaggedArray + arg1 = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + 
+ # Construct arrays + ra = RaggedArray(arg1, dtype='int32') + npa = np.array([[1, 2], [2], [1, 2], None, [10, 20, 30, 40]], + dtype='object') + + # Check equality + result = ra == npa + expected = np.array([1, 0, 1, 1, 0], dtype='bool') + np.testing.assert_array_equal(result, expected) + + # Check non-equality + result_negated = ra != npa + expected_negated = ~expected + np.testing.assert_array_equal(result_negated, expected_negated) + + +def test_array_eq_numpy2d(): + # Construct arrays + ra = RaggedArray([[1, 2], [1], [1, 2], None, [33, 44]], + dtype='int32') + npa = np.array([[1, 2], [2, 3], [1, 2], [0, 1], [11, 22]], + dtype='int32') + + # Check equality + result = ra == npa + expected = np.array([1, 0, 1, 0, 0], dtype='bool') + np.testing.assert_array_equal(result, expected) + + # Check non-equality + result_negated = ra != npa + expected_negated = ~expected + np.testing.assert_array_equal(result_negated, expected_negated) + + +def test_array_eq_ragged(): + # Build RaggedArray + arg1 = [[1, 2], [], [1, 2], [3, 2, 1], [11, 22, 33, 44]] + ra1 = RaggedArray(arg1, dtype='int32') + + # Build RaggedArray + arg2 = [[1, 2], [2, 3, 4, 5], [1, 2], [11, 22, 33], [11]] + ra2 = RaggedArray(arg2, dtype='int32') + + # Check equality + result = ra1 == ra2 + expected = np.array([1, 0, 1, 0, 0], dtype='bool') + np.testing.assert_array_equal(result, expected) + + # Check non-equality + result_negated = ra1 != ra2 + expected_negated = ~expected + np.testing.assert_array_equal(result_negated, expected_negated) + + +@pytest.mark.parametrize('other', [ + 'a string', # Incompatible scalars + 32, + RaggedArray([[0, 1], [2, 3, 4]]), # RaggedArray of wrong length + np.array([[0, 1], [2, 3, 4]], dtype='object'), # 1D array wrong length + np.array([[0, 1], [2, 3]], dtype='int32'), # 2D array wrong row count +]) +def test_equality_validation(other): + # Build RaggedArray + arg1 = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + ra1 = RaggedArray(arg1, dtype='int32') + + # invalid scalar + 
with pytest.raises(ValueError, match="Cannot check equality"): + ra1 == other + + +# Pandas-provided extension array tests +# ------------------------------------- +# See http://pandas-docs.github.io/pandas-docs-travis/extending.html +@pytest.fixture +def dtype(): + """A fixture providing the ExtensionDtype to validate.""" + return RaggedDtype() + + +@pytest.fixture +def data(): + """Length-100 array for this type. + * data[0] and data[1] should both be non missing + * data[0] and data[1] should not gbe equal + """ + return RaggedArray( + [[0, 1], [1, 2, 3, 4], [], [-1, -2], []]*20, dtype='float64') + + +@pytest.fixture +def data_repeated(data): + """ + Generate many datasets. + Parameters + ---------- + data : fixture implementing `data` + Returns + ------- + Callable[[int], Generator]: + A callable that takes a `count` argument and + returns a generator yielding `count` datasets. + """ + def gen(count): + for _ in range(count): + yield data + return gen + + +@pytest.fixture +def data_missing(): + """Length-2 array with [NA, Valid]""" + return RaggedArray([[], [-1, 0, 1]], dtype='int16') + + +@pytest.fixture(params=['data', 'data_missing']) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == 'data': + return data + elif request.param == 'data_missing': + return data_missing + + +@pytest.fixture +def data_for_sorting(): + """Length-3 array with a known sort order. + This should be three items [B, C, A] with + A < B < C + """ + return RaggedArray([[1, 0], [2, 0], [0, 0]]) + + +@pytest.fixture +def data_missing_for_sorting(): + """Length-3 array with a known sort order. + This should be three items [B, NA, A] with + A < B and NA missing. + """ + return RaggedArray([[1, 0], [], [0, 0]]) + + +@pytest.fixture +def data_for_grouping(): + """Data for factorization, grouping, and unique tests. 
+ Expected to be like [B, B, NA, NA, A, A, B, C] + Where A < B < C and NA is missing + """ + return RaggedArray( + [[1, 0], [1, 0], [], [], [0, 0], [0, 0], [1, 0], [2, 0]]) + + +@pytest.fixture +def na_cmp(): + return lambda x, y: (np.isscalar(x) and np.isnan(x) and + np.isscalar(y) and np.isnan(y)) + + +@pytest.fixture +def na_value(): + return np.nan + + +# Subclass BaseDtypeTests to run pandas-provided extension array test suite +class TestRaggedConstructors(eb.BaseConstructorsTests): + pass + + +class TestRaggedDtype(eb.BaseDtypeTests): + pass + + +class TestRaggedGetitem(eb.BaseGetitemTests): + + # Override testing methods that assume extension array scalars are + # comparable using `==`. Replace with assert_array_equal. + # + # If pandas introduces a way to customize element equality tests + # these overrides should be removed. + def test_get(self, data): + # GH 20882 + s = pd.Series(data, index=[2 * i for i in range(len(data))]) + np.testing.assert_array_equal(s.get(4), s.iloc[2]) + + result = s.get([4, 6]) + expected = s.iloc[[2, 3]] + self.assert_series_equal(result, expected) + + result = s.get(slice(2)) + expected = s.iloc[[0, 1]] + self.assert_series_equal(result, expected) + + assert s.get(-1) is None + assert s.get(s.index.max() + 1) is None + + s = pd.Series(data[:6], index=list('abcdef')) + np.testing.assert_array_equal(s.get('c'), s.iloc[2]) + + result = s.get(slice('b', 'd')) + expected = s.iloc[[1, 2, 3]] + self.assert_series_equal(result, expected) + + result = s.get('Z') + assert result is None + + np.testing.assert_array_equal(s.get(4), s.iloc[4]) + np.testing.assert_array_equal(s.get(-1), s.iloc[-1]) + assert s.get(len(s)) is None + + def test_take_sequence(self, data): + result = pd.Series(data)[[0, 1, 3]] + np.testing.assert_array_equal(result.iloc[0], data[0]) + np.testing.assert_array_equal(result.iloc[1], data[1]) + np.testing.assert_array_equal(result.iloc[2], data[3]) + + def test_take(self, data, na_value, na_cmp): + result = 
data.take([0, -1]) + np.testing.assert_array_equal(result.dtype, data.dtype) + np.testing.assert_array_equal(result[0], data[0]) + np.testing.assert_array_equal(result[1], data[-1]) + + result = data.take([0, -1], allow_fill=True, fill_value=na_value) + np.testing.assert_array_equal(result[0], data[0]) + assert na_cmp(result[1], na_value) + + with pytest.raises(IndexError, match="out of bounds"): + data.take([len(data) + 1]) + + +class TestRaggedGroupby(eb.BaseGroupbyTests): + @pytest.mark.parametrize('op', [ + lambda x: 1, + lambda x: [1] * len(x), + # # Op below causes a: + # # ValueError: Names should be list-like for a MultiIndex + # lambda x: pd.Series([1] * len(x)), + lambda x: x, + ], ids=[ + 'scalar', + 'list', + # 'series', + 'object']) + def test_groupby_extension_apply(self, data_for_grouping, op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + df.groupby("B").apply(op) + df.groupby("B").A.apply(op) + df.groupby("A").apply(op) + df.groupby("A").B.apply(op) + + +class TestRaggedInterface(eb.BaseInterfaceTests): + # Add array equality + def test_array_interface(self, data): + result = np.array(data) + np.testing.assert_array_equal(result[0], data[0]) + + result = np.array(data, dtype=object) + expected = np.array(list(data), dtype=object) + + for a1, a2 in zip(result, expected): + if np.isscalar(a1): + assert np.isnan(a1) and np.isnan(a2) + else: + tm.assert_numpy_array_equal(a2, a1) + + +class TestRaggedMethods(eb.BaseMethodsTests): + + # # AttributeError: 'RaggedArray' object has no attribute 'value_counts' + @pytest.mark.skip(reason="value_counts not supported") + def test_value_counts(self): + pass + + # Add array equality + @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) + @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + def test_unique(self, data, box, method): + duplicated = box(data._from_sequence([data[0], data[0]])) + + result = method(duplicated) + + assert len(result) == 1 + 
assert isinstance(result, type(data)) + np.testing.assert_array_equal(result[0], duplicated[0]) + + # Pandas raises + # ValueError: invalid fill value with a + @pytest.mark.skip(reason="pandas cannot fill with ndarray") + def test_fillna_copy_frame(self): + pass + + @pytest.mark.skip(reason="pandas cannot fill with ndarray") + def test_fillna_copy_series(self): + pass + + # Ragged array elements don't support binary operators + @pytest.mark.skip(reason="ragged does not support <= on elements") + def test_combine_le(self): + pass + + @pytest.mark.skip(reason="ragged does not support + on elements") + def test_combine_add(self): + pass + + # Block manager error: + # ValueError: setting an array element with a sequence. + @pytest.mark.skip(reason="combine_first not supported") + def test_combine_first(self): + pass + + +class TestRaggedPrinting(eb.BasePrintingTests): + pass + + +class TestRaggedMissing(eb.BaseMissingTests): + # Pandas doesn't like using an ndarray as fill value. + # Errors like: + # ValueError: invalid fill value with a + @pytest.mark.skip(reason="Can't fill with ndarray") + def test_fillna_series(self): + pass + + @pytest.mark.skip(reason="Can't fill with ndarray") + def test_fillna_frame(self): + pass + + +class TestRaggedReshaping(eb.BaseReshapingTests): + pass diff --git a/datashader/tests/test_pandas.py b/datashader/tests/test_pandas.py index 8031c530a..0bb3acc50 100644 --- a/datashader/tests/test_pandas.py +++ b/datashader/tests/test_pandas.py @@ -642,6 +642,12 @@ def test_bug_570(): 'y0': [-4, 0, 4], 'y1': [-4, 0, 4], }), ['x0', 'x1'], 'y0', 0), + + # axis1 ragged arrays + (pd.DataFrame({ + 'x': pd.array([[4, 0], [0, -4, 0, 4]], dtype='Ragged[float32]'), + 'y': pd.array([[0, -4], [-4, 0, 4, 0]], dtype='Ragged[float32]') + }), 'x', 'y', 1) ]) def test_line_manual_range(df, x, y, ax): axis = ds.core.LinearAxis() @@ -709,6 +715,12 @@ def test_line_manual_range(df, x, y, ax): 'y0': [-4, 0, 4], 'y1': [-4, 0, 4], }), ['x0', 'x1'], 'y0', 0), + + # 
axis1 ragged arrays + (pd.DataFrame({ + 'x': pd.array([[0, -4, 0], [0, 4, 0]], dtype='Ragged[float32]'), + 'y': pd.array([[-4, 0, 4], [-4, 0, 4]], dtype='Ragged[float32]') + }), 'x', 'y', 1) ]) def test_line_autorange(df, x, y, ax): axis = ds.core.LinearAxis() @@ -808,3 +820,36 @@ def test_line_agg_sum_axis1_none_constant(): out = xr.DataArray(sol, coords=[lincoords, lincoords], dims=['y', 'x']) assert_eq(agg, out) + + +def test_line_autorange_axis1_ragged(): + axis = ds.core.LinearAxis() + lincoords = axis.compute_index( + axis.compute_scale_and_translate((-4., 4.), 9), 9) + + df = pd.DataFrame({ + 'x': pd.array([[4, 0], [0, -4, 0, 4]], dtype='Ragged[float32]'), + 'y': pd.array([[0, -4], [-4, 0, 4, 0]], dtype='Ragged[float32]') + }) + + cvs = ds.Canvas(plot_width=9, plot_height=9) + + agg = cvs.line(df, + 'x', + 'y', + ds.count(), + axis=1) + + sol = np.array([[0, 0, 0, 0, 2, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 1, 0, 0], + [0, 1, 0, 0, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 2], + [0, 1, 0, 0, 0, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype='i4') + + out = xr.DataArray(sol, coords=[lincoords, lincoords], + dims=['y', 'x']) + assert_eq(agg, out) diff --git a/datashader/utils.py b/datashader/utils.py index b4a286f69..fb660b17b 100644 --- a/datashader/utils.py +++ b/datashader/utils.py @@ -12,6 +12,11 @@ import dask.dataframe as dd import datashape +try: + from datashader.datatypes import RaggedDtype +except ImportError: + RaggedDtype = type(None) + ngjit = nb.jit(nopython=True, nogil=True) @@ -369,6 +374,8 @@ def dshape_from_pandas_helper(col): # Pandas stores this as a pytz.tzinfo, but DataShape wants a string tz = str(tz) return datashape.Option(datashape.DateTime(tz=tz)) + elif isinstance(col.dtype, RaggedDtype): + return col.dtype dshape = datashape.CType.from_numpy_dtype(col.dtype) dshape = datashape.string if dshape == datashape.object_ else dshape if dshape in 
(datashape.string, datashape.datetime_): diff --git a/setup.py b/setup.py index a3fa674cb..d3a7b8685 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ 'flake8', 'nbsmoke >=0.2.6', 'fastparquet >=0.1.6', # optional dependency + 'pandas >=0.24.1', # optional ragged array support ], 'examples': [], 'examples_extra':[