From fc148de8f5070c94937be07d664302e718b5673c Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 12 Jan 2019 11:23:51 -0500 Subject: [PATCH 01/45] Fix for pandas 0.24.0rc1 --- datashader/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashader/utils.py b/datashader/utils.py index a95c228ef..d7f6c2725 100644 --- a/datashader/utils.py +++ b/datashader/utils.py @@ -345,7 +345,7 @@ def dshape_from_pandas_helper(col): )) return datashape.Categorical(col.cat.categories.values, type=cat_dshape, - ordered=col.cat.categorical.ordered) + ordered=col.cat.ordered) elif col.dtype.kind == 'M': tz = getattr(col.dtype, 'tz', None) if tz is not None: From 864a2355a93ed9027fb03eb3a509b362abc2f874 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 12 Jan 2019 20:37:49 -0500 Subject: [PATCH 02/45] Initial RaggedArray implementation --- datashader/__init__.py | 3 + datashader/datatypes.py | 361 +++++++++++++++++++++++++++++ datashader/tests/test_datatypes.py | 305 ++++++++++++++++++++++++ 3 files changed, 669 insertions(+) create mode 100644 datashader/datatypes.py create mode 100644 datashader/tests/test_datatypes.py diff --git a/datashader/__init__.py b/datashader/__init__.py index 017672043..747f21161 100644 --- a/datashader/__init__.py +++ b/datashader/__init__.py @@ -15,6 +15,9 @@ except ImportError: pass +# Make ragged pandas extension array available +from . import datatypes + # make pyct's example/data commands available if possible from functools import partial try: diff --git a/datashader/datatypes.py b/datashader/datatypes.py new file mode 100644 index 000000000..bc66c90d7 --- /dev/null +++ b/datashader/datatypes.py @@ -0,0 +1,361 @@ +import numpy as np +from pandas.api.extensions import ExtensionDtype, ExtensionArray +from pandas.core.dtypes.dtypes import register_extension_dtype +from numbers import Integral + + +@register_extension_dtype +class RaggedDtype(ExtensionDtype): + name = 'ragged' + type = np.ndarray + base = np.dtype('O') + + @classmethod + def construct_array_type(cls): + return RaggedArray + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from '{}'" + .format(cls, string)) + + +class RaggedArray(ExtensionArray): + def __init__(self, data, dtype=None): + """ + Construct a RaggedArray + + Parameters + ---------- + data + List or numpy array of lists or numpy arrays + dtype: np.dtype or str or None (default None) + Datatype to use to store underlying values from data. + If none (the default) then dtype will be determined using the + numpy.result_type function + """ + if (isinstance(data, dict) and + all(k in data for k in + ['mask', 'start_indices', 'flat_array'])): + + self._mask = data['mask'] + self._start_indices = data['start_indices'] + self._flat_array = data['flat_array'] + else: + # Compute lengths + index_len = len(data) + buffer_len = sum(len(datum) + if datum is not None + else 0 for datum in data) + + # Compute necessary precision of start_indices array + for nbits in [8, 16, 32, 64]: + start_indices_dtype = 'uint' + str(nbits) + max_supported = np.iinfo(start_indices_dtype).max + if buffer_len <= max_supported: + break + + # infer dtype if not provided + if dtype is None: + dtype = np.result_type(*[np.atleast_1d(v) + for v in data + if v is not None]) + + # Initialize representation arrays + self._mask = np.zeros(index_len, dtype='bool') + self._start_indices = np.zeros(index_len, dtype=start_indices_dtype) + self._flat_array = np.zeros(buffer_len, dtype=dtype) + + # Populate arrays + next_start_ind = 0 + for i, array_el in enumerate(data): + # Check for null values + isnull = array_el is None + + # Compute element length + n = len(array_el) if not isnull else 0 + + # Update mask + self._mask[i] = isnull + + # Update start indices + self._start_indices[i] = next_start_ind + + # Update flat array + self._flat_array[next_start_ind:next_start_ind+n] = array_el + + # increment next start index + next_start_ind += n + + # This is a workaround (hack?) to keep pandas.lib.infer_dtype from + # "raising cannot infer type" ValueError error when calling: + # >>> pd.Series([[0, 1], [1, 2, 3]], dtype='ragged') + self._values = self._flat_array + + @property + def flat_array(self): + """ + numpy array containing concatenation of all nested arrays + + Returns + ------- + np.ndarray + """ + return self._flat_array + + @property + def mask(self): + """ + boolean numpy array the same length as the ragged array where values + of True indicate missing values. + + Returns + ------- + np.ndarray + """ + return self._mask + + @property + def start_indices(self): + """ + integer numpy array the same length as the ragged array where values + represent the index into flat_array where the corresponding ragged + array element begins. + + Returns + ------- + np.ndarray + """ + return self._start_indices + + def __len__(self): + """ + Length of this array + + Returns + ------- + length : int + """ + return len(self._start_indices) + + def __getitem__(self, item): + """ + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + """ + if isinstance(item, Integral): + if item < -len(self) or item >= len(self): + raise IndexError(item) + elif self.mask[item]: + return None + else: + # Convert negative item index + if item < 0: + item = 5 + item + + slice_start = self.start_indices[item] + slice_end = (self.start_indices[item+1] + if item + 1 <= len(self) - 1 + else len(self.flat_array)) + + return self.flat_array[slice_start:slice_end] + + elif type(item) == slice: + data = [] + selected_indices = np.arange(len(self))[item] + + for selected_index in selected_indices: + data.append(self[selected_index]) + + return RaggedArray(data, dtype=self.flat_array.dtype) + + elif isinstance(item, np.ndarray) and item.dtype == 'bool': + data = [] + + for i, m in enumerate(item): + if m: + data.append(self[i]) + + return RaggedArray(data, dtype=self.flat_array.dtype) + else: + raise KeyError(item) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + """ + Construct a new RaggedArray from a sequence of scalars. + + Parameters + ---------- + scalars : Sequence + Each element will be an instance of the scalar type for this + array, ``cls.dtype.type``. + dtype : dtype, optional + Construct for this particular dtype. This should be a Dtype + compatible with the ExtensionArray. + copy : boolean, default False + If True, copy the underlying data. + + Returns + ------- + RaggedArray + """ + return RaggedArray(scalars) + + @classmethod + def _from_factorized(cls, values, original): + """ + Reconstruct an ExtensionArray after factorization. + + Parameters + ---------- + values : ndarray + An integer ndarray with the factorized values. + original : RaggedArray + The original ExtensionArray that factorize was called on. + + See Also + -------- + pandas.factorize + ExtensionArray.factorize + """ + return RaggedArray(values, dtype=original.flat_array.dtype) + + def _values_for_factorize(self): + # Here we return a list of the ragged elements converted into tuples. + # This is very inefficient, but the elements of this list must be + # hashable, and we must be able to reconstruct a new Ragged Array + # from these elements. + # + # Perhaps we could replace these tuples with a class that provides a + # read-only view of an ndarray slice and provides a hash function. + return [tuple(self[i]) if not self.mask[i] else None + for i in range(len(self))], None + + def isna(self): + """ + A 1-D array indicating if each value is missing. + + Returns + ------- + na_values : np.ndarray + boolean ndarray the same length as the ragged array where values + of True represent missing/NA values. + """ + return self.mask + + def take(self, indices, allow_fill=False, fill_value=None): + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of integers + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, default None + Fill value to use for NA-indices when `allow_fill` is True. + + Returns + ------- + RaggedArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + """ + if allow_fill: + sequence = [self[i] if i >= 0 else fill_value + for i in indices] + else: + sequence = [self[i] for i in indices] + + return RaggedArray(sequence, dtype=self.flat_array.dtype) + + def copy(self, deep=False): + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + RaggedArray + """ + data = dict( + mask=self.mask, + flat_array=self.flat_array, + start_indices=self.start_indices) + + if deep: + # Copy underlying numpy arrays + data = {k: v.copy() for k, v in data.items()} + + return RaggedArray(data) + + @classmethod + def _concat_same_type(cls, to_concat): + """ + Concatenate multiple RaggedArray instances + + Parameters + ---------- + to_concat : list of RaggedArray + + Returns + ------- + RaggedArray + """ + # concat masks + mask = np.hstack(ra.mask for ra in to_concat) + + # concat flat_arrays + flat_array = np.hstack(ra.flat_array for ra in to_concat) + + # offset and concat start_indices + offsets = np.hstack([ + [0], + np.cumsum([len(ra.flat_array) for ra in to_concat[:-1]])]) + + start_indices = np.hstack([ra.start_indices + offset + for offset, ra in zip(offsets, to_concat)]) + + return RaggedArray(dict( + mask=mask, flat_array=flat_array, start_indices=start_indices)) + + @property + def dtype(self): + return RaggedDtype + + @property + def nbytes(self): + """ + The number of bytes needed to store this object in memory. + """ + return (self._flat_array.nbytes + + self._start_indices.nbytes + + self._mask.nbytes) diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py new file mode 100644 index 000000000..83b9e7c5c --- /dev/null +++ b/datashader/tests/test_datatypes.py @@ -0,0 +1,305 @@ +import pytest +import numpy as np +import pandas as pd +from datashader.datatypes import RaggedDtype, RaggedArray + + +# Testing helpers +# --------------- +def assert_ragged_arrays_equal(ra1, ra2): + assert np.array_equal(ra1.mask, ra2.mask) + assert np.array_equal(ra1.start_indices, ra2.start_indices) + assert np.array_equal(ra1.flat_array, ra2.flat_array) + assert np.array_equal(ra1.flat_array.dtype, ra2.flat_array.dtype) + + +# Test constructor and properties +# ------------------------------- +def test_construct_ragged_dtype(): + dtype = RaggedDtype() + assert dtype.type == np.ndarray + assert dtype.name == 'ragged' + assert dtype.kind == 'O' + + +def test_construct_ragged_array(): + rarray = RaggedArray([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]], + dtype='int32') + + # Check flat array + assert rarray.flat_array.dtype == 'int32' + assert np.array_equal( + rarray.flat_array, + np.array([1, 2, 10, 20, 30, 11, 22, 33, 44], dtype='int32')) + + # Check start indices + assert rarray.start_indices.dtype == 'uint8' + assert np.array_equal( + rarray.start_indices, + np.array([0, 2, 2, 5, 5], dtype='uint64')) + + # Check mask + assert rarray.mask.dtype == 'bool' + assert np.array_equal( + rarray.mask, + np.array([False, False, False, True, False], dtype='bool')) + + # Check len + assert len(rarray) == 5 + + # Check isna + assert rarray.isna().dtype == 'bool' + assert np.array_equal( + rarray.isna(), [False, False, False, True, False]) + + # Check nbytes + expected = ( + 9 * np.int32().nbytes + # flat_array + 5 * np.uint8().nbytes + # start_indices + 5 # mask + ) + assert rarray.nbytes == expected + + # Check dtype + assert rarray.dtype == RaggedDtype + + +def test_start_indices_dtype(): + # The start_indices dtype should be an unsiged int that is only as large + # as needed to handle the length of the flat array + + # Empty + rarray = RaggedArray([[]], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint8') + assert np.array_equal(rarray.start_indices, [0]) + + # Small + rarray = RaggedArray([[23, 24]], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint8') + assert np.array_equal(rarray.start_indices, [0]) + + # Max uint8 + max_uint8 = np.iinfo('uint8').max + rarray = RaggedArray([np.zeros(max_uint8), []], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint8') + assert np.array_equal(rarray.start_indices, [0, max_uint8]) + + # Min uint16 + rarray = RaggedArray([np.zeros(max_uint8 + 1), []], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint16') + assert np.array_equal(rarray.start_indices, [0, max_uint8 + 1]) + + # Max uint16 + max_uint16 = np.iinfo('uint16').max + rarray = RaggedArray([np.zeros(max_uint16), []], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint16') + assert np.array_equal(rarray.start_indices, [0, max_uint16]) + + # Min uint32 + rarray = RaggedArray([np.zeros(max_uint16 + 1), []], dtype='int64') + assert rarray.start_indices.dtype == np.dtype('uint32') + assert np.array_equal(rarray.start_indices, [0, max_uint16 + 1]) + + +@pytest.mark.parametrize('arg,expected', [ + ([[1, 2]], 'int64'), + ([[True], [False, True]], 'bool'), + (np.array([np.array([1, 2], dtype='int8'), + np.array([1, 2], dtype='int32')]), 'int32'), + ([[3.2], [2]], 'float64'), + ([np.array([3.2], dtype='float16'), + np.array([2], dtype='float32')], 'float32') +]) +def test_flat_array_type_inference(arg, expected): + rarray = RaggedArray(arg) + assert rarray.flat_array.dtype == np.dtype(expected) + + +# __getitem__ +# ----------- +def test_get_item_scalar(): + arg = [[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]] + rarray = RaggedArray(arg, dtype='float16') + + # Forward + for i, expected in enumerate(arg): + result = rarray[i] + if expected is None: + assert result is None + else: + assert result.dtype == 'float16' + assert np.array_equal(result, expected) + + # Reversed + for i, expected in enumerate(arg): + result = rarray[i - 5] + if expected is None: + assert result is None + else: + assert result.dtype == 'float16' + assert np.array_equal(result, expected) + + +@pytest.mark.parametrize('index', [-1000, -6, 5, 1000]) +def test_get_item_scalar_out_of_bounds(index): + rarray = RaggedArray([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]]) + with pytest.raises(IndexError) as e: + result = rarray[index] + + +def test_get_item_slice(): + arg = [[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]] + rarray = RaggedArray(arg, dtype='int16') + + # Slice everything + assert_ragged_arrays_equal(rarray[:], rarray) + + # Slice all but the first + assert_ragged_arrays_equal( + rarray[1:], RaggedArray(arg[1:], dtype='int16')) + + # Slice all but the last + assert_ragged_arrays_equal( + rarray[:-1], RaggedArray(arg[:-1], dtype='int16')) + + # Slice middle + assert_ragged_arrays_equal( + rarray[2:-1], RaggedArray(arg[2:-1], dtype='int16')) + + # Empty slice + assert_ragged_arrays_equal( + rarray[2:1], RaggedArray(arg[2:1], dtype='int16')) + + +@pytest.mark.parametrize('mask', [ + [1, 1, 1, 1, 1], + [0, 1, 0, 1, 1], + [0, 0, 0, 0, 0] +]) +def test_get_item_mask(mask): + arg = np.array([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]]) + rarray = RaggedArray(arg, dtype='int16') + mask = np.array(mask, dtype='bool') + + assert_ragged_arrays_equal( + rarray[mask], + RaggedArray(arg[mask], dtype='int16')) + + +# _from_factorized +# ---------------- +def test_factorization(): + arg = np.array([[1, 2], [], [1, 2], None, [11, 22, 33, 44]]) + rarray = RaggedArray(arg, dtype='int16') + labels, uniques = rarray.factorize() + + assert np.array_equal(labels, [0, 1, 0, -1, 2]) + assert_ragged_arrays_equal( + uniques, RaggedArray([[1, 2], [], [11, 22, 33, 44]], dtype='int16')) + + +# _from_sequence +# -------------- +def test_from_sequence(): + sequence = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + rarray = RaggedArray._from_sequence(sequence) + + assert_ragged_arrays_equal( + rarray, RaggedArray(sequence)) + + +# copy +# ---- +def test_copy(): + # Create reference ragged array + original = RaggedArray._from_sequence( + [[1, 2], [], [1, 2], None, [11, 22, 33, 44]]) + + # Copy reference array + copied = original.copy(deep=True) + + # Make sure arrays are equal + assert_ragged_arrays_equal(original, copied) + + # Modify buffer in original + original.flat_array[0] = 99 + assert original.flat_array[0] == 99 + + # Make sure copy was not modified + assert copied.flat_array[0] == 1 + + +# take +# ---- +def test_take(): + # + rarray = RaggedArray._from_sequence( + [[1, 2], [], [10, 20], None, [11, 22, 33, 44]]) + + # allow_fill False + result = rarray.take([0, 2, 1, -1, -2, 0], allow_fill=False) + expected = RaggedArray( + [[1, 2], [10, 20], [], [11, 22, 33, 44], None, [1, 2]]) + assert_ragged_arrays_equal(result, expected) + + # allow fill True + result = rarray.take([0, 2, 1, -1, -1, 0], allow_fill=True) + expected = RaggedArray( + [[1, 2], [10, 20], [], None, None, [1, 2]]) + assert_ragged_arrays_equal(result, expected) + + +# _concat_same_type +# ----------------- +def test_concat_same_type(): + arg1 = [[1, 2], [], [10, 20], None, [11, 22, 33, 44]] + rarray1 = RaggedArray(arg1, dtype='float32') + + arg2 = [[100, 200], None, [99, 100, 101]] + rarray2 = RaggedArray(arg2, dtype='float32') + + arg3 = [None, [27, 28]] + rarray3 = RaggedArray(arg3, dtype='float32') + + result = RaggedArray._concat_same_type([rarray1, rarray2, rarray3]) + expected = RaggedArray(arg1 + arg2 + arg3, dtype='float32') + + assert_ragged_arrays_equal(result, expected) + + +# Test pandas operations +# ---------------------- +def test_pandas_array_construction(): + arg = [[0, 1], [1, 2, 3, 4], None, [-1, -2]] * 2 + ra = pd.array(arg, dtype='ragged') + + expected = RaggedArray(arg) + assert_ragged_arrays_equal(ra, expected) + + +def test_series_construction(): + arg = [[0, 1], [1, 2, 3, 4], None, [-1, -2]] * 2 + rs = pd.Series(arg, dtype='ragged') + ra = rs.array + + expected = RaggedArray(arg) + assert_ragged_arrays_equal(ra, expected) + + +def test_concat_series(): + arg1 = [[1, 2], [], [10, 20], None, [11, 22, 33, 44]] + s1 = pd.Series(arg1, dtype='ragged') + + arg2 = [[100, 200], None, [99, 100, 101]] + s2 = pd.Series(arg2, dtype='ragged') + + arg3 = [None, [27, 28]] + s3 = pd.Series(arg3, dtype='ragged') + + s_concat = pd.concat([s1, s2, s3]) + + expected = pd.Series(arg1+arg2+arg3, + dtype='ragged', + index=[0, 1, 2, 3, 4, 0, 1, 2, 0, 1]) + + pd.testing.assert_series_equal(s_concat, expected) From 440e207feafee08c13a1a623a1e30013f56528a7 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sun, 13 Jan 2019 18:24:45 -0500 Subject: [PATCH 03/45] Add the extension test suite provided by pandas and fix tests. Something in the fixes for these tests removed the need to for the ._values hack! --- datashader/datatypes.py | 19 +++++---- datashader/tests/test_datatypes.py | 62 +++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 9 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index bc66c90d7..3ef8171f3 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -16,8 +16,8 @@ def construct_array_type(cls): @classmethod def construct_from_string(cls, string): - if string == cls.name: - return cls() + if string == 'ragged': + return RaggedDtype() else: raise TypeError("Cannot construct a '{}' from '{}'" .format(cls, string)) @@ -90,11 +90,6 @@ def __init__(self, data, dtype=None): # increment next start index next_start_ind += n - # This is a workaround (hack?) to keep pandas.lib.infer_dtype from - # "raising cannot infer type" ValueError error when calling: - # >>> pd.Series([[0, 1], [1, 2, 3]], dtype='ragged') - self._values = self._flat_array - @property def flat_array(self): """ @@ -349,7 +344,7 @@ def _concat_same_type(cls, to_concat): @property def dtype(self): - return RaggedDtype + return RaggedDtype() @property def nbytes(self): @@ -359,3 +354,11 @@ def nbytes(self): return (self._flat_array.nbytes + self._start_indices.nbytes + self._mask.nbytes) + + def astype(self, dtype, copy=True): + if isinstance(dtype, RaggedDtype): + if copy: + return self.copy() + return self + + return np.array(self, dtype=dtype, copy=copy) diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index 83b9e7c5c..a9915d762 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -1,6 +1,8 @@ import pytest import numpy as np import pandas as pd +from pandas.tests.extension.base import BaseDtypeTests + from datashader.datatypes import RaggedDtype, RaggedArray @@ -61,7 +63,7 @@ def test_construct_ragged_array(): assert rarray.nbytes == expected # Check dtype - assert rarray.dtype == RaggedDtype + assert type(rarray.dtype) == RaggedDtype def test_start_indices_dtype(): @@ -303,3 +305,61 @@ def test_concat_series(): index=[0, 1, 2, 3, 4, 0, 1, 2, 0, 1]) pd.testing.assert_series_equal(s_concat, expected) + + +# Pandas-provided extension array tests +# ------------------------------------- +# See http://pandas-docs.github.io/pandas-docs-travis/extending.html +@pytest.fixture +def dtype(): + """A fixture providing the ExtensionDtype to validate.""" + return RaggedDtype() + + +@pytest.fixture +def data(): + """Length-100 array for this type. + * data[0] and data[1] should both be non missing + * data[0] and data[1] should not gbe equal + """ + return RaggedArray( + [[0, 1], [1, 2, 3, 4], [], None, [-1, -2]]*20, dtype='float64') + + +@pytest.fixture +def data_missing(): + """Length-2 array with [NA, Valid]""" + return RaggedArray([None, [-1, 0, 1]], dtype='int16') + + +@pytest.fixture +def data_for_sorting(): + """Length-3 array with a known sort order. + This should be three items [B, C, A] with + A < B < C + """ + return RaggedArray([[1, 0], [2, 0], [0, 0]]) + + +@pytest.fixture +def data_missing_for_sorting(): + """Length-3 array with a known sort order. + This should be three items [B, NA, A] with + A < B and NA missing. + """ + return RaggedArray([[1, 0], None, [0, 0]]) + + +@pytest.fixture +def data_for_grouping(): + """Data for factorization, grouping, and unique tests. + Expected to be like [B, B, NA, NA, A, A, B, C] + Where A < B < C and NA is missing + """ + return RaggedArray( + [[1, 0], [1, 0], None, None, [0, 0], [0, 0], [1, 0], [2, 0]]) + + +# Subclass BaseDtypeTests to run pandas-provided extension array test suite +class TestRaggedDtype(BaseDtypeTests): + pass From 2f18587154ee694224b8784efdb17ee119988bdd Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sun, 13 Jan 2019 18:26:21 -0500 Subject: [PATCH 04/45] Import register_extension_dtype from pandas public location --- datashader/datatypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 3ef8171f3..ddb47d72c 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -1,6 +1,6 @@ import numpy as np -from pandas.api.extensions import ExtensionDtype, ExtensionArray -from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.api.extensions import ( + ExtensionDtype, ExtensionArray, register_extension_dtype) from numbers import Integral From 5f46b8e85d6ddac4157ed4b047ed72a079b1aeab Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 10:28:51 -0500 Subject: [PATCH 05/45] Fix copy/paste error --- datashader/datatypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index ddb47d72c..2324d86e9 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -156,7 +156,7 @@ def __getitem__(self, item): else: # Convert negative item index if item < 0: - item = 5 + item + item = len(self) + item slice_start = self.start_indices[item] slice_end = (self.start_indices[item+1] From a6b3c27447bb4c99d5640df094ac99dc1126aa92 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 10:29:15 -0500 Subject: [PATCH 06/45] KeyError -> IndexError --- datashader/datatypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 2324d86e9..ac57edd02 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -183,7 +183,7 @@ def __getitem__(self, item): return RaggedArray(data, dtype=self.flat_array.dtype) else: - raise KeyError(item) + raise IndexError(item) @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): From fbc50659ead54e466867081839bcbd6cd4e75407 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 12:04:07 -0500 Subject: [PATCH 07/45] Document, validate, and test fast-path RaggedArray construction from start_indices, flat_array, and mask arrays --- datashader/datatypes.py | 105 +++++++++++++++++++++++++++-- datashader/tests/test_datatypes.py | 101 +++++++++++++++++++++++++++ 2 files changed, 199 insertions(+), 7 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index ac57edd02..743810663 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -4,6 +4,79 @@ from numbers import Integral +def _validate_ragged_properties(data): + """ + Validate that dict contains the necessary properties to construct a + RaggedArray. + + Parameters + ---------- + data: dict + A dict containing 'mask', 'start_indices', and 'flat_array' keys + with numpy array values + + Raises + ------ + ValueError: + if input contains invalid or incompatible properties + """ + # Validate mask + mask = data['mask'] + + if (not isinstance(mask, np.ndarray) or + mask.dtype != 'bool' or + mask.ndim != 1): + raise ValueError(""" +The mask property of a RaggedArray must be a 1D numpy array with dtype=='bool' + Received value of type {typ}: {v}""".format( + typ=type(mask), v=repr(mask))) + + # Validate start_indices + start_indices = data['start_indices'] + + if (not isinstance(start_indices, np.ndarray) or + start_indices.dtype.kind != 'u' or + start_indices.ndim != 1): + raise ValueError(""" +The start_indices property of a RaggedArray must be a 1D numpy array of +unsigned integers (start_indices.dtype.kind == 'u') + Received value of type {typ}: {v}""".format( + typ=type(start_indices), v=repr(start_indices))) + + if len(mask) != len(start_indices): + raise ValueError(""" +The length of the mask and start_indices arrays must be equal + len(mask): {mask_len} + len(start_indices): {start_indices_len}""".format( + mask_len=len(mask), start_indices_len=len(start_indices))) + + # Validate flat_array + flat_array = data['flat_array'] + + if (not isinstance(flat_array, np.ndarray) or + flat_array.ndim != 1): + raise ValueError(""" +The flat_array property of a RaggedArray must be a 1D numpy array + Received value of type {typ}: {v}""".format( + typ=type(flat_array), v=repr(flat_array))) + + # Validate start_indices values + # We don't need to check start_indices < 0 because we already know that it + # has an unsigned integer datatype + # + # Note that start_indices[i] == len(flat_array) is valid as it represents + # and empty array element at the end of the ragged array. + invalid_inds = start_indices > len(flat_array) + + if invalid_inds.any(): + some_invalid_vals = start_indices[invalid_inds[:10]] + + raise ValueError(""" +Elements of start_indices must be less than the length of flat_array ({m}) + Invalid values include: {vals}""".format( + m=len(flat_array), vals=repr(some_invalid_vals))) + + @register_extension_dtype class RaggedDtype(ExtensionDtype): name = 'ragged' @@ -30,17 +103,35 @@ def __init__(self, data, dtype=None): Parameters ---------- - data - List or numpy array of lists or numpy arrays + data: + * list or 1D-array: A List or 1D array of lists or 1D arrays that + should be represented by the RaggedArray + + * dict: A dict containing 'mask', 'start_indices', + and 'flat_array' keys with numpy array values where: + - mask: boolean numpy array the same length as the + ragged array where values of True indicate + missing values + - flat_array: numpy array containing concatenation + of all nested arrays to be represented + by this ragged array + - start_indices: unsiged integer numpy array the same + length as the ragged array where values + represent the index into flat_array where + the corresponding ragged array element + begins + dtype: np.dtype or str or None (default None) Datatype to use to store underlying values from data. If none (the default) then dtype will be determined using the - numpy.result_type function + numpy.result_type function. """ if (isinstance(data, dict) and all(k in data for k in ['mask', 'start_indices', 'flat_array'])): + _validate_ragged_properties(data) + self._mask = data['mask'] self._start_indices = data['start_indices'] self._flat_array = data['flat_array'] @@ -105,7 +196,7 @@ def flat_array(self): def mask(self): """ boolean numpy array the same length as the ragged array where values - of True indicate missing values. + of True indicate missing values Returns ------- @@ -116,9 +207,9 @@ def mask(self): @property def start_indices(self): """ - integer numpy array the same length as the ragged array where values - represent the index into flat_array where the corresponding ragged - array element begins. + unsiged integer numpy array the same length as the ragged array where + values represent the index into flat_array where the corresponding + ragged array element begins Returns ------- diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index a9915d762..f0ad03837 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -66,6 +66,107 @@ def test_construct_ragged_array(): assert type(rarray.dtype) == RaggedDtype +def test_construct_ragged_array_fastpath(): + + mask = np.array([False, False, False, True, False, False]) + start_indices = np.array([0, 2, 5, 6, 6, 11], dtype='uint16') + flat_array = np.array( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='float32') + + rarray = RaggedArray( + dict(mask=mask, start_indices=start_indices, flat_array=flat_array)) + + # Check that arrays were accepted unchanged + assert np.array_equal(rarray.mask, mask) + assert np.array_equal(rarray.start_indices, start_indices) + assert np.array_equal(rarray.flat_array, flat_array) + + # Check interpretation as ragged array + object_array = np.asarray(rarray) + expected_lists = [[0, 1], [2, 3, 4], [5], None, [6, 7, 8, 9, 10], []] + expected_array = np.array([np.array(v, dtype='float32') + if v is not None else None + for v in expected_lists], dtype='object') + + assert len(object_array) == len(expected_array) + for a1, a2 in zip(object_array, expected_array): + assert np.array_equal(a1, a2) + + +def test_validate_ragged_array_fastpath(): + mask = np.array([False, False, False, True, False, False]) + start_indices = np.array([0, 2, 5, 6, 6, 11], dtype='uint16') + flat_array = np.array( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='float32') + + valid_dict = dict( + mask=mask, start_indices=start_indices, flat_array=flat_array) + + # Valid args + RaggedArray(valid_dict) + + # ## mask validation ## + # + # not ndarray + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, mask=25)) + ve.match('mask property of a RaggedArray') + + # not boolean + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, mask=mask.astype('float32'))) + ve.match('mask property of a RaggedArray') + + # not 1d + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, mask=np.array([mask]))) + ve.match('mask property of a RaggedArray') + + # ## start_indices validation ## + # + # not ndarray + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, start_indices=25)) + ve.match('start_indices property of a RaggedArray') + + # not unsiged int + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, + start_indices=start_indices.astype('float32'))) + ve.match('start_indices property of a RaggedArray') + + # not 1d + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, start_indices=np.array([start_indices]))) + ve.match('start_indices property of a RaggedArray') + + # ## flat_array validation ## + # + # not ndarray + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, flat_array='foo')) + ve.match('flat_array property of a RaggedArray') + + # not 1d + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, flat_array=np.array([flat_array]))) + ve.match('flat_array property of a RaggedArray') + + # ## matching length validation ## + # + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, start_indices=start_indices[:-1])) + ve.match('length of the mask and start_indices arrays must be equal') + + # ## start_indices out of bounds validation ## + # + bad_start_indices = start_indices.copy() + bad_start_indices[-1] = 99 + with pytest.raises(ValueError) as ve: + RaggedArray(dict(valid_dict, start_indices=bad_start_indices)) + ve.match('start_indices must be less than') + + def test_start_indices_dtype(): # The start_indices dtype should be an unsiged int that is only as large # as needed to handle the length of the flat array From 527e9d64f587fe615c0eca46b70c26ba0299e03e Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 14:35:36 -0500 Subject: [PATCH 08/45] Support indexing RaggedArray with a list --- datashader/datatypes.py | 2 ++ datashader/tests/test_datatypes.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 743810663..fd8b87807 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -273,6 +273,8 @@ def __getitem__(self, item): data.append(self[i]) return RaggedArray(data, dtype=self.flat_array.dtype) + elif isinstance(item, (list, np.ndarray)): + return self.take(item, allow_fill=False) else: raise IndexError(item) diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index f0ad03837..c1d099a77 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -289,6 +289,22 @@ def test_get_item_mask(mask): RaggedArray(arg[mask], dtype='int16')) +@pytest.mark.parametrize('inds', [ + [1, 2, 1, 4], + np.array([1, 2, 1, 4]), + [], + np.array([], dtype='int32'), + [4, 3, 2, 1, 0] +]) +def test_get_item_list(inds): + arg = np.array([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]]) + rarray = RaggedArray(arg, dtype='int16') + + assert_ragged_arrays_equal( + rarray[inds], + RaggedArray(arg[inds], dtype='int16')) + + # _from_factorized # ---------------- def test_factorization(): From 8d1c34bcb194337e2ee3b9d267a289a559a77df4 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 15:08:32 -0500 Subject: [PATCH 09/45] Create single RaggedDtype() instance per RaggedArray --- datashader/datatypes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index fd8b87807..91f538a78 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -126,6 +126,7 @@ def __init__(self, data, dtype=None): If none (the default) then dtype will be determined using the numpy.result_type function. """ + self._dtype = RaggedDtype() if (isinstance(data, dict) and all(k in data for k in ['mask', 'start_indices', 'flat_array'])): @@ -437,7 +438,7 @@ def _concat_same_type(cls, to_concat): @property def dtype(self): - return RaggedDtype() + return self._dtype @property def nbytes(self): From dad6cc29ea37ba4ac4a0ff69d4a6658b731b343f Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 15:13:20 -0500 Subject: [PATCH 10/45] Allow astype() to cast RaggedArray to other extension array types --- datashader/datatypes.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 91f538a78..9d253abb1 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -3,6 +3,9 @@ ExtensionDtype, ExtensionArray, register_extension_dtype) from numbers import Integral +from pandas.api.types import pandas_dtype +from pandas.core.dtypes.common import is_extension_array_dtype + def _validate_ragged_properties(data): """ @@ -450,9 +453,14 @@ def nbytes(self): self._mask.nbytes) def astype(self, dtype, copy=True): + + dtype = pandas_dtype(dtype) if isinstance(dtype, RaggedDtype): if copy: return self.copy() return self + elif is_extension_array_dtype(dtype): + dtype.construct_array_type()._from_sequence(np.asarray(self)) + return np.array(self, dtype=dtype, copy=copy) From fff0c3ec522c38ae1a254c5e135c4948e2d3dc3b Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 15:21:33 -0500 Subject: [PATCH 11/45] Allow RaggedArray constructor to accept a RaggedArray to copy --- datashader/datatypes.py | 7 ++++++- datashader/tests/test_datatypes.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 9d253abb1..2d2ad5be1 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -106,7 +106,7 @@ def __init__(self, data, dtype=None): Parameters ---------- - data: + data: list or array or dict or RaggedArray * list or 1D-array: A List or 1D array of lists or 1D arrays that should be represented by the RaggedArray @@ -123,6 +123,7 @@ def __init__(self, data, dtype=None): represent the index into flat_array where the corresponding ragged array element begins + * RaggedArray: A RaggedArray instance to copy dtype: np.dtype or str or None (default None) Datatype to use to store underlying values from data. @@ -139,6 +140,10 @@ def __init__(self, data, dtype=None): self._mask = data['mask'] self._start_indices = data['start_indices'] self._flat_array = data['flat_array'] + elif isinstance(data, RaggedArray): + self._mask = data.mask.copy() + self._flat_array = data.flat_array.copy() + self._start_indices = data.start_indices.copy() else: # Compute lengths index_len = len(data) diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index c1d099a77..ab4a36a4c 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -14,6 +14,10 @@ def assert_ragged_arrays_equal(ra1, ra2): assert np.array_equal(ra1.flat_array, ra2.flat_array) assert np.array_equal(ra1.flat_array.dtype, ra2.flat_array.dtype) + # Make sure ragged elements are equal when iterated over + for a1, a2 in zip(ra1, ra2): + assert np.array_equal(a1, a2) + # Test constructor and properties # ------------------------------- @@ -66,6 +70,14 @@ def test_construct_ragged_array(): assert type(rarray.dtype) == RaggedDtype +def test_construct_ragged_array_from_ragged_array(): + rarray = RaggedArray([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]], + dtype='int32') + + result = RaggedArray(rarray) + assert_ragged_arrays_equal(result, rarray) + + def test_construct_ragged_array_fastpath(): mask = np.array([False, False, False, True, False, False]) From 478b65532870183abd30ccd9edadd89e31d330ea Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 18:47:52 -0500 Subject: [PATCH 12/45] Remove mask property and consider missing to be equivalent to empty --- datashader/datatypes.py | 84 +++++++++--------------------- datashader/tests/test_datatypes.py | 83 ++++++++++++----------------- 2 files changed, 57 insertions(+), 110 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 2d2ad5be1..c672291e1 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -15,7 +15,7 @@ def _validate_ragged_properties(data): Parameters ---------- data: dict - A dict containing 'mask', 'start_indices', and 'flat_array' keys + A dict containing 'start_indices', and 'flat_array' keys with numpy array values Raises @@ -23,16 +23,6 @@ def _validate_ragged_properties(data): ValueError: if input contains invalid or incompatible properties """ - # Validate mask - mask = data['mask'] - - if (not isinstance(mask, np.ndarray) or - mask.dtype != 'bool' or - mask.ndim != 1): - raise ValueError(""" -The mask property of a RaggedArray must be a 1D numpy array with dtype=='bool' - Received value of type {typ}: {v}""".format( - typ=type(mask), v=repr(mask))) # Validate start_indices start_indices = data['start_indices'] @@ -46,13 +36,6 @@ def _validate_ragged_properties(data): Received value of type {typ}: {v}""".format( typ=type(start_indices), v=repr(start_indices))) - if len(mask) != len(start_indices): - raise ValueError(""" -The length of the mask and start_indices arrays must be equal - len(mask): {mask_len} - len(start_indices): {start_indices_len}""".format( - mask_len=len(mask), start_indices_len=len(start_indices))) - # Validate flat_array flat_array = data['flat_array'] @@ -99,6 +82,10 @@ def construct_from_string(cls, string): .format(cls, string)) +def missing(v): + return v is None or (np.isscalar(v) and np.isnan(v)) + + class RaggedArray(ExtensionArray): def __init__(self, data, dtype=None): """ @@ -110,11 +97,8 @@ def __init__(self, data, dtype=None): * list or 1D-array: A List or 1D array of lists or 1D arrays that should be represented by the RaggedArray - * dict: A dict containing 'mask', 'start_indices', - and 'flat_array' keys with numpy array values where: - - mask: boolean numpy array the same length as the - ragged array where values of True indicate - missing values + * dict: A dict containing 'start_indices' and 'flat_array' keys + with numpy array values where: - flat_array: numpy array containing concatenation of all nested arrays to be represented by this ragged array @@ -133,22 +117,20 @@ def __init__(self, data, dtype=None): self._dtype = RaggedDtype() if (isinstance(data, dict) and all(k in data for k in - ['mask', 'start_indices', 'flat_array'])): + ['start_indices', 'flat_array'])): _validate_ragged_properties(data) - self._mask = data['mask'] self._start_indices = data['start_indices'] self._flat_array = data['flat_array'] elif isinstance(data, RaggedArray): - self._mask = data.mask.copy() self._flat_array = data.flat_array.copy() self._start_indices = data.start_indices.copy() else: # Compute lengths index_len = len(data) buffer_len = sum(len(datum) - if datum is not None + if not missing(datum) else 0 for datum in data) # Compute necessary precision of start_indices array @@ -162,24 +144,17 @@ def __init__(self, data, dtype=None): if dtype is None: dtype = np.result_type(*[np.atleast_1d(v) for v in data - if v is not None]) + if not missing(v)]) # Initialize representation arrays - self._mask = np.zeros(index_len, dtype='bool') self._start_indices = np.zeros(index_len, dtype=start_indices_dtype) self._flat_array = np.zeros(buffer_len, dtype=dtype) # Populate arrays next_start_ind = 0 for i, array_el in enumerate(data): - # Check for null values - isnull = array_el is None - # Compute element length - n = len(array_el) if not isnull else 0 - - # Update mask - self._mask[i] = isnull + n = len(array_el) if not missing(array_el) else 0 # Update start indices self._start_indices[i] = next_start_ind @@ -201,18 +176,6 @@ def flat_array(self): """ return self._flat_array - @property - def mask(self): - """ - boolean numpy array the same length as the ragged array where values - of True indicate missing values - - Returns - ------- - np.ndarray - """ - return self._mask - @property def start_indices(self): """ @@ -251,8 +214,6 @@ def __getitem__(self, item): if isinstance(item, Integral): if item < -len(self) or item >= len(self): raise IndexError(item) - elif self.mask[item]: - return None else: # Convert negative item index if item < 0: @@ -336,8 +297,7 @@ def _values_for_factorize(self): # # Perhaps we could replace these tuples with a class that provides a # read-only view of an ndarray slice and provides a hash function. - return [tuple(self[i]) if not self.mask[i] else None - for i in range(len(self))], None + return [tuple(self[i]) for i in range(len(self))], None def isna(self): """ @@ -349,7 +309,11 @@ def isna(self): boolean ndarray the same length as the ragged array where values of True represent missing/NA values. """ - return self.mask + stop_indices = np.hstack([self.start_indices[1:], + [len(self.flat_array)]]) + + element_lengths = stop_indices - self.start_indices + return element_lengths == 0 def take(self, indices, allow_fill=False, fill_value=None): """ @@ -383,6 +347,11 @@ def take(self, indices, allow_fill=False, fill_value=None): When the indices are out of bounds for the array. """ if allow_fill: + invalid_inds = [i for i in indices if i < -1] + if invalid_inds: + raise ValueError(""" +Invalid indices for take with allow_fill True: {inds}""".format( + inds=invalid_inds[:9])) sequence = [self[i] if i >= 0 else fill_value for i in indices] else: @@ -404,7 +373,6 @@ def copy(self, deep=False): RaggedArray """ data = dict( - mask=self.mask, flat_array=self.flat_array, start_indices=self.start_indices) @@ -427,9 +395,6 @@ def _concat_same_type(cls, to_concat): ------- RaggedArray """ - # concat masks - mask = np.hstack(ra.mask for ra in to_concat) - # concat flat_arrays flat_array = np.hstack(ra.flat_array for ra in to_concat) @@ -442,7 +407,7 @@ def _concat_same_type(cls, to_concat): for offset, ra in zip(offsets, to_concat)]) return RaggedArray(dict( - mask=mask, flat_array=flat_array, start_indices=start_indices)) + flat_array=flat_array, start_indices=start_indices)) @property def dtype(self): @@ -454,8 +419,7 @@ def nbytes(self): The number of bytes needed to store this object in memory. """ return (self._flat_array.nbytes + - self._start_indices.nbytes + - self._mask.nbytes) + self._start_indices.nbytes) def astype(self, dtype, copy=True): diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index ab4a36a4c..f5785eaab 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -9,7 +9,6 @@ # Testing helpers # --------------- def assert_ragged_arrays_equal(ra1, ra2): - assert np.array_equal(ra1.mask, ra2.mask) assert np.array_equal(ra1.start_indices, ra2.start_indices) assert np.array_equal(ra1.flat_array, ra2.flat_array) assert np.array_equal(ra1.flat_array.dtype, ra2.flat_array.dtype) @@ -44,25 +43,18 @@ def test_construct_ragged_array(): rarray.start_indices, np.array([0, 2, 2, 5, 5], dtype='uint64')) - # Check mask - assert rarray.mask.dtype == 'bool' - assert np.array_equal( - rarray.mask, - np.array([False, False, False, True, False], dtype='bool')) - # Check len assert len(rarray) == 5 # Check isna assert rarray.isna().dtype == 'bool' assert np.array_equal( - rarray.isna(), [False, False, False, True, False]) + rarray.isna(), [False, True, False, True, False]) # Check nbytes expected = ( 9 * np.int32().nbytes + # flat_array - 5 * np.uint8().nbytes + # start_indices - 5 # mask + 5 * np.uint8().nbytes # start_indices ) assert rarray.nbytes == expected @@ -80,24 +72,21 @@ def test_construct_ragged_array_from_ragged_array(): def test_construct_ragged_array_fastpath(): - mask = np.array([False, False, False, True, False, False]) start_indices = np.array([0, 2, 5, 6, 6, 11], dtype='uint16') flat_array = np.array( [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='float32') rarray = RaggedArray( - dict(mask=mask, start_indices=start_indices, flat_array=flat_array)) + dict(start_indices=start_indices, flat_array=flat_array)) # Check that arrays were accepted unchanged - assert np.array_equal(rarray.mask, mask) assert np.array_equal(rarray.start_indices, start_indices) assert np.array_equal(rarray.flat_array, flat_array) # Check interpretation as ragged array object_array = np.asarray(rarray) - expected_lists = [[0, 1], [2, 3, 4], [5], None, [6, 7, 8, 9, 10], []] + expected_lists = [[0, 1], [2, 3, 4], [5], [], [6, 7, 8, 9, 10], []] expected_array = np.array([np.array(v, dtype='float32') - if v is not None else None for v in expected_lists], dtype='object') assert len(object_array) == len(expected_array) @@ -106,34 +95,15 @@ def test_construct_ragged_array_fastpath(): def test_validate_ragged_array_fastpath(): - mask = np.array([False, False, False, True, False, False]) start_indices = np.array([0, 2, 5, 6, 6, 11], dtype='uint16') flat_array = np.array( [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='float32') - valid_dict = dict( - mask=mask, start_indices=start_indices, flat_array=flat_array) + valid_dict = dict(start_indices=start_indices, flat_array=flat_array) # Valid args RaggedArray(valid_dict) - # ## mask validation ## - # - # not ndarray - with pytest.raises(ValueError) as ve: - RaggedArray(dict(valid_dict, mask=25)) - ve.match('mask property of a RaggedArray') - - # not boolean - with pytest.raises(ValueError) as ve: - RaggedArray(dict(valid_dict, mask=mask.astype('float32'))) - ve.match('mask property of a RaggedArray') - - # not 1d - with pytest.raises(ValueError) as ve: - RaggedArray(dict(valid_dict, mask=np.array([mask]))) - ve.match('mask property of a RaggedArray') - # ## start_indices validation ## # # not ndarray @@ -164,12 +134,6 @@ def test_validate_ragged_array_fastpath(): RaggedArray(dict(valid_dict, flat_array=np.array([flat_array]))) ve.match('flat_array property of a RaggedArray') - # ## matching length validation ## - # - with pytest.raises(ValueError) as ve: - RaggedArray(dict(valid_dict, start_indices=start_indices[:-1])) - ve.match('length of the mask and start_indices arrays must be equal') - # ## start_indices out of bounds validation ## # bad_start_indices = start_indices.copy() @@ -230,6 +194,16 @@ def test_flat_array_type_inference(arg, expected): assert rarray.flat_array.dtype == np.dtype(expected) +# isna +# ----- +def test_isna(): + rarray = RaggedArray([[], [1, 3], [10, 20, 30], + None, [11, 22, 33, 44], []], dtype='int32') + + assert np.array_equal(rarray.isna(), + np.array([True, False, False, True, False, True])) + + # __getitem__ # ----------- def test_get_item_scalar(): @@ -240,19 +214,19 @@ def test_get_item_scalar(): for i, expected in enumerate(arg): result = rarray[i] if expected is None: - assert result is None - else: - assert result.dtype == 'float16' - assert np.array_equal(result, expected) + expected = np.array([], dtype='float16') + + assert result.dtype == 'float16' + assert np.array_equal(result, expected) # Reversed for i, expected in enumerate(arg): result = rarray[i - 5] if expected is None: - assert result is None - else: - assert result.dtype == 'float16' - assert np.array_equal(result, expected) + expected = np.array([], dtype='float16') + + assert result.dtype == 'float16' + assert np.array_equal(result, expected) @pytest.mark.parametrize('index', [-1000, -6, 5, 1000]) @@ -324,7 +298,7 @@ def test_factorization(): rarray = RaggedArray(arg, dtype='int16') labels, uniques = rarray.factorize() - assert np.array_equal(labels, [0, 1, 0, -1, 2]) + assert np.array_equal(labels, [0, 1, 0, 1, 2]) assert_ragged_arrays_equal( uniques, RaggedArray([[1, 2], [], [11, 22, 33, 44]], dtype='int16')) @@ -461,6 +435,15 @@ def data_missing(): return RaggedArray([None, [-1, 0, 1]], dtype='int16') +@pytest.fixture(params=['data', 'data_missing']) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == 'data': + return data + elif request.param == 'data_missing': + return data_missing + + @pytest.fixture def data_for_sorting(): """Length-3 array with a known sort order. From 9d84b3cfbab28cb21256920baed6dca22ced0ba7 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 14 Jan 2019 19:24:25 -0500 Subject: [PATCH 13/45] More test fixes for `[]` being null --- datashader/datatypes.py | 11 +++++++---- datashader/tests/test_datatypes.py | 12 ++++++------ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index c672291e1..51ef02922 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -142,9 +142,12 @@ def __init__(self, data, dtype=None): # infer dtype if not provided if dtype is None: - dtype = np.result_type(*[np.atleast_1d(v) - for v in data - if not missing(v)]) + non_missing = [np.atleast_1d(v) + for v in data if not missing(v)] + if non_missing: + dtype = np.result_type(*non_missing) + else: + dtype = 'float64' # Initialize representation arrays self._start_indices = np.zeros(index_len, dtype=start_indices_dtype) @@ -297,7 +300,7 @@ def _values_for_factorize(self): # # Perhaps we could replace these tuples with a class that provides a # read-only view of an ndarray slice and provides a hash function. - return [tuple(self[i]) for i in range(len(self))], None + return [tuple(self[i]) for i in range(len(self))], () def isna(self): """ diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index f5785eaab..8139d9070 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -298,9 +298,9 @@ def test_factorization(): rarray = RaggedArray(arg, dtype='int16') labels, uniques = rarray.factorize() - assert np.array_equal(labels, [0, 1, 0, 1, 2]) + assert np.array_equal(labels, [0, -1, 0, -1, 1]) assert_ragged_arrays_equal( - uniques, RaggedArray([[1, 2], [], [11, 22, 33, 44]], dtype='int16')) + uniques, RaggedArray([[1, 2], [11, 22, 33, 44]], dtype='int16')) # _from_sequence @@ -426,13 +426,13 @@ def data(): * data[0] and data[1] should not gbe equal """ return RaggedArray( - [[0, 1], [1, 2, 3, 4], [], None, [-1, -2]]*20, dtype='float64') + [[0, 1], [1, 2, 3, 4], [], [-1, -2], []]*20, dtype='float64') @pytest.fixture def data_missing(): """Length-2 array with [NA, Valid]""" - return RaggedArray([None, [-1, 0, 1]], dtype='int16') + return RaggedArray([[], [-1, 0, 1]], dtype='int16') @pytest.fixture(params=['data', 'data_missing']) @@ -459,7 +459,7 @@ def data_missing_for_sorting(): This should be three items [B, NA, A] with A < B and NA missing. """ - return RaggedArray([[1, 0], None, [0, 0]]) + return RaggedArray([[1, 0], [], [0, 0]]) @pytest.fixture @@ -469,7 +469,7 @@ def data_for_grouping(): Where A < B < C and NA is missing """ return RaggedArray( - [[1, 0], [1, 0], None, None, [0, 0], [0, 0], [1, 0], [2, 0]]) + [[1, 0], [1, 0], [], [], [0, 0], [0, 0], [1, 0], [2, 0]]) # Subclass BaseDtypeTests to run pandas-provided extension array test suite From d71f86631a058c4c9d242eb80b6b45dbe4d66e2e Mon Sep 17 00:00:00 2001 From: "James A. Bednar" Date: Mon, 14 Jan 2019 20:27:19 -0500 Subject: [PATCH 14/45] Update datashader/datatypes.py Co-Authored-By: jonmmease --- datashader/datatypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 51ef02922..2ad3329a4 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -15,7 +15,7 @@ def _validate_ragged_properties(data): Parameters ---------- data: dict - A dict containing 'start_indices', and 'flat_array' keys + A dict containing 'start_indices' and 'flat_array' keys with numpy array values Raises From 4cd7b4c48f9f9d18244b79b17ffd33872a2ef9b9 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Tue, 15 Jan 2019 20:26:46 -0500 Subject: [PATCH 15/45] Add RaggedElement wrapper class for internal pandas operations Add additional ExtensionArray test suites --- datashader/datatypes.py | 169 ++++++++++++++++++-- datashader/tests/test_datatypes.py | 242 +++++++++++++++++++++++++++-- 2 files changed, 382 insertions(+), 29 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 2ad3329a4..0d693e716 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -1,3 +1,5 @@ +from functools import total_ordering + import numpy as np from pandas.api.extensions import ( ExtensionDtype, ExtensionArray, register_extension_dtype) @@ -63,6 +65,48 @@ def _validate_ragged_properties(data): m=len(flat_array), vals=repr(some_invalid_vals))) +# Internal ragged element array wrapper that provides +# equality, ordering, and hashing. +@total_ordering +class RaggedElement(object): + + @staticmethod + def ragged_or_nan(a): + if np.isscalar(a) and np.isnan(a): + return a + else: + return RaggedElement(a) + + @staticmethod + def array_or_nan(a): + if np.isscalar(a) and np.isnan(a): + return a + else: + return a.array + + def __init__(self, array): + self.array = array + + def __hash__(self): + # TODO: Rewrite using self.array directly without tuple + return hash(tuple(self.array)) + + def __eq__(self, other): + if not isinstance(other, RaggedElement): + return False + return np.array_equal(self.array, other.array) + + def __lt__(self, other): + # TODO: Rewrite using self.array directly without tuples + if not isinstance(other, RaggedElement): + return NotImplemented + return tuple(self.array) < tuple(other.array) + + def __repr__(self): + array_repr = repr(self.array) + return array_repr.replace('array', 'ragged_element') + + @register_extension_dtype class RaggedDtype(ExtensionDtype): name = 'ragged' @@ -216,7 +260,7 @@ def __getitem__(self, item): """ if isinstance(item, Integral): if item < -len(self) or item >= len(self): - raise IndexError(item) + raise IndexError("{item} is out of bounds".format(item=item)) else: # Convert negative item index if item < 0: @@ -227,7 +271,9 @@ def __getitem__(self, item): if item + 1 <= len(self) - 1 else len(self.flat_array)) - return self.flat_array[slice_start:slice_end] + return (self.flat_array[slice_start:slice_end] + if slice_end!=slice_start + else np.nan) elif type(item) == slice: data = [] @@ -290,17 +336,113 @@ def _from_factorized(cls, values, original): pandas.factorize ExtensionArray.factorize """ - return RaggedArray(values, dtype=original.flat_array.dtype) + return RaggedArray( + [RaggedElement.array_or_nan(v) for v in values], + dtype=original.flat_array.dtype) + + def _as_ragged_element_array(self): + return np.array([RaggedElement.ragged_or_nan(self[i]) + for i in range(len(self))]) def _values_for_factorize(self): - # Here we return a list of the ragged elements converted into tuples. - # This is very inefficient, but the elements of this list must be - # hashable, and we must be able to reconstruct a new Ragged Array - # from these elements. - # - # Perhaps we could replace these tuples with a class that provides a - # read-only view of an ndarray slice and provides a hash function. - return [tuple(self[i]) for i in range(len(self))], () + return self._as_ragged_element_array(), np.nan + + def _values_for_argsort(self): + return self._as_ragged_element_array() + + def unique(self): + """ + Compute the ExtensionArray of unique values. + + Returns + ------- + uniques : ExtensionArray + """ + from pandas import unique + + uniques = unique(self._as_ragged_element_array()) + return self._from_sequence( + [RaggedElement.array_or_nan(v) for v in uniques], + dtype=self.dtype) + + def shift(self, periods=1, fill_value=None): + # type: (int, object) -> ExtensionArray + """ + Shift values by desired number. + + Override in RaggedArray to handle ndarray fill values + """ + # Note: this implementation assumes that `self.dtype.na_value` can be + # stored in an instance of your ExtensionArray with `self.dtype`. + if not len(self) or periods == 0: + return self.copy() + + if fill_value is None: + fill_value = np.nan + + empty = self._from_sequence( + [fill_value] * min(abs(periods), len(self)), + dtype=self.dtype + ) + if periods > 0: + a = empty + b = self[:-periods] + else: + a = self[abs(periods):] + b = empty + return self._concat_same_type([a, b]) + + def searchsorted(self, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + .. versionadded:: 0.24.0 + + Find the indices into a sorted array `self` (a) such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `self` would be preserved. + + Assuming that `a` is sorted: + + ====== ============================ + `side` returned index `i` satisfies + ====== ============================ + left ``self[i-1] < v <= self[i]`` + right ``self[i-1] <= v < self[i]`` + ====== ============================ + + Parameters + ---------- + value : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `value`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + """ + # Note: the base tests provided by pandas only test the basics. + # We do not test + # 1. Values outside the range of the `data_for_sorting` fixture + # 2. Values between the values in the `data_for_sorting` fixture + # 3. Missing values. + arr = self._as_ragged_element_array() + if isinstance(value, RaggedArray): + search_value = value._as_ragged_element_array() + else: + search_value = RaggedElement(value) + return arr.searchsorted(search_value, side=side, sorter=sorter) def isna(self): """ @@ -358,6 +500,9 @@ def take(self, indices, allow_fill=False, fill_value=None): sequence = [self[i] if i >= 0 else fill_value for i in indices] else: + if len(self) == 0 and len(indices) > 0: + raise IndexError("cannot do a non-empty take") + sequence = [self[i] for i in indices] return RaggedArray(sequence, dtype=self.flat_array.dtype) @@ -435,4 +580,4 @@ def astype(self, dtype, copy=True): elif is_extension_array_dtype(dtype): dtype.construct_array_type()._from_sequence(np.asarray(self)) - return np.array(self, dtype=dtype, copy=copy) + return np.array([v for v in self], dtype=dtype, copy=copy) diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index 8139d9070..5ab4b15c8 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -1,7 +1,8 @@ import pytest import numpy as np import pandas as pd -from pandas.tests.extension.base import BaseDtypeTests +import pandas.tests.extension.base as eb +import pandas.util.testing as tm from datashader.datatypes import RaggedDtype, RaggedArray @@ -15,7 +16,7 @@ def assert_ragged_arrays_equal(ra1, ra2): # Make sure ragged elements are equal when iterated over for a1, a2 in zip(ra1, ra2): - assert np.array_equal(a1, a2) + np.testing.assert_array_equal(a1, a2) # Test constructor and properties @@ -63,7 +64,7 @@ def test_construct_ragged_array(): def test_construct_ragged_array_from_ragged_array(): - rarray = RaggedArray([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]], + rarray = RaggedArray([[1, 2], [], [10, 20, 30], np.nan, [11, 22, 33, 44]], dtype='int32') result = RaggedArray(rarray) @@ -91,7 +92,7 @@ def test_construct_ragged_array_fastpath(): assert len(object_array) == len(expected_array) for a1, a2 in zip(object_array, expected_array): - assert np.array_equal(a1, a2) + np.testing.assert_array_equal(a1, a2) def test_validate_ragged_array_fastpath(): @@ -150,34 +151,34 @@ def test_start_indices_dtype(): # Empty rarray = RaggedArray([[]], dtype='int64') assert rarray.start_indices.dtype == np.dtype('uint8') - assert np.array_equal(rarray.start_indices, [0]) + np.testing.assert_array_equal(rarray.start_indices, [0]) # Small rarray = RaggedArray([[23, 24]], dtype='int64') assert rarray.start_indices.dtype == np.dtype('uint8') - assert np.array_equal(rarray.start_indices, [0]) + np.testing.assert_array_equal(rarray.start_indices, [0]) # Max uint8 max_uint8 = np.iinfo('uint8').max rarray = RaggedArray([np.zeros(max_uint8), []], dtype='int64') assert rarray.start_indices.dtype == np.dtype('uint8') - assert np.array_equal(rarray.start_indices, [0, max_uint8]) + np.testing.assert_array_equal(rarray.start_indices, [0, max_uint8]) # Min uint16 rarray = RaggedArray([np.zeros(max_uint8 + 1), []], dtype='int64') assert rarray.start_indices.dtype == np.dtype('uint16') - assert np.array_equal(rarray.start_indices, [0, max_uint8 + 1]) + np.testing.assert_array_equal(rarray.start_indices, [0, max_uint8 + 1]) # Max uint16 max_uint16 = np.iinfo('uint16').max rarray = RaggedArray([np.zeros(max_uint16), []], dtype='int64') assert rarray.start_indices.dtype == np.dtype('uint16') - assert np.array_equal(rarray.start_indices, [0, max_uint16]) + np.testing.assert_array_equal(rarray.start_indices, [0, max_uint16]) # Min uint32 rarray = RaggedArray([np.zeros(max_uint16 + 1), []], dtype='int64') assert rarray.start_indices.dtype == np.dtype('uint32') - assert np.array_equal(rarray.start_indices, [0, max_uint16 + 1]) + np.testing.assert_array_equal(rarray.start_indices, [0, max_uint16 + 1]) @pytest.mark.parametrize('arg,expected', [ @@ -200,7 +201,7 @@ def test_isna(): rarray = RaggedArray([[], [1, 3], [10, 20, 30], None, [11, 22, 33, 44], []], dtype='int32') - assert np.array_equal(rarray.isna(), + np.testing.assert_array_equal(rarray.isna(), np.array([True, False, False, True, False, True])) @@ -216,8 +217,12 @@ def test_get_item_scalar(): if expected is None: expected = np.array([], dtype='float16') - assert result.dtype == 'float16' - assert np.array_equal(result, expected) + if isinstance(result, np.ndarray): + assert result.dtype == 'float16' + else: + assert np.isnan(result) + + np.testing.assert_array_equal(result, expected) # Reversed for i, expected in enumerate(arg): @@ -225,8 +230,11 @@ def test_get_item_scalar(): if expected is None: expected = np.array([], dtype='float16') - assert result.dtype == 'float16' - assert np.array_equal(result, expected) + if isinstance(result, np.ndarray): + assert result.dtype == 'float16' + else: + assert np.isnan(result) + np.testing.assert_array_equal(result, expected) @pytest.mark.parametrize('index', [-1000, -6, 5, 1000]) @@ -298,7 +306,7 @@ def test_factorization(): rarray = RaggedArray(arg, dtype='int16') labels, uniques = rarray.factorize() - assert np.array_equal(labels, [0, -1, 0, -1, 1]) + np.testing.assert_array_equal(labels, [0, -1, 0, -1, 1]) assert_ragged_arrays_equal( uniques, RaggedArray([[1, 2], [11, 22, 33, 44]], dtype='int16')) @@ -429,6 +437,25 @@ def data(): [[0, 1], [1, 2, 3, 4], [], [-1, -2], []]*20, dtype='float64') +@pytest.fixture +def data_repeated(data): + """ + Generate many datasets. + Parameters + ---------- + data : fixture implementing `data` + Returns + ------- + Callable[[int], Generator]: + A callable that takes a `count` argument and + returns a generator yielding `count` datasets. + """ + def gen(count): + for _ in range(count): + yield data + return gen + + @pytest.fixture def data_missing(): """Length-2 array with [NA, Valid]""" @@ -472,6 +499,187 @@ def data_for_grouping(): [[1, 0], [1, 0], [], [], [0, 0], [0, 0], [1, 0], [2, 0]]) +@pytest.fixture +def na_cmp(): + return lambda x, y: (np.isscalar(x) and np.isnan(x) and + np.isscalar(y) and np.isnan(y)) + + +@pytest.fixture +def na_value(): + return np.nan + + # Subclass BaseDtypeTests to run pandas-provided extension array test suite -class TestRaggedDtype(BaseDtypeTests): +class TestRaggedConstructors(eb.BaseConstructorsTests): + pass + + +class TestRaggedDtype(eb.BaseDtypeTests): + pass + + +class TestRaggedGetitem(eb.BaseGetitemTests): + + # Override testing methods that assume extension array scalars are + # comparable using `==`. Replace with assert_array_equal. + # + # If pandas introduces a way to customize element equality tests + # these overrides should be removed. + def test_get(self, data): + # GH 20882 + s = pd.Series(data, index=[2 * i for i in range(len(data))]) + np.testing.assert_array_equal(s.get(4), s.iloc[2]) + + result = s.get([4, 6]) + expected = s.iloc[[2, 3]] + self.assert_series_equal(result, expected) + + result = s.get(slice(2)) + expected = s.iloc[[0, 1]] + self.assert_series_equal(result, expected) + + assert s.get(-1) is None + assert s.get(s.index.max() + 1) is None + + s = pd.Series(data[:6], index=list('abcdef')) + np.testing.assert_array_equal(s.get('c'), s.iloc[2]) + + result = s.get(slice('b', 'd')) + expected = s.iloc[[1, 2, 3]] + self.assert_series_equal(result, expected) + + result = s.get('Z') + assert result is None + + np.testing.assert_array_equal(s.get(4), s.iloc[4]) + np.testing.assert_array_equal(s.get(-1), s.iloc[-1]) + assert s.get(len(s)) is None + + def test_take_sequence(self, data): + result = pd.Series(data)[[0, 1, 3]] + np.testing.assert_array_equal(result.iloc[0], data[0]) + np.testing.assert_array_equal(result.iloc[1], data[1]) + np.testing.assert_array_equal(result.iloc[2], data[3]) + + def test_take(self, data, na_value, na_cmp): + result = data.take([0, -1]) + np.testing.assert_array_equal(result.dtype, data.dtype) + np.testing.assert_array_equal(result[0], data[0]) + np.testing.assert_array_equal(result[1], data[-1]) + + result = data.take([0, -1], allow_fill=True, fill_value=na_value) + np.testing.assert_array_equal(result[0], data[0]) + assert na_cmp(result[1], na_value) + + with pytest.raises(IndexError, match="out of bounds"): + data.take([len(data) + 1]) + + +class TestRaggedGroupby(eb.BaseGroupbyTests): + @pytest.mark.parametrize('op', [ + lambda x: 1, + lambda x: [1] * len(x), + # # Op below causes a: + # # ValueError: Names should be list-like for a MultiIndex + # lambda x: pd.Series([1] * len(x)), + lambda x: x, + ], ids=[ + 'scalar', + 'list', + # 'series', + 'object']) + def test_groupby_extension_apply(self, data_for_grouping, op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + df.groupby("B").apply(op) + df.groupby("B").A.apply(op) + df.groupby("A").apply(op) + df.groupby("A").B.apply(op) + + +class TestRaggedInterface(eb.BaseInterfaceTests): + # Add array equality + def test_array_interface(self, data): + result = np.array(data) + np.testing.assert_array_equal(result[0], data[0]) + + result = np.array(data, dtype=object) + expected = np.array(list(data), dtype=object) + + for a1, a2 in zip(result, expected): + if np.isscalar(a1): + assert np.isnan(a1) and np.isnan(a2) + else: + tm.assert_numpy_array_equal(a2, a1) + + +class TestRaggedMethods(eb.BaseMethodsTests): + + # AttributeError: 'RaggedArray' object has no attribute 'value_counts' + @pytest.mark.skip(reason="value_counts not supported") + def test_value_counts(self, all_data, dropna): + pass + + # Add array equality + @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) + @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + def test_unique(self, data, box, method): + duplicated = box(data._from_sequence([data[0], data[0]])) + + result = method(duplicated) + + assert len(result) == 1 + assert isinstance(result, type(data)) + np.testing.assert_array_equal(result[0], duplicated[0]) + + # Pandas raises + # ValueError: invalid fill value with a + @pytest.mark.skip(reason="pandas cannot fill with ndarray") + def test_fillna_copy_frame(self, data_missing): + pass + + @pytest.mark.skip(reason="pandas cannot fill with ndarray") + def test_fillna_copy_series(self, data_missing): + pass + + # Ragged array elements don't support binary operators + @pytest.mark.skip(reason="ragged does not support <= on elements") + def test_combine_le(self, data_repeated): + pass + + @pytest.mark.skip(reason="ragged does not support + on elements") + def test_combine_add(self, data_repeated): + pass + + # Block manager error: + # ValueError: setting an array element with a sequence. + @pytest.mark.skip(reason="combine_first not supported") + def test_combine_first(self, data): + pass + + +class TestRaggedPrinting(eb.BasePrintingTests): + pass + + +class TestRaggedMissing(eb.BaseMissingTests): + + # Pandas doesn't like using an ndarray as fill value. + # Errors like: + # ValueError: Length of 'value' does not match. Got (3) expected 2 + @pytest.mark.skip(reason="Can't fill with ndarray") + def test_fillna_scalar(self, data_missing): + pass + + @pytest.mark.skip(reason="Can't fill with ndarray") + def test_fillna_series(self, data_missing): + pass + + @pytest.mark.skip(reason="Can't fill with ndarray") + def test_fillna_frame(self, data_missing): + pass + + +class TestRaggedReshaping(eb.BaseReshapingTests): pass From 16aff67b93d297bf31780449f5e6b40f7fec00d8 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 07:51:32 -0500 Subject: [PATCH 16/45] Override fillna is RaggedArray and enable test --- datashader/datatypes.py | 91 +++++++++++++++++++++++++++++- datashader/tests/test_datatypes.py | 7 +-- 2 files changed, 91 insertions(+), 7 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 0d693e716..9d5f346f5 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -365,13 +365,102 @@ def unique(self): [RaggedElement.array_or_nan(v) for v in uniques], dtype=self.dtype) + def fillna(self, value=None, method=None, limit=None): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, array-like + If a scalar value is passed it is used to fill all missing values. + Alternatively, an array-like 'value' can be given. It's expected + that the array-like have the same length as 'self'. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + Returns + ------- + filled : ExtensionArray with NA/NaN filled + """ + # Override in RaggedArray to handle ndarray fill values + from pandas.api.types import is_array_like + from pandas.util._validators import validate_fillna_kwargs + from pandas.core.missing import pad_1d, backfill_1d + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if isinstance(value, RaggedArray): + if len(value) != len(self): + raise ValueError("Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self))) + value = value[mask] + + if mask.any(): + if method is not None: + func = pad_1d if method == 'pad' else backfill_1d + new_values = func(self.astype(object), limit=limit, + mask=mask) + new_values = self._from_sequence(new_values, dtype=self.dtype) + else: + # fill with value + new_values = list(self) + mask_indices, = np.where(mask) + for ind in mask_indices: + new_values[ind] = value + + new_values = self._from_sequence(new_values, dtype=self.dtype) + else: + new_values = self.copy() + return new_values + def shift(self, periods=1, fill_value=None): # type: (int, object) -> ExtensionArray """ Shift values by desired number. - Override in RaggedArray to handle ndarray fill values + Newly introduced missing values are filled with + ``self.dtype.na_value``. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + periods : int, default 1 + The number of periods to shift. Negative values are allowed + for shifting backwards. + + fill_value : object, optional + The scalar value to use for newly introduced missing values. + The default is ``self.dtype.na_value`` + + .. versionadded:: 0.24.0 + + Returns + ------- + shifted : ExtensionArray + + Notes + ----- + If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is + returned. + + If ``periods > len(self)``, then an array of size + len(self) is returned, with all values filled with + ``self.dtype.na_value``. """ + # Override in RaggedArray to handle ndarray fill values + # Note: this implementation assumes that `self.dtype.na_value` can be # stored in an instance of your ExtensionArray with `self.dtype`. if not len(self) or periods == 0: diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index 5ab4b15c8..20533c377 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -664,14 +664,9 @@ class TestRaggedPrinting(eb.BasePrintingTests): class TestRaggedMissing(eb.BaseMissingTests): - # Pandas doesn't like using an ndarray as fill value. # Errors like: - # ValueError: Length of 'value' does not match. Got (3) expected 2 - @pytest.mark.skip(reason="Can't fill with ndarray") - def test_fillna_scalar(self, data_missing): - pass - + # ValueError: invalid fill value with a @pytest.mark.skip(reason="Can't fill with ndarray") def test_fillna_series(self, data_missing): pass From 5772ade2136906697d0a76ba1a2735be68729b2a Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 07:52:35 -0500 Subject: [PATCH 17/45] Add vectorized equality operators --- datashader/datatypes.py | 183 +++++++++++++++++++++++++++++ datashader/tests/test_datatypes.py | 95 +++++++++++++++ 2 files changed, 278 insertions(+) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 9d5f346f5..a7abf863a 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -212,6 +212,50 @@ def __init__(self, data, dtype=None): # increment next start index next_start_ind += n + def __eq__(self, other): + if isinstance(other, RaggedArray): + if len(other) != len(self): + raise ValueError(""" +Cannot check equality of RaggedArray values of unequal length + len(ra1) == {len_ra1} + len(ra2) == {len_ra2}""".format( + len_ra1=len(self), + len_ra2=len(other))) + + result = _eq_ragged_ragged(self, other) + else: + # Convert other to numpy arrauy + if not isinstance(other, np.ndarray): + other_array = np.asarray(other) + else: + other_array = other + + if other_array.ndim == 1 and other_array.dtype.kind != 'O': + + # Treat as ragged scalar + result = _eq_ragged_scalar(self, other_array) + elif (other_array.ndim == 1 and + other_array.dtype.kind == 'O' and + len(other_array) == len(self)): + + # Treat as vector + result = _eq_ragged_ndarray1d(self, other_array) + elif (other_array.ndim == 2 and + other_array.dtype.kind != 'O' and + other_array.shape[0] == len(self)): + + # Treat rows as ragged elements + result = _eq_ragged_ndarray2d(self, other_array) + else: + raise ValueError(""" +Cannot check equality of RaggedArray of length {ra_len} with: + {other}""".format(ra_len=len(self), other=repr(other))) + + return result + + def __ne__(self, other): + return np.logical_not(self == other) + @property def flat_array(self): """ @@ -670,3 +714,142 @@ def astype(self, dtype, copy=True): dtype.construct_array_type()._from_sequence(np.asarray(self)) return np.array([v for v in self], dtype=dtype, copy=copy) + + +def _eq_ragged_ragged(ra1, ra2): + """ + Compare elements of two ragged arrays of the same length + + Parameters + ---------- + ra1: RaggedArray + ra2: RaggedArray + + Returns + ------- + mask: ndarray + 1D bool array of same length as inputs with elements True when + corresponding elements are equal, False otherwise + """ + start_indices1 = ra1.start_indices + flat_array1 = ra1.flat_array + + start_indices2 = ra2.start_indices + flat_array2 = ra2.flat_array + + n = len(start_indices1) + m1 = len(flat_array1) + m2 = len(flat_array2) + + result = np.zeros(n, dtype=np.bool) + + for i in range(n): + # Extract inds for ra1 + start_index1 = start_indices1[i] + stop_index1 = start_indices1[i + 1] if i < n - 1 else m1 + + # Extract inds for ra2 + start_index2 = start_indices2[i] + stop_index2 = start_indices2[i + 1] if i < n - 1 else m2 + + result[i] = np.array_equal(flat_array1[start_index1:stop_index1], + flat_array2[start_index2:stop_index2]) + + return result + + +def _eq_ragged_scalar(ra, val): + """ + Compare elements of a RaggedArray with a scalar array + + Parameters + ---------- + ra: RaggedArray + val: ndarray + + Returns + ------- + mask: ndarray + 1D bool array of same length as inputs with elements True when + ragged element equals scalar val, False otherwise. + """ + start_indices = ra.start_indices + flat_array = ra.flat_array + + n = len(start_indices) + m = len(flat_array) + result = np.zeros(n, dtype=np.bool) + for i in range(n): + start_index = start_indices[i] + stop_index = start_indices[i+1] if i < n - 1 else m + result[i] = np.array_equal(flat_array[start_index:stop_index], val) + + return result + + +def _eq_ragged_ndarray1d(ra, a): + """ + Compare a RaggedArray with a 1D numpy object array of the same length + + Parameters + ---------- + ra: RaggedArray + a: ndarray + 1D numpy array of same length as ra + + Returns + ------- + mask: ndarray + 1D bool array of same length as input with elements True when + corresponding elements are equal, False otherwise + """ + start_indices = ra.start_indices + flat_array = ra.flat_array + + n = len(start_indices) + m = len(flat_array) + result = np.zeros(n, dtype=np.bool) + for i in range(n): + start_index = start_indices[i] + stop_index = start_indices[i + 1] if i < n - 1 else m + a_val = a[i] + if (a_val is None or + (np.isscalar(a_val) and np.isnan(a_val)) or + len(a_val) == 0): + result[i] = start_index == stop_index + else: + result[i] = np.array_equal(flat_array[start_index:stop_index], + a_val) + + return result + + +def _eq_ragged_ndarray2d(ra, a): + """ + Compare a RaggedArray with rows of a 2D numpy object array + + Parameters + ---------- + ra: RaggedArray + a: ndarray + A 2D numpy array where the length of the first dimension matches the + length of the RaggedArray + + Returns + ------- + mask: ndarray + 1D bool array of same length as input RaggedArray with elements True + when corresponding elements of ra equals corresponding row of a + """ + start_indices = ra.start_indices + flat_array = ra.flat_array + + n = len(start_indices) + m = len(flat_array) + result = np.zeros(n, dtype=np.bool) + for i in range(n): + start_index = start_indices[i] + stop_index = start_indices[i + 1] if i < n - 1 else m + result[i] = np.array_equal(flat_array[start_index:stop_index], + a[i, :]) + return result diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index 20533c377..d4d583074 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -418,6 +418,101 @@ def test_concat_series(): pd.testing.assert_series_equal(s_concat, expected) +# Array equality +# -------------- +@pytest.mark.parametrize('scalar', [ + np.array([1, 2]), [1, 2] +]) +def test_array_eq_scalar(scalar): + # Build RaggedArray + arg1 = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + ra = RaggedArray(arg1, dtype='int32') + + # Check equality + result = ra == scalar + expected = np.array([1, 0, 1, 0, 0], dtype='bool') + np.testing.assert_array_equal(result, expected) + + # Check non-equality + result_negated = ra != scalar + expected_negated = ~expected + np.testing.assert_array_equal(result_negated, expected_negated) + + +def test_array_eq_numpy1(): + # Build RaggedArray + arg1 = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + + # Construct arrays + ra = RaggedArray(arg1, dtype='int32') + npa = np.array(arg1, dtype='object') + + # Check equality + result = ra == npa + expected = np.array([1, 1, 1, 1, 1], dtype='bool') + np.testing.assert_array_equal(result, expected) + + # Check non-equality + result_negated = ra != npa + expected_negated = ~expected + np.testing.assert_array_equal(result_negated, expected_negated) + + +def test_array_eq_numpy2d(): + # Construct arrays + ra = RaggedArray([[1, 2], [], [1, 2], None, [11, 22, 33, 44]], + dtype='int32') + npa = np.array([[1, 2], [2, 3], [1, 2], [0, 1], [11, 22]], + dtype='int32') + + # Check equality + result = ra == npa + expected = np.array([1, 0, 1, 0, 0], dtype='bool') + np.testing.assert_array_equal(result, expected) + + # Check non-equality + result_negated = ra != npa + expected_negated = ~expected + np.testing.assert_array_equal(result_negated, expected_negated) + + +def test_array_eq_ragged(): + # Build RaggedArray + arg1 = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + ra1 = RaggedArray(arg1, dtype='int32') + + # Build RaggedArray + arg2 = [[1, 2], [2, 3, 4, 5], [1, 2], None, [11]] + ra2 = RaggedArray(arg2, dtype='int32') + + # Check equality + result = ra1 == ra2 + expected = np.array([1, 0, 1, 1, 0], dtype='bool') + np.testing.assert_array_equal(result, expected) + + # Check non-equality + result_negated = ra1 != ra2 + expected_negated = ~expected + np.testing.assert_array_equal(result_negated, expected_negated) + + +@pytest.mark.parametrize('other', [ + 'a string', # Incompatible scalars + 32, + RaggedArray([[0, 1], [2, 3, 4]]), # RaggedArray of wrong length + np.array([[0, 1], [2, 3, 4]], dtype='object'), # 1D array wrong length + np.array([[0, 1], [2, 3]], dtype='int32'), # 2D array wrong row count +]) +def test_equality_validation(other): + # Build RaggedArray + arg1 = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + ra1 = RaggedArray(arg1, dtype='int32') + + # invalid scalar + with pytest.raises(ValueError, match="Cannot check equality"): + res = ra1 == other + + # Pandas-provided extension array tests # ------------------------------------- # See http://pandas-docs.github.io/pandas-docs-travis/extending.html From 939405b8fcbe93dec4a9a5b976a4cd3bfed805d7 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 08:01:44 -0500 Subject: [PATCH 18/45] pass start_indices and flat_array arrays as args to _validate_ragged_properties rather than dict key/value pairs. --- datashader/datatypes.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index a7abf863a..eb661823d 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -9,26 +9,28 @@ from pandas.core.dtypes.common import is_extension_array_dtype -def _validate_ragged_properties(data): +def _validate_ragged_properties(start_indices, flat_array): """ - Validate that dict contains the necessary properties to construct a - RaggedArray. + Validate that start_indices are flat_array arrays may be used to + represent a valid RaggedArray. Parameters ---------- - data: dict - A dict containing 'start_indices' and 'flat_array' keys - with numpy array values - + flat_array: numpy array containing concatenation + of all nested arrays to be represented + by this ragged array + start_indices: unsiged integer numpy array the same + length as the ragged array where values + represent the index into flat_array where + the corresponding ragged array element + begins Raises ------ ValueError: - if input contains invalid or incompatible properties + if input arguments are invalid or incompatible properties """ # Validate start_indices - start_indices = data['start_indices'] - if (not isinstance(start_indices, np.ndarray) or start_indices.dtype.kind != 'u' or start_indices.ndim != 1): @@ -39,8 +41,6 @@ def _validate_ragged_properties(data): typ=type(start_indices), v=repr(start_indices))) # Validate flat_array - flat_array = data['flat_array'] - if (not isinstance(flat_array, np.ndarray) or flat_array.ndim != 1): raise ValueError(""" @@ -163,7 +163,9 @@ def __init__(self, data, dtype=None): all(k in data for k in ['start_indices', 'flat_array'])): - _validate_ragged_properties(data) + _validate_ragged_properties( + start_indices=data['start_indices'], + flat_array=data['flat_array']) self._start_indices = data['start_indices'] self._flat_array = data['flat_array'] From 7f355d2c1430538a180e9e72dbb75ef7228ffb68 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 08:09:29 -0500 Subject: [PATCH 19/45] Add copy arg to RaggedArray constructor --- datashader/datatypes.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index eb661823d..2a538f1ae 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -131,7 +131,7 @@ def missing(v): class RaggedArray(ExtensionArray): - def __init__(self, data, dtype=None): + def __init__(self, data, dtype=None, copy=False): """ Construct a RaggedArray @@ -157,6 +157,10 @@ def __init__(self, data, dtype=None): Datatype to use to store underlying values from data. If none (the default) then dtype will be determined using the numpy.result_type function. + copy : bool (default False) + Whether to deep copy the input arrays. Only relevant when `data` + has type `dict` or `RaggedArray`. When data is a `list` or + `array`, input arrays are always copied. """ self._dtype = RaggedDtype() if (isinstance(data, dict) and @@ -169,9 +173,17 @@ def __init__(self, data, dtype=None): self._start_indices = data['start_indices'] self._flat_array = data['flat_array'] + + if copy: + self._start_indices = self._start_indices.copy() + self._flat_array = self._flat_array.copy() elif isinstance(data, RaggedArray): - self._flat_array = data.flat_array.copy() - self._start_indices = data.start_indices.copy() + self._flat_array = data.flat_array + self._start_indices = data.start_indices + + if copy: + self._start_indices = self._start_indices.copy() + self._flat_array = self._flat_array.copy() else: # Compute lengths index_len = len(data) @@ -659,11 +671,7 @@ def copy(self, deep=False): flat_array=self.flat_array, start_indices=self.start_indices) - if deep: - # Copy underlying numpy arrays - data = {k: v.copy() for k, v in data.items()} - - return RaggedArray(data) + return RaggedArray(data, copy=deep) @classmethod def _concat_same_type(cls, to_concat): @@ -690,7 +698,8 @@ def _concat_same_type(cls, to_concat): for offset, ra in zip(offsets, to_concat)]) return RaggedArray(dict( - flat_array=flat_array, start_indices=start_indices)) + flat_array=flat_array, start_indices=start_indices), + copy=False) @property def dtype(self): From 9e449468fe114c17bed911e41fdd54013054f3c2 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 08:10:22 -0500 Subject: [PATCH 20/45] += --- datashader/datatypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 2a538f1ae..f8f4847e7 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -322,7 +322,7 @@ def __getitem__(self, item): else: # Convert negative item index if item < 0: - item = len(self) + item + item += len(self) slice_start = self.start_indices[item] slice_end = (self.start_indices[item+1] From a52728a2a5f7e465a7d84ff3adffcb3ee7330d20 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 08:12:11 -0500 Subject: [PATCH 21/45] Fix missing return --- datashader/datatypes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index f8f4847e7..a1615ddba 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -722,7 +722,8 @@ def astype(self, dtype, copy=True): return self elif is_extension_array_dtype(dtype): - dtype.construct_array_type()._from_sequence(np.asarray(self)) + return dtype.construct_array_type()._from_sequence( + np.asarray(self)) return np.array([v for v in self], dtype=dtype, copy=copy) From 75f914d10541277c93553dc67c67ff47cd3e9cac Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 09:37:26 -0500 Subject: [PATCH 22/45] Parameterize RaggedDtype by element type This way the element dtype can be specified in the ragged datatype string e.g. >>> pd.Series([[1, 2], [2, 3, 4], None], dtype='Ragged[uint16]') Out[13]: 0 [1 2] 1 [2 3 4] 2 NaN dtype: Ragged[uint16] --- datashader/datatypes.py | 80 +++++++++++++++++++++++++++--- datashader/tests/test_datatypes.py | 20 ++++---- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index a1615ddba..365fece9c 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -1,3 +1,4 @@ +import re from functools import total_ordering import numpy as np @@ -112,6 +113,15 @@ class RaggedDtype(ExtensionDtype): name = 'ragged' type = np.ndarray base = np.dtype('O') + _subtype_re = re.compile(r"^ragged\[(?P\w+)\]$") + _metadata = ('_dtype',) + + @property + def name(self): + return 'Ragged[{subtype}]'.format(subtype=self.subtype) + + def __repr__(self): + return self.name @classmethod def construct_array_type(cls): @@ -119,11 +129,61 @@ def construct_array_type(cls): @classmethod def construct_from_string(cls, string): - if string == 'ragged': - return RaggedDtype() + # lowercase string + string = string.lower() + + msg = "Cannot construct a 'RaggedDtype' from '{}'" + if string.startswith('ragged'): + # Extract subtype + try: + subtype_string = cls._parse_subtype(string) + return RaggedDtype(dtype=subtype_string) + except Exception: + raise TypeError(msg.format(string)) + else: + raise TypeError(msg.format(string)) + + def __init__(self, dtype=np.float64): + if isinstance(dtype, RaggedDtype): + self._dtype = dtype.subtype else: - raise TypeError("Cannot construct a '{}' from '{}'" - .format(cls, string)) + self._dtype = np.dtype(dtype) + + @property + def subtype(self): + return self._dtype + + @classmethod + def _parse_subtype(cls, dtype_string): + """ + Parse a datatype string to get the subtype + + Parameters + ---------- + dtype_string: str + A string like Ragged[subtype] + + Returns + ------- + subtype: str + + Raises + ------ + ValueError + When the subtype cannot be extracted + """ + # Be case insensitive + dtype_string = dtype_string.lower() + + match = cls._subtype_re.match(dtype_string) + if match: + subtype_string = match.groupdict()['subtype'] + elif dtype_string == 'ragged': + subtype_string = 'float64' + else: + raise ValueError("Cannot parse {dtype_string}".format( + dtype_string=dtype_string)) + return subtype_string def missing(v): @@ -153,7 +213,7 @@ def __init__(self, data, dtype=None, copy=False): begins * RaggedArray: A RaggedArray instance to copy - dtype: np.dtype or str or None (default None) + dtype: RaggedDtype or np.dtype or str or None (default None) Datatype to use to store underlying values from data. If none (the default) then dtype will be determined using the numpy.result_type function. @@ -162,7 +222,6 @@ def __init__(self, data, dtype=None, copy=False): has type `dict` or `RaggedArray`. When data is a `list` or `array`, input arrays are always copied. """ - self._dtype = RaggedDtype() if (isinstance(data, dict) and all(k in data for k in ['start_indices', 'flat_array'])): @@ -173,13 +232,16 @@ def __init__(self, data, dtype=None, copy=False): self._start_indices = data['start_indices'] self._flat_array = data['flat_array'] + dtype = self._flat_array.dtype if copy: self._start_indices = self._start_indices.copy() self._flat_array = self._flat_array.copy() + elif isinstance(data, RaggedArray): self._flat_array = data.flat_array self._start_indices = data.start_indices + dtype = self._flat_array.dtype if copy: self._start_indices = self._start_indices.copy() @@ -206,6 +268,8 @@ def __init__(self, data, dtype=None, copy=False): dtype = np.result_type(*non_missing) else: dtype = 'float64' + elif isinstance(dtype, RaggedDtype): + dtype = dtype.subtype # Initialize representation arrays self._start_indices = np.zeros(index_len, dtype=start_indices_dtype) @@ -226,6 +290,8 @@ def __init__(self, data, dtype=None, copy=False): # increment next start index next_start_ind += n + self._dtype = RaggedDtype(dtype=dtype) + def __eq__(self, other): if isinstance(other, RaggedArray): if len(other) != len(self): @@ -375,7 +441,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): ------- RaggedArray """ - return RaggedArray(scalars) + return RaggedArray(scalars, dtype=dtype) @classmethod def _from_factorized(cls, values, original): diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index d4d583074..9aabb9759 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -12,7 +12,7 @@ def assert_ragged_arrays_equal(ra1, ra2): assert np.array_equal(ra1.start_indices, ra2.start_indices) assert np.array_equal(ra1.flat_array, ra2.flat_array) - assert np.array_equal(ra1.flat_array.dtype, ra2.flat_array.dtype) + assert ra1.flat_array.dtype == ra2.flat_array.dtype # Make sure ragged elements are equal when iterated over for a1, a2 in zip(ra1, ra2): @@ -24,7 +24,7 @@ def assert_ragged_arrays_equal(ra1, ra2): def test_construct_ragged_dtype(): dtype = RaggedDtype() assert dtype.type == np.ndarray - assert dtype.name == 'ragged' + assert dtype.name == 'Ragged[{subtype}]'.format(subtype=dtype.subtype) assert dtype.kind == 'O' @@ -384,35 +384,35 @@ def test_concat_same_type(): # ---------------------- def test_pandas_array_construction(): arg = [[0, 1], [1, 2, 3, 4], None, [-1, -2]] * 2 - ra = pd.array(arg, dtype='ragged') + ra = pd.array(arg, dtype='ragged[int64]') expected = RaggedArray(arg) assert_ragged_arrays_equal(ra, expected) def test_series_construction(): - arg = [[0, 1], [1, 2, 3, 4], None, [-1, -2]] * 2 - rs = pd.Series(arg, dtype='ragged') + arg = [[0, 1], [1.0, 2, 3.0, 4], None, [-1, -2]] * 2 + rs = pd.Series(arg, dtype='Ragged[int64]') ra = rs.array - expected = RaggedArray(arg) + expected = RaggedArray(arg, dtype='int64') assert_ragged_arrays_equal(ra, expected) def test_concat_series(): arg1 = [[1, 2], [], [10, 20], None, [11, 22, 33, 44]] - s1 = pd.Series(arg1, dtype='ragged') + s1 = pd.Series(arg1, dtype='ragged[int16]') arg2 = [[100, 200], None, [99, 100, 101]] - s2 = pd.Series(arg2, dtype='ragged') + s2 = pd.Series(arg2, dtype='ragged[int16]') arg3 = [None, [27, 28]] - s3 = pd.Series(arg3, dtype='ragged') + s3 = pd.Series(arg3, dtype='ragged[int16]') s_concat = pd.concat([s1, s2, s3]) expected = pd.Series(arg1+arg2+arg3, - dtype='ragged', + dtype='ragged[int16]', index=[0, 1, 2, 3, 4, 0, 1, 2, 0, 1]) pd.testing.assert_series_equal(s_concat, expected) From 32f4a3c975bbf9d57d81414c24112dd60a327552 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 18:52:45 -0500 Subject: [PATCH 23/45] Remove tuple conversions in RaggedElement --- datashader/datatypes.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 365fece9c..88f4d0738 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -89,8 +89,7 @@ def __init__(self, array): self.array = array def __hash__(self): - # TODO: Rewrite using self.array directly without tuple - return hash(tuple(self.array)) + return hash(self.array.tobytes()) def __eq__(self, other): if not isinstance(other, RaggedElement): @@ -101,7 +100,8 @@ def __lt__(self, other): # TODO: Rewrite using self.array directly without tuples if not isinstance(other, RaggedElement): return NotImplemented - return tuple(self.array) < tuple(other.array) + # return tuple(self.array) < tuple(other.array) + return _lexograph_lt(self.array, other.array) def __repr__(self): array_repr = repr(self.array) @@ -931,3 +931,26 @@ def _eq_ragged_ndarray2d(ra, a): result[i] = np.array_equal(flat_array[start_index:stop_index], a[i, :]) return result + + +def _lexograph_lt(a1, a2): + """ + Compare two 1D numpy arrays lexographically + Parameters + ---------- + a1: ndarray + 1D numpy array + a2: ndarray + 1D numpy array + + Returns + ------- + comparison: + True if a1 < a2, False otherwise + """ + for e1, e2 in zip(a1, a2): + if e1 < e2: + return True + elif e1 > e2: + return False + return len(a1) < len(a2) From 27403a7e1e6b0dcac300b75ee8f2bc1a1af6737d Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 18:53:23 -0500 Subject: [PATCH 24/45] Designate _RaggedElement as an internal class --- datashader/datatypes.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 88f4d0738..9f7693f2d 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -69,14 +69,14 @@ def _validate_ragged_properties(start_indices, flat_array): # Internal ragged element array wrapper that provides # equality, ordering, and hashing. @total_ordering -class RaggedElement(object): +class _RaggedElement(object): @staticmethod def ragged_or_nan(a): if np.isscalar(a) and np.isnan(a): return a else: - return RaggedElement(a) + return _RaggedElement(a) @staticmethod def array_or_nan(a): @@ -92,15 +92,13 @@ def __hash__(self): return hash(self.array.tobytes()) def __eq__(self, other): - if not isinstance(other, RaggedElement): + if not isinstance(other, _RaggedElement): return False return np.array_equal(self.array, other.array) def __lt__(self, other): - # TODO: Rewrite using self.array directly without tuples - if not isinstance(other, RaggedElement): + if not isinstance(other, _RaggedElement): return NotImplemented - # return tuple(self.array) < tuple(other.array) return _lexograph_lt(self.array, other.array) def __repr__(self): @@ -110,7 +108,6 @@ def __repr__(self): @register_extension_dtype class RaggedDtype(ExtensionDtype): - name = 'ragged' type = np.ndarray base = np.dtype('O') _subtype_re = re.compile(r"^ragged\[(?P\w+)\]$") @@ -461,11 +458,11 @@ def _from_factorized(cls, values, original): ExtensionArray.factorize """ return RaggedArray( - [RaggedElement.array_or_nan(v) for v in values], + [_RaggedElement.array_or_nan(v) for v in values], dtype=original.flat_array.dtype) def _as_ragged_element_array(self): - return np.array([RaggedElement.ragged_or_nan(self[i]) + return np.array([_RaggedElement.ragged_or_nan(self[i]) for i in range(len(self))]) def _values_for_factorize(self): @@ -486,7 +483,7 @@ def unique(self): uniques = unique(self._as_ragged_element_array()) return self._from_sequence( - [RaggedElement.array_or_nan(v) for v in uniques], + [_RaggedElement.array_or_nan(v) for v in uniques], dtype=self.dtype) def fillna(self, value=None, method=None, limit=None): @@ -654,7 +651,7 @@ def searchsorted(self, value, side="left", sorter=None): if isinstance(value, RaggedArray): search_value = value._as_ragged_element_array() else: - search_value = RaggedElement(value) + search_value = _RaggedElement(value) return arr.searchsorted(search_value, side=side, sorter=sorter) def isna(self): From e93c24dcd3f1651ab72c6d0e8b5e3b1c7b4566c1 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 19:43:57 -0500 Subject: [PATCH 25/45] numba jit utility functions --- datashader/datatypes.py | 129 ++++++++++++++++++++--------- datashader/tests/test_datatypes.py | 15 ++-- 2 files changed, 98 insertions(+), 46 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 9f7693f2d..c671891b0 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -2,6 +2,7 @@ from functools import total_ordering import numpy as np +from numba import jit from pandas.api.extensions import ( ExtensionDtype, ExtensionArray, register_extension_dtype) from numbers import Integral @@ -299,7 +300,9 @@ def __eq__(self, other): len_ra1=len(self), len_ra2=len(other))) - result = _eq_ragged_ragged(self, other) + result = _eq_ragged_ragged( + self.start_indices, self.flat_array, + other.start_indices, other.flat_array) else: # Convert other to numpy arrauy if not isinstance(other, np.ndarray): @@ -310,19 +313,22 @@ def __eq__(self, other): if other_array.ndim == 1 and other_array.dtype.kind != 'O': # Treat as ragged scalar - result = _eq_ragged_scalar(self, other_array) + result = _eq_ragged_scalar( + self.start_indices, self.flat_array, other_array) elif (other_array.ndim == 1 and other_array.dtype.kind == 'O' and len(other_array) == len(self)): # Treat as vector - result = _eq_ragged_ndarray1d(self, other_array) + result = _eq_ragged_ndarray1d( + self.start_indices, self.flat_array, other_array) elif (other_array.ndim == 2 and other_array.dtype.kind != 'O' and other_array.shape[0] == len(self)): # Treat rows as ragged elements - result = _eq_ragged_ndarray2d(self, other_array) + result = _eq_ragged_ndarray2d( + self.start_indices, self.flat_array, other_array) else: raise ValueError(""" Cannot check equality of RaggedArray of length {ra_len} with: @@ -791,14 +797,24 @@ def astype(self, dtype, copy=True): return np.array([v for v in self], dtype=dtype, copy=copy) -def _eq_ragged_ragged(ra1, ra2): +@jit(nopython=True, nogil=True) +def _eq_ragged_ragged(start_indices1, + flat_array1, + start_indices2, + flat_array2): """ Compare elements of two ragged arrays of the same length Parameters ---------- - ra1: RaggedArray - ra2: RaggedArray + start_indices1: ndarray + start indices of a RaggedArray 1 + flat_array1: ndarray + flat_array property of a RaggedArray 1 + start_indices2: ndarray + start indices of a RaggedArray 2 + flat_array2: ndarray + flat_array property of a RaggedArray 2 Returns ------- @@ -806,40 +822,50 @@ def _eq_ragged_ragged(ra1, ra2): 1D bool array of same length as inputs with elements True when corresponding elements are equal, False otherwise """ - start_indices1 = ra1.start_indices - flat_array1 = ra1.flat_array - - start_indices2 = ra2.start_indices - flat_array2 = ra2.flat_array - n = len(start_indices1) m1 = len(flat_array1) m2 = len(flat_array2) - result = np.zeros(n, dtype=np.bool) + result = np.zeros(n, dtype=np.bool_) for i in range(n): # Extract inds for ra1 start_index1 = start_indices1[i] stop_index1 = start_indices1[i + 1] if i < n - 1 else m1 + len_1 = stop_index1 - start_index1 # Extract inds for ra2 start_index2 = start_indices2[i] stop_index2 = start_indices2[i + 1] if i < n - 1 else m2 + len_2 = stop_index2 - start_index2 + + if len_1 != len_2: + el_equal = False + else: + el_equal = True + for flat_index1, flat_index2 in \ + zip(range(start_index1, stop_index1), + range(start_index2, stop_index2)): + el_1 = flat_array1[flat_index1] + el_2 = flat_array2[flat_index2] + el_equal &= el_1 == el_2 - result[i] = np.array_equal(flat_array1[start_index1:stop_index1], - flat_array2[start_index2:stop_index2]) + result[i] = el_equal return result -def _eq_ragged_scalar(ra, val): +@jit(nopython=True, nogil=True) +def _eq_ragged_scalar(start_indices, flat_array, val): """ Compare elements of a RaggedArray with a scalar array Parameters ---------- - ra: RaggedArray + start_indices: ndarray + start indices of a RaggedArray + flat_array: ndarray + flat_array property of a RaggedArray val: ndarray Returns @@ -848,27 +874,36 @@ def _eq_ragged_scalar(ra, val): 1D bool array of same length as inputs with elements True when ragged element equals scalar val, False otherwise. """ - start_indices = ra.start_indices - flat_array = ra.flat_array - n = len(start_indices) m = len(flat_array) - result = np.zeros(n, dtype=np.bool) + cols = len(val) + result = np.zeros(n, dtype=np.bool_) for i in range(n): start_index = start_indices[i] stop_index = start_indices[i+1] if i < n - 1 else m - result[i] = np.array_equal(flat_array[start_index:stop_index], val) + + if stop_index - start_index != cols: + el_equal = False + else: + el_equal = True + for val_index, flat_index in \ + enumerate(range(start_index, stop_index)): + el_equal &= flat_array[flat_index] == val[val_index] + result[i] = el_equal return result -def _eq_ragged_ndarray1d(ra, a): +def _eq_ragged_ndarray1d(start_indices, flat_array, a): """ Compare a RaggedArray with a 1D numpy object array of the same length Parameters ---------- - ra: RaggedArray + start_indices: ndarray + start indices of a RaggedArray + flat_array: ndarray + flat_array property of a RaggedArray a: ndarray 1D numpy array of same length as ra @@ -877,13 +912,16 @@ def _eq_ragged_ndarray1d(ra, a): mask: ndarray 1D bool array of same length as input with elements True when corresponding elements are equal, False otherwise + + Notes + ----- + This function is not numba accelerated because it, but design, inputs + a numpy object array """ - start_indices = ra.start_indices - flat_array = ra.flat_array n = len(start_indices) m = len(flat_array) - result = np.zeros(n, dtype=np.bool) + result = np.zeros(n, dtype=np.bool_) for i in range(n): start_index = start_indices[i] stop_index = start_indices[i + 1] if i < n - 1 else m @@ -899,13 +937,17 @@ def _eq_ragged_ndarray1d(ra, a): return result -def _eq_ragged_ndarray2d(ra, a): +@jit(nopython=True, nogil=True) +def _eq_ragged_ndarray2d(start_indices, flat_array, a): """ Compare a RaggedArray with rows of a 2D numpy object array Parameters ---------- - ra: RaggedArray + start_indices: ndarray + start indices of a RaggedArray + flat_array: ndarray + flat_array property of a RaggedArray a: ndarray A 2D numpy array where the length of the first dimension matches the length of the RaggedArray @@ -916,20 +958,29 @@ def _eq_ragged_ndarray2d(ra, a): 1D bool array of same length as input RaggedArray with elements True when corresponding elements of ra equals corresponding row of a """ - start_indices = ra.start_indices - flat_array = ra.flat_array - n = len(start_indices) m = len(flat_array) - result = np.zeros(n, dtype=np.bool) - for i in range(n): - start_index = start_indices[i] - stop_index = start_indices[i + 1] if i < n - 1 else m - result[i] = np.array_equal(flat_array[start_index:stop_index], - a[i, :]) + cols = a.shape[1] + + # np.bool is an alias for Python's built-in bool type, np.bool_ is the + # numpy type that numba recognizes + result = np.zeros(n, dtype=np.bool_) + for row in range(n): + start_index = start_indices[row] + stop_index = start_indices[row + 1] if row < n - 1 else m + + # Check equality + if stop_index - start_index != cols: + el_equal = False + else: + el_equal = True + for col, flat_index in enumerate(range(start_index, stop_index)): + el_equal &= flat_array[flat_index] == a[row, col] + result[row] = el_equal return result +@jit(nopython=True, nogil=True) def _lexograph_lt(a1, a2): """ Compare two 1D numpy arrays lexographically diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index 9aabb9759..90380f841 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -425,7 +425,7 @@ def test_concat_series(): ]) def test_array_eq_scalar(scalar): # Build RaggedArray - arg1 = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + arg1 = [[1, 2], [], [1, 2], [1, 3], [11, 22, 33, 44]] ra = RaggedArray(arg1, dtype='int32') # Check equality @@ -445,11 +445,12 @@ def test_array_eq_numpy1(): # Construct arrays ra = RaggedArray(arg1, dtype='int32') - npa = np.array(arg1, dtype='object') + npa = np.array([[1, 2], [2], [1, 2], None, [10, 20, 30, 40]], + dtype='object') # Check equality result = ra == npa - expected = np.array([1, 1, 1, 1, 1], dtype='bool') + expected = np.array([1, 0, 1, 1, 0], dtype='bool') np.testing.assert_array_equal(result, expected) # Check non-equality @@ -460,7 +461,7 @@ def test_array_eq_numpy1(): def test_array_eq_numpy2d(): # Construct arrays - ra = RaggedArray([[1, 2], [], [1, 2], None, [11, 22, 33, 44]], + ra = RaggedArray([[1, 2], [1], [1, 2], None, [33, 44]], dtype='int32') npa = np.array([[1, 2], [2, 3], [1, 2], [0, 1], [11, 22]], dtype='int32') @@ -478,16 +479,16 @@ def test_array_eq_numpy2d(): def test_array_eq_ragged(): # Build RaggedArray - arg1 = [[1, 2], [], [1, 2], None, [11, 22, 33, 44]] + arg1 = [[1, 2], [], [1, 2], [3, 2, 1], [11, 22, 33, 44]] ra1 = RaggedArray(arg1, dtype='int32') # Build RaggedArray - arg2 = [[1, 2], [2, 3, 4, 5], [1, 2], None, [11]] + arg2 = [[1, 2], [2, 3, 4, 5], [1, 2], [11, 22, 33], [11]] ra2 = RaggedArray(arg2, dtype='int32') # Check equality result = ra1 == ra2 - expected = np.array([1, 0, 1, 1, 0], dtype='bool') + expected = np.array([1, 0, 1, 0, 0], dtype='bool') np.testing.assert_array_equal(result, expected) # Check non-equality From 3fda78684b192e37fd1f83a1450d611bb3421e14 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 17 Jan 2019 20:08:11 -0500 Subject: [PATCH 26/45] Don't auto-import RaggedArray unless pandas is at least version 0.24.0 --- datashader/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/datashader/__init__.py b/datashader/__init__.py index 747f21161..0aeb61954 100644 --- a/datashader/__init__.py +++ b/datashader/__init__.py @@ -1,5 +1,7 @@ from __future__ import absolute_import +from distutils.version import LooseVersion + import param __version__ = str(param.version.Version(fpath=__file__, archive_commit="$Format:%h$",reponame="datashader")) @@ -15,8 +17,11 @@ except ImportError: pass -# Make ragged pandas extension array available -from . import datatypes +# Make RaggedArray pandas extension array available for +# pandas >= 0.24.0 is installed +from pandas import __version__ as pandas_version +if LooseVersion(pandas_version) >= LooseVersion('0.24.0'): + from . import datatypes # make pyct's example/data commands available if possible from functools import partial From 04453ce3ea9a797731e6e8b1b5d3f34126db1677 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sun, 20 Jan 2019 05:38:06 -0500 Subject: [PATCH 27/45] wrap _compute_*_bounds static methods with compute_*_bounds methods This allows subclasses to override how the DataFrame is used to compute the bounds --- datashader/dask.py | 4 ++-- datashader/glyphs.py | 18 ++++++++++++++++-- datashader/pandas.py | 4 ++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/datashader/dask.py b/datashader/dask.py index e39a8fecf..9babd1d22 100644 --- a/datashader/dask.py +++ b/datashader/dask.py @@ -31,8 +31,8 @@ def dask_pipeline(df, schema, canvas, glyph, summary): def shape_bounds_st_and_axis(df, canvas, glyph): - x_range = canvas.x_range or glyph._compute_x_bounds_dask(df) - y_range = canvas.y_range or glyph._compute_y_bounds_dask(df) + x_range = canvas.x_range or glyph.compute_x_bounds_dask(df) + y_range = canvas.y_range or glyph.compute_y_bounds_dask(df) x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range)) x_range, y_range = (x_min, x_max), (y_min, y_max) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 213c5e90f..a769aed23 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -27,6 +27,12 @@ def validate(self, in_dshape): elif not isreal(in_dshape.measure[self.y]): raise ValueError('y must be real') + def compute_x_bounds(self, df): + return self._compute_x_bounds(df[self.x].values) + + def compute_y_bounds(self, df): + return self._compute_y_bounds(df[self.y].values) + @staticmethod @ngjit def _compute_x_bounds(xs): @@ -68,7 +74,7 @@ def _compute_y_bounds(ys): return minval, maxval @memoize - def _compute_x_bounds_dask(self, df): + def compute_x_bounds_dask(self, df): """Like ``PointLike._compute_x_bounds``, but memoized because ``df`` is immutable/hashable (a Dask dataframe). """ @@ -85,7 +91,7 @@ def _compute_x_bounds_dask(self, df): @memoize - def _compute_y_bounds_dask(self, df): + def compute_y_bounds_dask(self, df): """Like ``PointLike._compute_y_bounds``, but memoized because ``df`` is immutable/hashable (a Dask dataframe). """ @@ -129,6 +135,14 @@ def validate(self, in_dshape): if not isreal(in_dshape.measure[col]): raise ValueError('{} must be real'.format(col)) + def compute_x_bounds(self, df): + xs = df[self.x].values + return self._compute_x_bounds(xs.reshape(np.prod(xs.shape))) + + def compute_y_bounds(self, df): + ys = df[self.y].values + return self._compute_y_bounds(ys.reshape(np.prod(ys.shape))) + class Point(_PointLike): """A point, with center at ``x`` and ``y``. diff --git a/datashader/pandas.py b/datashader/pandas.py index 191bbf38e..54f9ecc11 100644 --- a/datashader/pandas.py +++ b/datashader/pandas.py @@ -26,8 +26,8 @@ def pointlike(glyph, df, schema, canvas, summary): y_mapper = canvas.y_axis.mapper extend = glyph._build_extend(x_mapper, y_mapper, info, append) - x_range = canvas.x_range or glyph._compute_x_bounds(df[glyph.x].values) - y_range = canvas.y_range or glyph._compute_y_bounds(df[glyph.y].values) + x_range = canvas.x_range or glyph.compute_x_bounds(df) + y_range = canvas.y_range or glyph.compute_y_bounds(df) width = canvas.plot_width height = canvas.plot_height From 642a8581b122350f74c73d137d6f97db5ba084be Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sun, 20 Jan 2019 05:52:31 -0500 Subject: [PATCH 28/45] Small refactor to remove the need for a specialized _PolygonLike glyph_dispatch --- datashader/glyphs.py | 4 +++- datashader/pandas.py | 28 ---------------------------- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index a769aed23..375c3c65a 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -233,8 +233,10 @@ def _build_extend(self, x_mapper, y_mapper, info, append): draw_triangle, draw_triangle_interp = _build_draw_triangle(append) map_onto_pixel = _build_map_onto_pixel_for_triangle(x_mapper, y_mapper) extend_triangles = _build_extend_triangles(draw_triangle, draw_triangle_interp, map_onto_pixel) + weight_type = self.weight_type + interpolate = self.interpolate - def extend(aggs, df, vt, bounds, weight_type=True, interpolate=True): + def extend(aggs, df, vt, bounds, plot_start=True): cols = info(df) assert cols, 'There must be at least one column on which to aggregate' # mapped to pixels, then may be clipped diff --git a/datashader/pandas.py b/datashader/pandas.py index 54f9ecc11..e9000a1c8 100644 --- a/datashader/pandas.py +++ b/datashader/pandas.py @@ -42,31 +42,3 @@ def pointlike(glyph, df, schema, canvas, summary): extend(bases, df, x_st + y_st, x_range + y_range) return finalize(bases, coords=[y_axis, x_axis], dims=[glyph.y, glyph.x]) - - - -@glyph_dispatch.register(_PolygonLike) -def polygonlike(glyph, df, schema, canvas, summary): - create, info, append, _, finalize = compile_components(summary, schema, glyph) - x_mapper = canvas.x_axis.mapper - y_mapper = canvas.y_axis.mapper - extend = glyph._build_extend(x_mapper, y_mapper, info, append) - - xs = df[glyph.x].values - x_range = canvas.x_range or glyph._compute_x_bounds(xs.reshape(np.prod(xs.shape))) - ys = df[glyph.y].values - y_range = canvas.y_range or glyph._compute_y_bounds(ys.reshape(np.prod(ys.shape))) - - width = canvas.plot_width - height = canvas.plot_height - - x_st = canvas.x_axis.compute_scale_and_translate(x_range, width) - y_st = canvas.y_axis.compute_scale_and_translate(y_range, height) - - x_axis = canvas.x_axis.compute_index(x_st, width) - y_axis = canvas.y_axis.compute_index(y_st, height) - - bases = create((height, width)) - extend(bases, df, x_st + y_st, x_range + y_range, weight_type=glyph.weight_type, interpolate=glyph.interpolate) - - return finalize(bases, coords=[y_axis, x_axis], dims=[glyph.y, glyph.x]) From 97bccf51faee70d1abdb4de8cde547d4d7d746dc Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sun, 20 Jan 2019 06:29:38 -0500 Subject: [PATCH 29/45] Refactor to extract required_columns glyph method --- datashader/core.py | 7 +++---- datashader/glyphs.py | 7 ++++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/datashader/core.py b/datashader/core.py index 40c40beab..582189e68 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -541,10 +541,9 @@ def bypixel(source, canvas, glyph, agg): def _cols_to_keep(columns, glyph, agg): cols_to_keep = OrderedDict({col: False for col in columns}) - cols_to_keep[glyph.x] = True - cols_to_keep[glyph.y] = True - if hasattr(glyph, 'z'): - cols_to_keep[glyph.z[0]] = True + for col in glyph.required_columns(): + cols_to_keep[col] = True + if hasattr(agg, 'values'): for subagg in agg.values: if subagg.column is not None: diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 375c3c65a..e2b925b6a 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -27,6 +27,9 @@ def validate(self, in_dshape): elif not isreal(in_dshape.measure[self.y]): raise ValueError('y must be real') + def required_columns(self): + return [self.x, self.y] + def compute_x_bounds(self, df): return self._compute_x_bounds(df[self.x].values) @@ -88,7 +91,6 @@ def compute_x_bounds_dask(self, df): #print("No x range; defaulting to x-1,x+1") minval, maxval = minval-1, minval+1 return minval, maxval - @memoize def compute_y_bounds_dask(self, df): @@ -135,6 +137,9 @@ def validate(self, in_dshape): if not isreal(in_dshape.measure[col]): raise ValueError('{} must be real'.format(col)) + def required_columns(self): + return [self.x, self.y] + list(self.z) + def compute_x_bounds(self, df): xs = df[self.x].values return self._compute_x_bounds(xs.reshape(np.prod(xs.shape))) From 2860511fcc1ef571d02734a0f6c2c4919e027327 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sun, 20 Jan 2019 07:26:07 -0500 Subject: [PATCH 30/45] Initial cvs.lines and LinesXY glyph --- datashader/core.py | 39 +++++++++++++++++ datashader/glyphs.py | 78 +++++++++++++++++++++++++++++++++ datashader/pandas.py | 4 +- datashader/tests/test_pandas.py | 34 ++++++++++++++ 4 files changed, 154 insertions(+), 1 deletion(-) diff --git a/datashader/core.py b/datashader/core.py index 582189e68..ed258bf07 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -187,6 +187,45 @@ def line(self, source, x, y, agg=None): agg = any_rdn() return bypixel(source, self, Line(x, y), agg) + def lines(self, source, x, y, agg=None, + x_constant=None, y_constant=None): + """Compute a reduction by pixel, mapping each row of source to pixels + in a distinct line + + Parameters + ---------- + source : pandas.DataFrame, dask.DataFrame, or xarray.DataArray/Dataset + The input datasource. + x, y: str or list + The x and y coordinates defining the line segments. Either xs and + ys are both strings or both lists. + * str: The name of a RaggedArray column in source that + contains the x or y coordinates of the line segments. + * list: A list of the names of float or integer + columns that contains the x or y coordinates of + the line segment + agg : Reduction, optional + Reduction to compute. Default is ``any()``. + x_constant, y_constant: list or array of numbers + If xs is set to a list of column labels then y_constants may be + set to a list of numbers the same length as xs. These y + coordinates will be applied to every row. Similarly, if ys is + a list of column labels, x_constants may be set to a list of + numbers to specify the x coordinates to be applied to every line + segment. + + Exactly one of xs and x_constants may be specified and exactly + one of ys and y_constants may be specified. + """ + from .glyphs import LinesXY + from .reductions import any as any_rdn + if agg is None: + agg = any_rdn() + return bypixel(source, + self, + LinesXY(tuple(x), tuple(y)), + agg) + # TODO re 'untested', below: Consider replacing with e.g. a 3x3 # array in the call to Canvas (plot_height=3,plot_width=3), then # show the output as a numpy array that has a compact diff --git a/datashader/glyphs.py b/datashader/glyphs.py index e2b925b6a..fbe44b202 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -27,6 +27,14 @@ def validate(self, in_dshape): elif not isreal(in_dshape.measure[self.y]): raise ValueError('y must be real') + @property + def x_label(self): + return self.x + + @property + def y_label(self): + return self.y + def required_columns(self): return [self.x, self.y] @@ -225,6 +233,49 @@ def extend(aggs, df, vt, bounds, plot_start=True): return extend +class LinesXY(_PointLike): + def validate(self, in_dshape): + # TODO + pass + + def required_columns(self): + return self.x + self.y + + @property + def x_label(self): + return 'x' + + @property + def y_label(self): + return 'y' + + def compute_x_bounds(self, df): + # return self._compute_x_bounds(df[self.x].values) + raise NotImplementedError() + + def compute_y_bounds(self, df): + # return self._compute_y_bounds(df[self.y].values) + raise NotImplementedError() + + @memoize + def _build_extend(self, x_mapper, y_mapper, info, append): + draw_line = _build_draw_line(append) + map_onto_pixel = _build_map_onto_pixel_for_line(x_mapper, y_mapper) + extend_lines_xy = _build_extend_lines_xy(draw_line, map_onto_pixel) + x_names = self.x + y_names = self.y + + def extend(aggs, df, vt, bounds, plot_start=True): + xs = tuple(df[x_name].values for x_name in x_names) + ys = tuple(df[y_name].values for y_name in y_names) + + cols = aggs + info(df) + # line may be clipped, then mapped to pixels + extend_lines_xy(vt, bounds, xs, ys, plot_start, *cols) + + return extend + + class Triangles(_PolygonLike): """An unstructured mesh of triangles, with vertices defined by ``xs`` and ``ys``. @@ -465,6 +516,33 @@ def extend_line(vt, bounds, xs, ys, plot_start, *aggs_and_cols): return extend_line +def _build_extend_lines_xy(draw_line, map_onto_pixel): + extend_line = _build_extend_line(draw_line, map_onto_pixel) + + @ngjit + def extend_lines_xy(vt, bounds, xs, ys, plot_start, *aggs_and_cols): + """ + here xs and ys are tuples of arrays and non-empty + """ + cols = len(xs) + rows = len(xs[0]) + line_xs = np.zeros(cols, dtype=xs[0].dtype) + line_ys = np.zeros(cols, dtype=xs[0].dtype) + + for r in range(rows): + # populate line_xs/line_ys + for c in range(cols): + line_xs[c] = xs[c][r] + line_ys[c] = ys[c][r] + + # extend line + extend_line( + vt, bounds, line_xs, line_ys, plot_start, *aggs_and_cols) + + return extend_lines_xy + + + def _build_draw_triangle(append): """Specialize a triangle plotting kernel for a given append/axis combination""" @ngjit diff --git a/datashader/pandas.py b/datashader/pandas.py index e9000a1c8..dffe377d8 100644 --- a/datashader/pandas.py +++ b/datashader/pandas.py @@ -41,4 +41,6 @@ def pointlike(glyph, df, schema, canvas, summary): bases = create((height, width)) extend(bases, df, x_st + y_st, x_range + y_range) - return finalize(bases, coords=[y_axis, x_axis], dims=[glyph.y, glyph.x]) + return finalize(bases, + coords=[y_axis, x_axis], + dims=[glyph.y_label, glyph.x_label]) diff --git a/datashader/tests/test_pandas.py b/datashader/tests/test_pandas.py index b5e2a0057..239bff407 100644 --- a/datashader/tests/test_pandas.py +++ b/datashader/tests/test_pandas.py @@ -593,3 +593,37 @@ def test_bug_570(): yi, xi = np.where(agg.values == 1) assert np.array_equal(yi, np.arange(73, 300)) assert np.array_equal(xi, np.array([590] * len(yi))) + + +def test_lines_xy(): + axis = ds.core.LinearAxis() + lincoords = axis.compute_index(axis.compute_scale_and_translate((-3., 3.), 7), 7) + + df = pd.DataFrame({ + 'x0': [4, -4], + 'x1': [0, 0], + 'x2': [-4, 4], + 'y0': [0, 0], + 'y1': [-4, 4], + 'y2': [0, 0] + }) + + cvs = ds.Canvas(plot_width=7, plot_height=7, + x_range=(-3, 3), y_range=(-3, 3)) + + agg = cvs.lines(df, + ['x0', 'x1', 'x2'], + ['y0', 'y1', 'y2'], + ds.count()) + + sol = np.array([[0, 0, 1, 0, 1, 0, 0], + [0, 1, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 1], + [0, 1, 0, 0, 0, 1, 0], + [0, 0, 1, 0, 1, 0, 0]], dtype='i4') + + out = xr.DataArray(sol, coords=[lincoords, lincoords], + dims=['y', 'x']) + assert_eq(agg, out) \ No newline at end of file From d7cf092a2778b8e3fdb975a18302cdcf80d3982b Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sun, 20 Jan 2019 13:15:18 -0500 Subject: [PATCH 31/45] WIP of LinesRagged type --- datashader/core.py | 28 +++++----- datashader/glyphs.py | 94 +++++++++++++++++++++++++++++++++ datashader/tests/test_pandas.py | 30 ++++++++++- 3 files changed, 135 insertions(+), 17 deletions(-) diff --git a/datashader/core.py b/datashader/core.py index ed258bf07..3dc59aee6 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -7,6 +7,8 @@ from xarray import DataArray, Dataset from collections import OrderedDict +from datashader.datatypes import RaggedArray +from datashader.glyphs import LinesRagged from .utils import Dispatcher, ngjit, calc_res, calc_bbox, orient_array, compute_coords from .utils import get_indices, dshape_from_pandas, dshape_from_dask from .utils import Expr # noqa (API import) @@ -202,29 +204,23 @@ def lines(self, source, x, y, agg=None, * str: The name of a RaggedArray column in source that contains the x or y coordinates of the line segments. * list: A list of the names of float or integer - columns that contains the x or y coordinates of - the line segment + columns that contains the x or y coordinates of + the line segment agg : Reduction, optional Reduction to compute. Default is ``any()``. - x_constant, y_constant: list or array of numbers - If xs is set to a list of column labels then y_constants may be - set to a list of numbers the same length as xs. These y - coordinates will be applied to every row. Similarly, if ys is - a list of column labels, x_constants may be set to a list of - numbers to specify the x coordinates to be applied to every line - segment. - - Exactly one of xs and x_constants may be specified and exactly - one of ys and y_constants may be specified. """ from .glyphs import LinesXY from .reductions import any as any_rdn if agg is None: agg = any_rdn() - return bypixel(source, - self, - LinesXY(tuple(x), tuple(y)), - agg) + # TODO: Check inputs and make LinesRagged, LinesX, LinesY + + if isinstance(x, RaggedArray): + glyph = LinesRagged(x, y) + else: + glyph = LinesXY(tuple(x), tuple(y)) + + return bypixel(source, self, glyph, agg) # TODO re 'untested', below: Consider replacing with e.g. a 3x3 # array in the call to Canvas (plot_height=3,plot_width=3), then diff --git a/datashader/glyphs.py b/datashader/glyphs.py index fbe44b202..e99ad218c 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -276,6 +276,41 @@ def extend(aggs, df, vt, bounds, plot_start=True): return extend +class LinesRagged(_PointLike): + def validate(self, in_dshape): + # TODO + pass + + def required_columns(self): + return self.x + self.y + + def compute_x_bounds(self, df): + # return self._compute_x_bounds(df[self.x].values) + raise NotImplementedError() + + def compute_y_bounds(self, df): + # return self._compute_y_bounds(df[self.y].values) + raise NotImplementedError() + + @memoize + def _build_extend(self, x_mapper, y_mapper, info, append): + draw_line = _build_draw_line(append) + map_onto_pixel = _build_map_onto_pixel_for_line(x_mapper, y_mapper) + extend_lines_ragged = _build_extend_lines_ragged(draw_line, map_onto_pixel) + x_name = self.x + y_name = self.y + + def extend(aggs, df, vt, bounds, plot_start=True): + xs = df[x_name].array + ys = df[y_name].array + + cols = aggs + info(df) + # line may be clipped, then mapped to pixels + extend_lines_ragged(vt, bounds, xs, ys, plot_start, *cols) + + return extend + + class Triangles(_PolygonLike): """An unstructured mesh of triangles, with vertices defined by ``xs`` and ``ys``. @@ -542,6 +577,65 @@ def extend_lines_xy(vt, bounds, xs, ys, plot_start, *aggs_and_cols): return extend_lines_xy +def _build_extend_lines_ragged(draw_line, map_onto_pixel): + extend_line = _build_extend_line(draw_line, map_onto_pixel) + + def extend_lines_ragged(vt, bounds, xs, ys, plot_start, *aggs_and_cols): + """ + here xs and ys are tuples of arrays and non-empty + """ + x_start_indices = xs.start_indices + x_flat_array = xs.flat_array + + y_start_indices = ys.start_indices + y_flat_array = ys.flat_array + + perform_extend_lines_ragged(vt, + bounds, + x_start_indices, + x_flat_array, + y_start_indices, + y_flat_array, + plot_start, + *aggs_and_cols) + + # @ngjit + def perform_extend_lines_ragged(vt, + bounds, + x_start_indices, + x_flat_array, + y_start_indices, + y_flat_array, + plot_start, + *aggs_and_cols): + + x_flat_len = len(x_flat_array) + y_flat_len = len(y_flat_array) + + rows = len(x_start_indices) + for r in range(rows): + # Get x index range + x_start_index = x_start_indices[r] + x_stop_index = (x_start_indices[r + 1] + if r < rows - 1 + else x_flat_len) + + # Get y index range + y_start_index = y_start_indices[r] + y_stop_index = (y_start_indices[r + 1] + if r < rows - 1 + else y_flat_len) + + # Build line slices + line_xs = x_flat_array[x_start_index:x_stop_index] + line_ys = y_flat_array[y_start_index:y_stop_index] + + # Perform extend line + extend_line( + vt, bounds, line_xs, line_ys, plot_start, *aggs_and_cols) + + return extend_lines_ragged + def _build_draw_triangle(append): """Specialize a triangle plotting kernel for a given append/axis combination""" diff --git a/datashader/tests/test_pandas.py b/datashader/tests/test_pandas.py index 239bff407..99458b71a 100644 --- a/datashader/tests/test_pandas.py +++ b/datashader/tests/test_pandas.py @@ -626,4 +626,32 @@ def test_lines_xy(): out = xr.DataArray(sol, coords=[lincoords, lincoords], dims=['y', 'x']) - assert_eq(agg, out) \ No newline at end of file + assert_eq(agg, out) + + +def test_lines_ragged(): + axis = ds.core.LinearAxis() + lincoords = axis.compute_index( + axis.compute_scale_and_translate((-3., 3.), 7), 7) + + df = pd.DataFrame({ + 'x': pd.array([[4, -4], [-4, 4, -4, 4]], dtype='Ragged[float32]'), + 'y': pd.array([[0, 0], [-4, 4, 0, 0]], dtype='Ragged[float32]') + }) + + cvs = ds.Canvas(plot_width=7, plot_height=7, + x_range=(-3, 3), y_range=(-3, 3)) + + agg = cvs.lines(df, 'x', 'y', ds.count()) + + sol = np.array([[0, 0, 1, 0, 1, 0, 0], + [0, 1, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 1], + [0, 1, 0, 0, 0, 1, 0], + [0, 0, 1, 0, 1, 0, 0]], dtype='i4') + + out = xr.DataArray(sol, coords=[lincoords, lincoords], + dims=['y', 'x']) + assert_eq(agg, out) From ea08fd1984846e8e739c58e367c61b4090a1632b Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 7 Feb 2019 18:34:38 -0500 Subject: [PATCH 32/45] Remove unused canvas.lines method --- datashader/core.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/datashader/core.py b/datashader/core.py index a56e9c70b..cbaedd36a 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -294,38 +294,6 @@ def line(self, source, x, y, agg=None, axis=0): return bypixel(source, self, glyph, agg) - def lines(self, source, x, y, agg=None, - x_constant=None, y_constant=None): - """Compute a reduction by pixel, mapping each row of source to pixels - in a distinct line - - Parameters - ---------- - source : pandas.DataFrame, dask.DataFrame, or xarray.DataArray/Dataset - The input datasource. - x, y: str or list - The x and y coordinates defining the line segments. Either xs and - ys are both strings or both lists. - * str: The name of a RaggedArray column in source that - contains the x or y coordinates of the line segments. - * list: A list of the names of float or integer - columns that contains the x or y coordinates of - the line segment - agg : Reduction, optional - Reduction to compute. Default is ``any()``. - """ - from .glyphs import LinesXY - from .reductions import any as any_rdn - if agg is None: - agg = any_rdn() - # TODO: Check inputs and make LinesRagged, LinesX, LinesY - - if isinstance(x, RaggedArray): - glyph = LinesRagged(x, y) - else: - glyph = LinesXY(tuple(x), tuple(y)) - - return bypixel(source, self, glyph, agg) # TODO re 'untested', below: Consider replacing with e.g. a 3x3 # array in the call to Canvas (plot_height=3,plot_width=3), then From 1b02b0d73edbd1eb8ce59a8622c9504485dafb09 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 8 Feb 2019 07:58:37 -0500 Subject: [PATCH 33/45] Add RaggedArray line aggregation support for pandas --- datashader/core.py | 40 +++++++++++++++---- datashader/glyphs.py | 70 ++++++++++++++++++++------------- datashader/tests/test_pandas.py | 45 ++++++++++++++------- datashader/utils.py | 4 ++ 4 files changed, 110 insertions(+), 49 deletions(-) diff --git a/datashader/core.py b/datashader/core.py index cbaedd36a..911987780 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -10,8 +10,6 @@ from xarray import DataArray, Dataset from collections import OrderedDict -from datashader.datatypes import RaggedArray -from datashader.glyphs import LinesRagged from .utils import Dispatcher, ngjit, calc_res, calc_bbox, orient_array, compute_coords from .utils import get_indices, dshape_from_pandas, dshape_from_dask from .utils import Expr # noqa (API import) @@ -202,6 +200,7 @@ def line(self, source, x, y, agg=None, axis=0): Define a canvas and a pandas DataFrame with 6 rows >>> import pandas as pd # doctest: +SKIP ... import numpy as np + ... import datashader as ds ... from datashader import Canvas ... import datashader.transfer_functions as tf ... cvs = Canvas() @@ -214,23 +213,23 @@ def line(self, source, x, y, agg=None, axis=0): Aggregate one line across all rows, with coordinates df.A1 by df.B1 >>> agg = cvs.line(df, x='A1', y='B1', axis=0) # doctest: +SKIP - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) Aggregate two lines across all rows. The first with coordinates df.A1 by df.B1 and the second with coordinates df.A2 by df.B2 >>> agg = cvs.line(df, x=['A1', 'A2'], y=['B1', 'B2'], axis=0) # doctest: +SKIP - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) Aggregate two lines across all rows where the lines share the same x coordinates. The first line will have coordinates df.A1 by df.B1 and the second will have coordinates df.A1 by df.B2 >>> agg = cvs.line(df, x='A1', y=['B1', 'B2'], axis=0) # doctest: +SKIP - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) Aggregate 6 length-2 lines, one per row, where the ith line has coordinates [df.A1[i], df.A2[i]] by [df.B1[i], df.B2[i]] >>> agg = cvs.line(df, x=['A1', 'A2'], y=['B1', 'B2'], axis=1) # doctest: +SKIP - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) Aggregate 6 length-4 lines, one per row, where the x coordinates of every line are [0, 1, 2, 3] and the y coordinates of the ith line @@ -239,10 +238,32 @@ def line(self, source, x, y, agg=None, axis=0): ... x=np.arange(4), ... y=['A1', 'A2', 'B1', 'B2'], ... axis=1) - ... tf.shade(agg) + ... tf.spread(tf.shade(agg)) + + Aggregate RaggedArrays of variable length lines, one per row + (requires pandas >= 0.24.0) + >>> df_ragged = pd.DataFrame({ # doctest: +SKIP + ... 'A1': pd.array([ + ... [1, 1.5], [2, 2.5, 3], [1.5, 2, 3, 4], [3.2, 4, 5]], + ... dtype='Ragged[float32]'), + ... 'B1': pd.array([ + ... [10, 12], [11, 14, 13], [10, 7, 9, 10], [7, 8, 12]], + ... dtype='Ragged[float32]'), + ... 'group': pd.Categorical([0, 1, 2, 1]) + ... }) + ... + ... agg = cvs.line(df_ragged, x='A1', y='B1', axis=1) + ... tf.spread(tf.shade(agg)) + + Aggregate RaggedArrays of variable length lines by group column, + one per row (requires pandas >= 0.24.0) + >>> agg = cvs.line(df_ragged, x='A1', y='B1', # doctest: +SKIP + ... agg=ds.count_cat('group'), axis=1) + ... tf.spread(tf.shade(agg)) """ from .glyphs import (LineAxis0, LinesAxis1, LinesAxis1XConstant, - LinesAxis1YConstant, LineAxis0Multi) + LinesAxis1YConstant, LineAxis0Multi, + LinesAxis1Ragged) from .reductions import any as any_rdn if agg is None: agg = any_rdn() @@ -278,6 +299,9 @@ def line(self, source, x, y, agg=None, axis=0): elif (isinstance(x, (list, tuple)) and isinstance(y, np.ndarray)): glyph = LinesAxis1YConstant(tuple(x), y) + elif (isinstance(x, (Number, string_types)) and + isinstance(y, (Number, string_types))): + glyph = LinesAxis1Ragged(x, y) else: raise ValueError(""" Invalid combination of x and y arguments to Canvas.line when axis=1. diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 6e5a04b8b..5d5ac4eac 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -469,7 +469,7 @@ def extend(aggs, df, vt, bounds, plot_start=True): return extend -class LinesRagged(_PointLike): +class LinesAxis1Ragged(_PointLike): def validate(self, in_dshape): # TODO pass @@ -478,18 +478,18 @@ def required_columns(self): return self.x + self.y def compute_x_bounds(self, df): - # return self._compute_x_bounds(df[self.x].values) - raise NotImplementedError() + bounds = self._compute_x_bounds(df[self.x].array.flat_array) + return self.maybe_expand_bounds(bounds) def compute_y_bounds(self, df): - # return self._compute_y_bounds(df[self.y].values) - raise NotImplementedError() + bounds = self._compute_y_bounds(df[self.y].array.flat_array) + return self.maybe_expand_bounds(bounds) @memoize def _build_extend(self, x_mapper, y_mapper, info, append): draw_line = _build_draw_line(append) map_onto_pixel = _build_map_onto_pixel_for_line(x_mapper, y_mapper) - extend_lines_ragged = _build_extend_lines_ragged(draw_line, map_onto_pixel) + extend_lines_ragged = _build_extend_line_axis1_ragged(draw_line, map_onto_pixel) x_name = self.x y_name = self.y @@ -882,13 +882,9 @@ def extend_line(vt, bounds, xs, ys, plot_start, *aggs_and_cols): return extend_line -def _build_extend_lines_ragged(draw_line, map_onto_pixel): - extend_line = _build_extend_line(draw_line, map_onto_pixel) +def _build_extend_line_axis1_ragged(draw_line, map_onto_pixel): - def extend_lines_ragged(vt, bounds, xs, ys, plot_start, *aggs_and_cols): - """ - here xs and ys are tuples of arrays and non-empty - """ + def extend_line(vt, bounds, xs, ys, plot_start, *aggs_and_cols): x_start_indices = xs.start_indices x_flat_array = xs.flat_array @@ -914,32 +910,52 @@ def perform_extend_lines_ragged(vt, plot_start, *aggs_and_cols): + nrows = len(x_start_indices) x_flat_len = len(x_flat_array) y_flat_len = len(y_flat_array) - rows = len(x_start_indices) - for r in range(rows): + i = 0 + while i < nrows: + plot_start = True + # Get x index range - x_start_index = x_start_indices[r] - x_stop_index = (x_start_indices[r + 1] - if r < rows - 1 + x_start_index = x_start_indices[i] + x_stop_index = (x_start_indices[i + 1] + if i < nrows - 1 else x_flat_len) # Get y index range - y_start_index = y_start_indices[r] - y_stop_index = (y_start_indices[r + 1] - if r < rows - 1 + y_start_index = y_start_indices[i] + y_stop_index = (y_start_indices[i + 1] + if i < nrows - 1 else y_flat_len) - # Build line slices - line_xs = x_flat_array[x_start_index:x_stop_index] - line_ys = y_flat_array[y_start_index:y_stop_index] + # Find line segment length as shorter of the two segments + segment_len = min(x_stop_index - x_start_index, + y_stop_index - y_start_index) + + j = 0 + while j < segment_len - 1: + + x0 = x_flat_array[x_start_index + j] + y0 = y_flat_array[y_start_index + j] + x1 = x_flat_array[x_start_index + j + 1] + y1 = y_flat_array[y_start_index + j + 1] - # Perform extend line - extend_line( - vt, bounds, line_xs, line_ys, plot_start, *aggs_and_cols) + x0, x1, y0, y1, skip, clipped, plot_start = \ + _skip_or_clip(x0, x1, y0, y1, bounds, plot_start) - return extend_lines_ragged + if not skip: + x0i, y0i = map_onto_pixel(vt, bounds, x0, y0) + x1i, y1i = map_onto_pixel(vt, bounds, x1, y1) + draw_line(x0i, y0i, x1i, y1i, i, plot_start, clipped, + *aggs_and_cols) + plot_start = False + + j += 1 + i += 1 + + return extend_line def _build_draw_triangle(append): diff --git a/datashader/tests/test_pandas.py b/datashader/tests/test_pandas.py index 4770afd54..0bb3acc50 100644 --- a/datashader/tests/test_pandas.py +++ b/datashader/tests/test_pandas.py @@ -642,6 +642,12 @@ def test_bug_570(): 'y0': [-4, 0, 4], 'y1': [-4, 0, 4], }), ['x0', 'x1'], 'y0', 0), + + # axis1 ragged arrays + (pd.DataFrame({ + 'x': pd.array([[4, 0], [0, -4, 0, 4]], dtype='Ragged[float32]'), + 'y': pd.array([[0, -4], [-4, 0, 4, 0]], dtype='Ragged[float32]') + }), 'x', 'y', 1) ]) def test_line_manual_range(df, x, y, ax): axis = ds.core.LinearAxis() @@ -709,6 +715,12 @@ def test_line_manual_range(df, x, y, ax): 'y0': [-4, 0, 4], 'y1': [-4, 0, 4], }), ['x0', 'x1'], 'y0', 0), + + # axis1 ragged arrays + (pd.DataFrame({ + 'x': pd.array([[0, -4, 0], [0, 4, 0]], dtype='Ragged[float32]'), + 'y': pd.array([[-4, 0, 4], [-4, 0, 4]], dtype='Ragged[float32]') + }), 'x', 'y', 1) ]) def test_line_autorange(df, x, y, ax): axis = ds.core.LinearAxis() @@ -810,28 +822,33 @@ def test_line_agg_sum_axis1_none_constant(): assert_eq(agg, out) -def test_lines_ragged(): +def test_line_autorange_axis1_ragged(): axis = ds.core.LinearAxis() lincoords = axis.compute_index( - axis.compute_scale_and_translate((-3., 3.), 7), 7) + axis.compute_scale_and_translate((-4., 4.), 9), 9) df = pd.DataFrame({ - 'x': pd.array([[4, -4], [-4, 4, -4, 4]], dtype='Ragged[float32]'), - 'y': pd.array([[0, 0], [-4, 4, 0, 0]], dtype='Ragged[float32]') + 'x': pd.array([[4, 0], [0, -4, 0, 4]], dtype='Ragged[float32]'), + 'y': pd.array([[0, -4], [-4, 0, 4, 0]], dtype='Ragged[float32]') }) - cvs = ds.Canvas(plot_width=7, plot_height=7, - x_range=(-3, 3), y_range=(-3, 3)) + cvs = ds.Canvas(plot_width=9, plot_height=9) - agg = cvs.lines(df, 'x', 'y', ds.count()) + agg = cvs.line(df, + 'x', + 'y', + ds.count(), + axis=1) - sol = np.array([[0, 0, 1, 0, 1, 0, 0], - [0, 1, 0, 0, 0, 1, 0], - [1, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 1], - [0, 1, 0, 0, 0, 1, 0], - [0, 0, 1, 0, 1, 0, 0]], dtype='i4') + sol = np.array([[0, 0, 0, 0, 2, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 1, 0, 0], + [0, 1, 0, 0, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 2], + [0, 1, 0, 0, 0, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype='i4') out = xr.DataArray(sol, coords=[lincoords, lincoords], dims=['y', 'x']) diff --git a/datashader/utils.py b/datashader/utils.py index 4e44f293e..802be9276 100644 --- a/datashader/utils.py +++ b/datashader/utils.py @@ -12,6 +12,8 @@ import dask.dataframe as dd import datashape +from datashader.datatypes import RaggedDtype + ngjit = nb.jit(nopython=True, nogil=True) @@ -369,6 +371,8 @@ def dshape_from_pandas_helper(col): # Pandas stores this as a pytz.tzinfo, but DataShape wants a string tz = str(tz) return datashape.Option(datashape.DateTime(tz=tz)) + elif isinstance(col.dtype, RaggedDtype): + return col.dtype dshape = datashape.CType.from_numpy_dtype(col.dtype) dshape = datashape.string if dshape == datashape.object_ else dshape if dshape in (datashape.string, datashape.datetime_): From 23143110c2054ba7ca1e58b6e50d7bdb14bb0cda Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 8 Feb 2019 08:19:55 -0500 Subject: [PATCH 34/45] Dask ragged array support --- datashader/datatypes.py | 14 ++++++++++++++ datashader/glyphs.py | 18 ++++++++++++++++++ datashader/tests/test_dask.py | 12 ++++++++++++ 3 files changed, 44 insertions(+) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index c671891b0..6ddca8396 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -10,6 +10,12 @@ from pandas.api.types import pandas_dtype from pandas.core.dtypes.common import is_extension_array_dtype +try: + # See if we can register extension type with dask >= 1.1.0 + from dask.dataframe.extensions import make_array_nonempty +except ImportError: + make_array_nonempty = None + def _validate_ragged_properties(start_indices, flat_array): """ @@ -1002,3 +1008,11 @@ def _lexograph_lt(a1, a2): elif e1 > e2: return False return len(a1) < len(a2) + + +def ragged_array_non_empty(dtype): + return RaggedArray([[1], [1, 2]], dtype=dtype) + + +if make_array_nonempty: + make_array_nonempty.register(RaggedDtype)(ragged_array_non_empty) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 5d5ac4eac..b41378e54 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -485,6 +485,24 @@ def compute_y_bounds(self, df): bounds = self._compute_y_bounds(df[self.y].array.flat_array) return self.maybe_expand_bounds(bounds) + @memoize + def compute_x_bounds_dask(self, df): + """Like ``PointLike._compute_x_bounds``, but memoized because + ``df`` is immutable/hashable (a Dask dataframe). + """ + xs = df[self.x].compute().array.flat_array + minval, maxval = np.nanmin(xs), np.nanmax(xs) + return self.maybe_expand_bounds((minval, maxval)) + + @memoize + def compute_y_bounds_dask(self, df): + """Like ``PointLike._compute_y_bounds``, but memoized because + ``df`` is immutable/hashable (a Dask dataframe). + """ + ys = df[self.y].compute().array.flat_array + minval, maxval = np.nanmin(ys), np.nanmax(ys) + return self.maybe_expand_bounds((minval, maxval)) + @memoize def _build_extend(self, x_mapper, y_mapper, info, append): draw_line = _build_draw_line(append) diff --git a/datashader/tests/test_dask.py b/datashader/tests/test_dask.py index 2c0780766..e45b1a6ef 100644 --- a/datashader/tests/test_dask.py +++ b/datashader/tests/test_dask.py @@ -324,6 +324,12 @@ def test_line(): 'y1': [0, 4, 0], 'y2': [0, 0, 0] }), 'x0', ['y0', 'y1', 'y2'], 0), + + # axis1 RaggedArray + (pd.DataFrame({ + 'x': [[4, 0, -4], [-4, 0, 4, 4, 0, -4]], + 'y': [[0, -4, 0], [0, 4, 0, 0, 0, 0]], + }, dtype='Ragged[int64]'), 'x', 'y', 1), ]) def test_line_manual_range(df, x, y, ax): axis = ds.core.LinearAxis() @@ -391,6 +397,12 @@ def test_line_manual_range(df, x, y, ax): 'y0': [-4, 0, 4] }), ['x0', 'x1', 'x2'], 'y0', 0), + # axis1 RaggedArray + (pd.DataFrame({ + 'x': [[0, -4, 0], [0, 0, 0], [0, 4, 0]], + 'y': [[-4, 0, 4], [4, 0, -4], [-4, 0, 4]], + }, dtype='Ragged[int64]'), 'x', 'y', 1), + ]) def test_line_autorange(df, x, y, ax): axis = ds.core.LinearAxis() From f4a40ebd193393dd618ced7de4f7fadd54549a8a Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 8 Feb 2019 08:23:21 -0500 Subject: [PATCH 35/45] flake8 --- datashader/__init__.py | 2 +- datashader/datatypes.py | 1 - datashader/tests/test_datatypes.py | 6 +++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/datashader/__init__.py b/datashader/__init__.py index 0aeb61954..da620d0a9 100644 --- a/datashader/__init__.py +++ b/datashader/__init__.py @@ -21,7 +21,7 @@ # pandas >= 0.24.0 is installed from pandas import __version__ as pandas_version if LooseVersion(pandas_version) >= LooseVersion('0.24.0'): - from . import datatypes + from . import datatypes # noqa (API import) # make pyct's example/data commands available if possible from functools import partial diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 6ddca8396..709567f2e 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -525,7 +525,6 @@ def fillna(self, value=None, method=None, limit=None): filled : ExtensionArray with NA/NaN filled """ # Override in RaggedArray to handle ndarray fill values - from pandas.api.types import is_array_like from pandas.util._validators import validate_fillna_kwargs from pandas.core.missing import pad_1d, backfill_1d diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index 90380f841..e770046bc 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -240,8 +240,8 @@ def test_get_item_scalar(): @pytest.mark.parametrize('index', [-1000, -6, 5, 1000]) def test_get_item_scalar_out_of_bounds(index): rarray = RaggedArray([[1, 2], [], [10, 20, 30], None, [11, 22, 33, 44]]) - with pytest.raises(IndexError) as e: - result = rarray[index] + with pytest.raises(IndexError): + rarray[index] def test_get_item_slice(): @@ -511,7 +511,7 @@ def test_equality_validation(other): # invalid scalar with pytest.raises(ValueError, match="Cannot check equality"): - res = ra1 == other + ra1 == other # Pandas-provided extension array tests From 59b0b3a5c620231f5a036e17acc3025507f1a9aa Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 8 Feb 2019 08:37:38 -0500 Subject: [PATCH 36/45] Add validation for LinesAxis1Ragged --- datashader/glyphs.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index b526d7140..95e56e7c6 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -472,8 +472,15 @@ def extend(aggs, df, vt, bounds, plot_start=True): class LinesAxis1Ragged(_PointLike): def validate(self, in_dshape): - # TODO - pass + try: + from datashader.datatypes import RaggedDtype + except ImportError: + RaggedDtype = type(None) + + if not isinstance(in_dshape[str(self.x)], RaggedDtype): + raise ValueError('x must be a RaggedArray') + elif not isinstance(in_dshape[str(self.x)], RaggedDtype): + raise ValueError('y must be a RaggedArray') def required_columns(self): return self.x + self.y From c48429e328e3a58103a1b8f26b2290dbf6ef9a1e Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 8 Feb 2019 08:38:05 -0500 Subject: [PATCH 37/45] Exception handling on import for pandas < 0.24 --- datashader/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datashader/utils.py b/datashader/utils.py index 614b14572..fb660b17b 100644 --- a/datashader/utils.py +++ b/datashader/utils.py @@ -12,7 +12,10 @@ import dask.dataframe as dd import datashape -from datashader.datatypes import RaggedDtype +try: + from datashader.datatypes import RaggedDtype +except ImportError: + RaggedDtype = type(None) ngjit = nb.jit(nopython=True, nogil=True) From cdecd85a372a2935c5cff2db1c7cf1380660ffc2 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 8 Feb 2019 08:41:28 -0500 Subject: [PATCH 38/45] Add pandas >=0.24.1 as testing dependency so that we can test RaggedArray support --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index a3fa674cb..d3a7b8685 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ 'flake8', 'nbsmoke >=0.2.6', 'fastparquet >=0.1.6', # optional dependency + 'pandas >=0.24.1', # optional ragged array support ], 'examples': [], 'examples_extra':[ From 7c8b953241813070e3809e557ef07dabe48798de Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 8 Feb 2019 09:46:32 -0500 Subject: [PATCH 39/45] absolute import --- datashader/datatypes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 709567f2e..48eefed19 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import re from functools import total_ordering From c846f0c2b4d898e6fce3c4a3e9878b98df55beca Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 8 Feb 2019 19:32:20 -0500 Subject: [PATCH 40/45] specify that int lists should cast to int64 numpy arrays To address AppVeyor failures --- datashader/tests/test_datatypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index e770046bc..cb0520638 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -182,7 +182,7 @@ def test_start_indices_dtype(): @pytest.mark.parametrize('arg,expected', [ - ([[1, 2]], 'int64'), + ([np.array([1, 2], dtype='int64')], 'int64'), ([[True], [False, True]], 'bool'), (np.array([np.array([1, 2], dtype='int8'), np.array([1, 2], dtype='int32')]), 'int32'), @@ -386,7 +386,7 @@ def test_pandas_array_construction(): arg = [[0, 1], [1, 2, 3, 4], None, [-1, -2]] * 2 ra = pd.array(arg, dtype='ragged[int64]') - expected = RaggedArray(arg) + expected = RaggedArray(arg, dtype='int64') assert_ragged_arrays_equal(ra, expected) From cad7d0a7fbea06b394005172b9456008df21274a Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 23 Feb 2019 19:26:19 -0500 Subject: [PATCH 41/45] Remove parameterized args from skipped tests No reason to skip every combination, and this was causing pytest-xdist to throw an internal error when running tests in parallel --- datashader/tests/test_datatypes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/datashader/tests/test_datatypes.py b/datashader/tests/test_datatypes.py index cb0520638..3af4b5a50 100644 --- a/datashader/tests/test_datatypes.py +++ b/datashader/tests/test_datatypes.py @@ -712,9 +712,9 @@ def test_array_interface(self, data): class TestRaggedMethods(eb.BaseMethodsTests): - # AttributeError: 'RaggedArray' object has no attribute 'value_counts' + # # AttributeError: 'RaggedArray' object has no attribute 'value_counts' @pytest.mark.skip(reason="value_counts not supported") - def test_value_counts(self, all_data, dropna): + def test_value_counts(self): pass # Add array equality @@ -732,26 +732,26 @@ def test_unique(self, data, box, method): # Pandas raises # ValueError: invalid fill value with a @pytest.mark.skip(reason="pandas cannot fill with ndarray") - def test_fillna_copy_frame(self, data_missing): + def test_fillna_copy_frame(self): pass @pytest.mark.skip(reason="pandas cannot fill with ndarray") - def test_fillna_copy_series(self, data_missing): + def test_fillna_copy_series(self): pass # Ragged array elements don't support binary operators @pytest.mark.skip(reason="ragged does not support <= on elements") - def test_combine_le(self, data_repeated): + def test_combine_le(self): pass @pytest.mark.skip(reason="ragged does not support + on elements") - def test_combine_add(self, data_repeated): + def test_combine_add(self): pass # Block manager error: # ValueError: setting an array element with a sequence. @pytest.mark.skip(reason="combine_first not supported") - def test_combine_first(self, data): + def test_combine_first(self): pass @@ -764,11 +764,11 @@ class TestRaggedMissing(eb.BaseMissingTests): # Errors like: # ValueError: invalid fill value with a @pytest.mark.skip(reason="Can't fill with ndarray") - def test_fillna_series(self, data_missing): + def test_fillna_series(self): pass @pytest.mark.skip(reason="Can't fill with ndarray") - def test_fillna_frame(self, data_missing): + def test_fillna_frame(self): pass From 89d1d51172ab1106edc869dbb92d79888853a87f Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 23 Feb 2019 19:27:30 -0500 Subject: [PATCH 42/45] Add Dask optimized bounds calculations for ragged list glyph --- datashader/glyphs.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 03b437195..605f10c5d 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -505,22 +505,20 @@ def compute_y_bounds(self, df): return self.maybe_expand_bounds(bounds) @memoize - def compute_x_bounds_dask(self, df): - """Like ``PointLike._compute_x_bounds``, but memoized because - ``df`` is immutable/hashable (a Dask dataframe). - """ - xs = df[self.x].compute().array.flat_array - minval, maxval = np.nanmin(xs), np.nanmax(xs) - return self.maybe_expand_bounds((minval, maxval)) + def compute_bounds_dask(self, ddf): - @memoize - def compute_y_bounds_dask(self, df): - """Like ``PointLike._compute_y_bounds``, but memoized because - ``df`` is immutable/hashable (a Dask dataframe). - """ - ys = df[self.y].compute().array.flat_array - minval, maxval = np.nanmin(ys), np.nanmax(ys) - return self.maybe_expand_bounds((minval, maxval)) + r = ddf.map_partitions(lambda df: np.array([[ + np.nanmin(df[self.x].array.flat_array), + np.nanmax(df[self.x].array.flat_array), + np.nanmin(df[self.y].array.flat_array), + np.nanmax(df[self.y].array.flat_array)]] + )).compute() + + x_extents = np.nanmin(r[:, 0]), np.nanmax(r[:, 1]) + y_extents = np.nanmin(r[:, 2]), np.nanmax(r[:, 3]) + + return (self.maybe_expand_bounds(x_extents), + self.maybe_expand_bounds(y_extents)) @memoize def _build_extend(self, x_mapper, y_mapper, info, append): From 92eaab2123f61fb76f628148d9f43008ba89fba3 Mon Sep 17 00:00:00 2001 From: "James A. Bednar" Date: Thu, 28 Feb 2019 05:51:14 -0500 Subject: [PATCH 43/45] Apply suggestions from code review Co-Authored-By: jonmmease --- datashader/datatypes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 48eefed19..9d8e3c356 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -20,7 +20,7 @@ def _validate_ragged_properties(start_indices, flat_array): """ - Validate that start_indices are flat_array arrays may be used to + Validate that start_indices are flat_array arrays that may be used to represent a valid RaggedArray. Parameters @@ -456,14 +456,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): @classmethod def _from_factorized(cls, values, original): """ - Reconstruct an ExtensionArray after factorization. + Reconstruct a RaggedArray after factorization. Parameters ---------- values : ndarray An integer ndarray with the factorized values. original : RaggedArray - The original ExtensionArray that factorize was called on. + The original RaggedArray that factorize was called on. See Also -------- @@ -921,7 +921,7 @@ def _eq_ragged_ndarray1d(start_indices, flat_array, a): Notes ----- - This function is not numba accelerated because it, but design, inputs + This function is not numba accelerated because it, by design, inputs a numpy object array """ @@ -962,7 +962,7 @@ def _eq_ragged_ndarray2d(start_indices, flat_array, a): ------- mask: ndarray 1D bool array of same length as input RaggedArray with elements True - when corresponding elements of ra equals corresponding row of a + when corresponding elements of ra equal corresponding row of `a` """ n = len(start_indices) m = len(flat_array) From 1538909bd15721f7184fc0228a59733cf2a2bbf4 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 28 Feb 2019 06:09:56 -0500 Subject: [PATCH 44/45] Refer to parent docstrings rather than duplicate --- datashader/datatypes.py | 236 ++++++---------------------------------- 1 file changed, 35 insertions(+), 201 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 9d8e3c356..7e1d348dc 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -123,6 +123,9 @@ class RaggedDtype(ExtensionDtype): @property def name(self): + """ + See docstring for ExtensionDtype.name + """ return 'Ragged[{subtype}]'.format(subtype=self.subtype) def __repr__(self): @@ -130,10 +133,16 @@ def __repr__(self): @classmethod def construct_array_type(cls): + """ + See docstring for ExtensionDtype.construct_array_type + """ return RaggedArray @classmethod def construct_from_string(cls, string): + """ + See docstring for ExtensionDtype.construct_from_string + """ # lowercase string string = string.lower() @@ -372,25 +381,13 @@ def start_indices(self): def __len__(self): """ - Length of this array - - Returns - ------- - length : int + See docstring for ExtensionArray.__len__ """ return len(self._start_indices) def __getitem__(self, item): """ - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + See docstring for ExtensionArray.__getitem__ """ if isinstance(item, Integral): if item < -len(self) or item >= len(self): @@ -434,41 +431,14 @@ def __getitem__(self, item): @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): """ - Construct a new RaggedArray from a sequence of scalars. - - Parameters - ---------- - scalars : Sequence - Each element will be an instance of the scalar type for this - array, ``cls.dtype.type``. - dtype : dtype, optional - Construct for this particular dtype. This should be a Dtype - compatible with the ExtensionArray. - copy : boolean, default False - If True, copy the underlying data. - - Returns - ------- - RaggedArray + See docstring for ExtensionArray._from_sequence """ return RaggedArray(scalars, dtype=dtype) @classmethod def _from_factorized(cls, values, original): """ - Reconstruct a RaggedArray after factorization. - - Parameters - ---------- - values : ndarray - An integer ndarray with the factorized values. - original : RaggedArray - The original RaggedArray that factorize was called on. - - See Also - -------- - pandas.factorize - ExtensionArray.factorize + See docstring for ExtensionArray._from_factorized """ return RaggedArray( [_RaggedElement.array_or_nan(v) for v in values], @@ -479,18 +449,20 @@ def _as_ragged_element_array(self): for i in range(len(self))]) def _values_for_factorize(self): + """ + See docstring for ExtensionArray._values_for_factorize + """ return self._as_ragged_element_array(), np.nan def _values_for_argsort(self): + """ + See docstring for ExtensionArray._values_for_argsort + """ return self._as_ragged_element_array() def unique(self): """ - Compute the ExtensionArray of unique values. - - Returns - ------- - uniques : ExtensionArray + See docstring for ExtensionArray.unique """ from pandas import unique @@ -501,29 +473,7 @@ def unique(self): def fillna(self, value=None, method=None, limit=None): """ - Fill NA/NaN values using the specified method. - - Parameters - ---------- - value : scalar, array-like - If a scalar value is passed it is used to fill all missing values. - Alternatively, an array-like 'value' can be given. It's expected - that the array-like have the same length as 'self'. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. - - Returns - ------- - filled : ExtensionArray with NA/NaN filled + See docstring for ExtensionArray.fillna """ # Override in RaggedArray to handle ndarray fill values from pandas.util._validators import validate_fillna_kwargs @@ -560,37 +510,7 @@ def fillna(self, value=None, method=None, limit=None): def shift(self, periods=1, fill_value=None): # type: (int, object) -> ExtensionArray """ - Shift values by desired number. - - Newly introduced missing values are filled with - ``self.dtype.na_value``. - - .. versionadded:: 0.24.0 - - Parameters - ---------- - periods : int, default 1 - The number of periods to shift. Negative values are allowed - for shifting backwards. - - fill_value : object, optional - The scalar value to use for newly introduced missing values. - The default is ``self.dtype.na_value`` - - .. versionadded:: 0.24.0 - - Returns - ------- - shifted : ExtensionArray - - Notes - ----- - If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is - returned. - - If ``periods > len(self)``, then an array of size - len(self) is returned, with all values filled with - ``self.dtype.na_value``. + See docstring for ExtensionArray.shift """ # Override in RaggedArray to handle ndarray fill values @@ -616,49 +536,8 @@ def shift(self, periods=1, fill_value=None): def searchsorted(self, value, side="left", sorter=None): """ - Find indices where elements should be inserted to maintain order. - - .. versionadded:: 0.24.0 - - Find the indices into a sorted array `self` (a) such that, if the - corresponding elements in `v` were inserted before the indices, the - order of `self` would be preserved. - - Assuming that `a` is sorted: - - ====== ============================ - `side` returned index `i` satisfies - ====== ============================ - left ``self[i-1] < v <= self[i]`` - right ``self[i-1] <= v < self[i]`` - ====== ============================ - - Parameters - ---------- - value : array_like - Values to insert into `self`. - side : {'left', 'right'}, optional - If 'left', the index of the first suitable location found is given. - If 'right', return the last such index. If there is no suitable - index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array_like, optional - Optional array of integer indices that sort array a into ascending - order. They are typically the result of argsort. - - Returns - ------- - indices : array of ints - Array of insertion points with the same shape as `value`. - - See Also - -------- - numpy.searchsorted : Similar method from NumPy. - """ - # Note: the base tests provided by pandas only test the basics. - # We do not test - # 1. Values outside the range of the `data_for_sorting` fixture - # 2. Values between the values in the `data_for_sorting` fixture - # 3. Missing values. + See docstring for ExtensionArray.searchsorted + """ arr = self._as_ragged_element_array() if isinstance(value, RaggedArray): search_value = value._as_ragged_element_array() @@ -668,13 +547,7 @@ def searchsorted(self, value, side="left", sorter=None): def isna(self): """ - A 1-D array indicating if each value is missing. - - Returns - ------- - na_values : np.ndarray - boolean ndarray the same length as the ragged array where values - of True represent missing/NA values. + See docstring for ExtensionArray.isna """ stop_indices = np.hstack([self.start_indices[1:], [len(self.flat_array)]]) @@ -684,34 +557,7 @@ def isna(self): def take(self, indices, allow_fill=False, fill_value=None): """ - Take elements from an array. - - Parameters - ---------- - indices : sequence of integers - Indices to be taken. - allow_fill : bool, default False - How to handle negative values in `indices`. - - * False: negative values in `indices` indicate positional indices - from the right (the default). This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - fill_value : any, default None - Fill value to use for NA-indices when `allow_fill` is True. - - Returns - ------- - RaggedArray - - Raises - ------ - IndexError - When the indices are out of bounds for the array. + See docstring for ExtensionArray.take """ if allow_fill: invalid_inds = [i for i in indices if i < -1] @@ -731,16 +577,7 @@ def take(self, indices, allow_fill=False, fill_value=None): def copy(self, deep=False): """ - Return a copy of the array. - - Parameters - ---------- - deep : bool, default False - Also copy the underlying data backing this array. - - Returns - ------- - RaggedArray + See docstring for ExtensionArray.copy """ data = dict( flat_array=self.flat_array, @@ -751,15 +588,7 @@ def copy(self, deep=False): @classmethod def _concat_same_type(cls, to_concat): """ - Concatenate multiple RaggedArray instances - - Parameters - ---------- - to_concat : list of RaggedArray - - Returns - ------- - RaggedArray + See docstring for ExtensionArray._concat_same_type """ # concat flat_arrays flat_array = np.hstack(ra.flat_array for ra in to_concat) @@ -778,18 +607,23 @@ def _concat_same_type(cls, to_concat): @property def dtype(self): + """ + See docstring for ExtensionArray.dtype + """ return self._dtype @property def nbytes(self): """ - The number of bytes needed to store this object in memory. + See docstring for ExtensionArray.nbytes """ return (self._flat_array.nbytes + self._start_indices.nbytes) def astype(self, dtype, copy=True): - + """ + See docstring for ExtensionArray.astype + """ dtype = pandas_dtype(dtype) if isinstance(dtype, RaggedDtype): if copy: From c42f0df68dcfbf8715358194bba11a394f788c86 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 1 Mar 2019 07:27:12 -0500 Subject: [PATCH 45/45] Remove docstring references --- datashader/datatypes.py | 73 +++++++---------------------------------- 1 file changed, 12 insertions(+), 61 deletions(-) diff --git a/datashader/datatypes.py b/datashader/datatypes.py index 7e1d348dc..c552860bd 100644 --- a/datashader/datatypes.py +++ b/datashader/datatypes.py @@ -116,6 +116,12 @@ def __repr__(self): @register_extension_dtype class RaggedDtype(ExtensionDtype): + """ + Pandas ExtensionDtype to represent a ragged array datatype + + Methods not otherwise documented here are inherited from ExtensionDtype; + please see the corresponding method on that class for the docstring + """ type = np.ndarray base = np.dtype('O') _subtype_re = re.compile(r"^ragged\[(?P\w+)\]$") @@ -123,9 +129,6 @@ class RaggedDtype(ExtensionDtype): @property def name(self): - """ - See docstring for ExtensionDtype.name - """ return 'Ragged[{subtype}]'.format(subtype=self.subtype) def __repr__(self): @@ -133,16 +136,10 @@ def __repr__(self): @classmethod def construct_array_type(cls): - """ - See docstring for ExtensionDtype.construct_array_type - """ return RaggedArray @classmethod def construct_from_string(cls, string): - """ - See docstring for ExtensionDtype.construct_from_string - """ # lowercase string string = string.lower() @@ -205,6 +202,12 @@ def missing(v): class RaggedArray(ExtensionArray): + """ + Pandas ExtensionArray to represent ragged arrays + + Methods not otherwise documented here are inherited from ExtensionArray; + please see the corresponding method on that class for the docstring + """ def __init__(self, data, dtype=None, copy=False): """ Construct a RaggedArray @@ -380,15 +383,9 @@ def start_indices(self): return self._start_indices def __len__(self): - """ - See docstring for ExtensionArray.__len__ - """ return len(self._start_indices) def __getitem__(self, item): - """ - See docstring for ExtensionArray.__getitem__ - """ if isinstance(item, Integral): if item < -len(self) or item >= len(self): raise IndexError("{item} is out of bounds".format(item=item)) @@ -430,16 +427,10 @@ def __getitem__(self, item): @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): - """ - See docstring for ExtensionArray._from_sequence - """ return RaggedArray(scalars, dtype=dtype) @classmethod def _from_factorized(cls, values, original): - """ - See docstring for ExtensionArray._from_factorized - """ return RaggedArray( [_RaggedElement.array_or_nan(v) for v in values], dtype=original.flat_array.dtype) @@ -449,21 +440,12 @@ def _as_ragged_element_array(self): for i in range(len(self))]) def _values_for_factorize(self): - """ - See docstring for ExtensionArray._values_for_factorize - """ return self._as_ragged_element_array(), np.nan def _values_for_argsort(self): - """ - See docstring for ExtensionArray._values_for_argsort - """ return self._as_ragged_element_array() def unique(self): - """ - See docstring for ExtensionArray.unique - """ from pandas import unique uniques = unique(self._as_ragged_element_array()) @@ -472,9 +454,6 @@ def unique(self): dtype=self.dtype) def fillna(self, value=None, method=None, limit=None): - """ - See docstring for ExtensionArray.fillna - """ # Override in RaggedArray to handle ndarray fill values from pandas.util._validators import validate_fillna_kwargs from pandas.core.missing import pad_1d, backfill_1d @@ -508,10 +487,6 @@ def fillna(self, value=None, method=None, limit=None): return new_values def shift(self, periods=1, fill_value=None): - # type: (int, object) -> ExtensionArray - """ - See docstring for ExtensionArray.shift - """ # Override in RaggedArray to handle ndarray fill values # Note: this implementation assumes that `self.dtype.na_value` can be @@ -535,9 +510,6 @@ def shift(self, periods=1, fill_value=None): return self._concat_same_type([a, b]) def searchsorted(self, value, side="left", sorter=None): - """ - See docstring for ExtensionArray.searchsorted - """ arr = self._as_ragged_element_array() if isinstance(value, RaggedArray): search_value = value._as_ragged_element_array() @@ -546,9 +518,6 @@ def searchsorted(self, value, side="left", sorter=None): return arr.searchsorted(search_value, side=side, sorter=sorter) def isna(self): - """ - See docstring for ExtensionArray.isna - """ stop_indices = np.hstack([self.start_indices[1:], [len(self.flat_array)]]) @@ -556,9 +525,6 @@ def isna(self): return element_lengths == 0 def take(self, indices, allow_fill=False, fill_value=None): - """ - See docstring for ExtensionArray.take - """ if allow_fill: invalid_inds = [i for i in indices if i < -1] if invalid_inds: @@ -576,9 +542,6 @@ def take(self, indices, allow_fill=False, fill_value=None): return RaggedArray(sequence, dtype=self.flat_array.dtype) def copy(self, deep=False): - """ - See docstring for ExtensionArray.copy - """ data = dict( flat_array=self.flat_array, start_indices=self.start_indices) @@ -587,9 +550,6 @@ def copy(self, deep=False): @classmethod def _concat_same_type(cls, to_concat): - """ - See docstring for ExtensionArray._concat_same_type - """ # concat flat_arrays flat_array = np.hstack(ra.flat_array for ra in to_concat) @@ -607,23 +567,14 @@ def _concat_same_type(cls, to_concat): @property def dtype(self): - """ - See docstring for ExtensionArray.dtype - """ return self._dtype @property def nbytes(self): - """ - See docstring for ExtensionArray.nbytes - """ return (self._flat_array.nbytes + self._start_indices.nbytes) def astype(self, dtype, copy=True): - """ - See docstring for ExtensionArray.astype - """ dtype = pandas_dtype(dtype) if isinstance(dtype, RaggedDtype): if copy: