diff --git a/RELEASE.rst b/RELEASE.rst
index 981fa5bed257d..5db564176959e 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -22,6 +22,42 @@ Where to get it
 * Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 * Documentation: http://pandas.pydata.org
 
+pandas 0.10.2
+=============
+
+**Release date:** 2013-??-??
+
+**New features**
+
+  - Allow mixed dtypes (e.g. ``float32/float64/int32/int16/int8``) to coexist in DataFrames and propagate in operations
+
+**Improvements to existing features**
+
+  - added ``blocks`` attribute to DataFrames, to return a dict of dtypes to homogeneously dtyped DataFrames
+  - added keyword ``convert_numeric`` to ``convert_objects()`` to try to convert object dtypes to numeric types
+  - ``convert_dates`` in ``convert_objects`` can now be ``coerce``, which will return a datetime64[ns] dtype
+    with non-convertibles set as ``NaT``; an all-nan object column (e.g. strings) will be preserved
+  - Series print output now includes the dtype by default
+
+**API Changes**
+
+  - Do not automatically upcast numeric specified dtypes to ``int64`` or ``float64`` (GH622_ and GH797_)
+  - Guarantee that ``convert_objects()`` for Series/DataFrame always returns a copy
+  - groupby operations will respect dtypes for numeric float operations (float32/float64); other types will be operated on,
+    and will try to cast back to the input dtype (e.g. if an int is passed, then as long as the output doesn't have nans
+    an int will be returned)
+  - backfill/pad/take/diff/ohlc will now support ``float32/int16/int8`` operations
+  - Integer block types will upcast as needed in where operations (GH2793_)
+
+**Bug Fixes**
+
+  - Fix segfault on empty DataFrame when calling fillna with ``pad`` or ``backfill`` (GH2778_)
+
+.. _GH622: https://github.com/pydata/pandas/issues/622
+.. _GH797: https://github.com/pydata/pandas/issues/797
+.. _GH2778: https://github.com/pydata/pandas/issues/2778
+.. _GH2793: https://github.com/pydata/pandas/issues/2793
+
 pandas 0.10.1
 =============
 
@@ -36,6 +72,7 @@ pandas 0.10.1
   - Restored inplace=True behavior returning self (same object) with
     deprecation warning until 0.11 (GH1893_)
   - ``HDFStore``
+
     - refactored HFDStore to deal with non-table stores as objects, will
       allow future enhancements
     - removed keyword ``compression`` from ``put`` (replaced by keyword
      ``complib`` to be consistent across library)
@@ -49,7 +86,7 @@ pandas 0.10.1
     - support data column indexing and selection, via ``data_columns``
      keyword in append
     - support write chunking to reduce memory footprint, via ``chunksize``
      keyword to append
-    - support automagic indexing via ``index`` keywork to append
+    - support automagic indexing via ``index`` keyword to append
     - support ``expectedrows`` keyword in append to inform ``PyTables`` about
      the expected tablesize
     - support ``start`` and ``stop`` keywords in select to limit the row
diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
index 362ef8ef7d7fb..6919f67db5b78 100644
--- a/doc/source/dsintro.rst
+++ b/doc/source/dsintro.rst
@@ -450,15 +450,101 @@ DataFrame:
    df.xs('b')
    df.ix[2]
 
-Note if a DataFrame contains columns of multiple dtypes, the dtype of the row
-will be chosen to accommodate all of the data types (dtype=object is the most
-general).
-
 For a more exhaustive treatment of more sophisticated label-based indexing and
 slicing, see the :ref:`section on indexing <indexing>`. We will address the
 fundamentals of reindexing / conforming to new sets of lables in the
 :ref:`section on reindexing <basics.reindexing>`.
 
+DataTypes
+~~~~~~~~~
+
+.. _dsintro.column_types:
+
+The main types stored in pandas objects are float, int, boolean, datetime64[ns],
+and object. A convenient ``dtypes`` attribute returns a Series with the data type of
+each column.
+
+.. ipython:: python
+
+   df['integer'] = 1
+   df['int32'] = df['integer'].astype('int32')
+   df['float32'] = Series([1.0]*len(df),dtype='float32')
+   df['timestamp'] = Timestamp('20010102')
+   df.dtypes
+
+If a DataFrame contains columns of multiple dtypes, the dtype of the column
+will be chosen to accommodate all of the data types (dtype=object is the most
+general).
+
+The related method ``get_dtype_counts`` will return the number of columns of
+each type:
+
+.. ipython:: python
+
+   df.get_dtype_counts()
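+
+The new ``blocks`` attribute (added in this release; see ``DataFrame.as_blocks``)
+goes a step further and returns the same information as a dict of dtype ->
+homogeneously-typed DataFrame. A small sketch, using the frame built above
+(the exact keys depend on the dtypes your frame holds):
+
+.. ipython:: python
+
+   blks = df.blocks
+   sorted(blks.keys())
+   blks['float32']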
+
+Numeric dtypes will propagate and can coexist in DataFrames (starting in v0.10.2).
+If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``,
+or a passed ``Series``), then it will be preserved in DataFrame operations. Furthermore,
+different numeric dtypes will **NOT** be combined. The following example will give you a taste.
+
+.. ipython:: python
+
+   df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32')
+   df1
+   df1.dtypes
+   df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'),
+                         B = Series(randn(8)),
+                         C = Series(np.array(randn(8),dtype='uint8')) ))
+   df2
+   df2.dtypes
+
+   # here you get some upcasting
+   df3 = df1.reindex_like(df2).fillna(value=0.0) + df2
+   df3
+   df3.dtypes
+
+   # this is lower-common-denominator upcasting (meaning you get the dtype which can accommodate all of the types)
+   df3.values.dtype
+
+Upcasting is always according to the **numpy** rules. If two different dtypes are
+involved in an operation, then the more *general* one will be used as the result
+of the operation.
+
+DataType Conversion
+~~~~~~~~~~~~~~~~~~~
+
+You can use the ``astype`` method to convert dtypes from one to another. These *always*
+return a copy. In addition, ``convert_objects`` will attempt a *soft* conversion of any
+*object* dtypes, meaning that if all the objects in a Series are of the same type, the
+Series will have that dtype.
+
+.. ipython:: python
+
+   df3
+   df3.dtypes
+
+   # conversion of dtypes
+   df3.astype('float32').dtypes
+
+To force numeric conversion of object-dtype values, pass ``convert_numeric = True``.
+This will force strings and numbers alike to be numbers if possible; otherwise they will be set to ``np.nan``.
+To force conversion to ``datetime64[ns]``, pass ``convert_dates = 'coerce'``.
+This will convert any datetime-like object to dates, forcing other values to ``NaT``.
+
+.. ipython:: python
+
+   # mixed type conversions
+   df3['D'] = '1.'
+   df3['E'] = '1'
+   df3.convert_objects(convert_numeric=True).dtypes
+
+   # same, but specific dtype conversion
+   df3['D'] = df3['D'].astype('float16')
+   df3['E'] = df3['E'].astype('int32')
+   df3.dtypes
+
+   # forcing date coercion
+   s = Series([datetime(2001,1,1,0,0), 'foo', 1.0, 1, Timestamp('20010104'), '20010105'],dtype='O')
+   s
+   s.convert_objects(convert_dates='coerce')
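+
+``convert_numeric`` coerces in the same way on the numeric side; a small sketch
+(the Series below is purely illustrative data):
+
+.. ipython:: python
+
+   s2 = Series(['1', '2.5', 'foo'])
+   s2.convert_objects(convert_numeric=True)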
+
 Data alignment and arithmetic
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -633,26 +719,6 @@ You can also disable this feature via the ``expand_frame_repr`` option:
 
    reset_option('expand_frame_repr')
 
-DataFrame column types
-~~~~~~~~~~~~~~~~~~~~~~
-
-.. _dsintro.column_types:
-
-The four main types stored in pandas objects are float, int, boolean, and
-object. A convenient ``dtypes`` attribute return a Series with the data type of
-each column:
-
-.. ipython:: python
-
-   baseball.dtypes
-
-The related method ``get_dtype_counts`` will return the number of columns of
-each type:
-
-.. ipython:: python
-
-   baseball.get_dtype_counts()
-
 DataFrame column attribute access and IPython completion
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index 33c5db2d24102..969173d0d3569 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -304,6 +304,34 @@ so that the original data can be modified without creating a copy:
 
    df.mask(df >= 0)
 
+Upcasting Gotchas
+~~~~~~~~~~~~~~~~~
+
+Performing indexing operations on ``integer`` type data can easily upcast the data to ``floating``.
+The dtype of the input data will be preserved in cases where ``nans`` are not introduced (coming soon).
+
+.. ipython:: python
+
+   dfi = df.astype('int32')
+   dfi['E'] = 1
+   dfi
+   dfi.dtypes
+
+   casted = dfi[dfi>0]
+   casted
+   casted.dtypes
+
+Float dtypes, however, are unchanged.
+
+.. ipython:: python
+
+   df2 = df.copy()
+   df2['A'] = df2['A'].astype('float32')
+   df2.dtypes
+
+   casted = df2[df2>0]
+   casted
+   casted.dtypes
 
 Take Methods
 ~~~~~~~~~~~~
 
diff --git a/doc/source/v0.10.2.txt b/doc/source/v0.10.2.txt
new file mode 100644
index 0000000000000..d87cf86d56864
--- /dev/null
+++ b/doc/source/v0.10.2.txt
@@ -0,0 +1,95 @@
+.. _whatsnew_0102:
+
+v0.10.2 (February ??, 2013)
+---------------------------
+
+This is a minor release from 0.10.1 and includes many new features and
+enhancements along with a large number of bug fixes. There are also a number of
+important API changes that long-time pandas users should pay close attention
+to.
+
+API changes
+~~~~~~~~~~~
+
+Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed
+(either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed
+``Series``), then it will be preserved in DataFrame operations. Furthermore,
+different numeric dtypes will **NOT** be combined. The following example will
+give you a taste.
+
+**Dtype Specification**
+
+.. ipython:: python
+
+   df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32')
+   df1
+   df1.dtypes
+   df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), B = Series(randn(8)), C = Series(randn(8),dtype='uint8') ))
+   df2
+   df2.dtypes
+
+   # here you get some upcasting
+   df3 = df1.reindex_like(df2).fillna(value=0.0) + df2
+   df3
+   df3.dtypes
+
+**Dtype conversion**
+
+.. ipython:: python
+
+   # this is lower-common-denominator upcasting (meaning you get the dtype which can accommodate all of the types)
+   df3.values.dtype
+
+   # conversion of dtypes
+   df3.astype('float32').dtypes
+
+   # mixed type conversions
+   df3['D'] = '1.'
+   df3['E'] = '1'
+   df3.convert_objects(convert_numeric=True).dtypes
+
+   # same, but specific dtype conversion
+   df3['D'] = df3['D'].astype('float16')
+   df3['E'] = df3['E'].astype('int32')
+   df3.dtypes
+
+   # forcing date coercion
+   s = Series([datetime(2001,1,1,0,0), 'foo', 1.0, 1,
+               Timestamp('20010104'), '20010105'],dtype='O')
+   s.convert_objects(convert_dates='coerce')
+
+**Upcasting Gotchas**
+
+Performing indexing operations on integer type data can easily upcast the data.
+The dtype of the input data will be preserved in cases where ``nans`` are not introduced (coming soon).
+
+.. ipython:: python
+
+   dfi = df3.astype('int32')
+   dfi['D'] = dfi['D'].astype('int64')
+   dfi
+   dfi.dtypes
+
+   casted = dfi[dfi>0]
+   casted
+   casted.dtypes
+
+Float dtypes, however, are unchanged.
+
+.. 
ipython:: python + + df4 = df3.copy() + df4['A'] = df4['A'].astype('float32') + df4.dtypes + + casted = df4[df4>0] + casted + casted.dtypes + +New features +~~~~~~~~~~~~ + +**Enhancements** + +**Bug Fixes** + +See the `full release notes +`__ or issue tracker +on GitHub for a complete list. + diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 6c125c45a2599..646610ecccd88 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -16,6 +16,8 @@ What's New These are new features and improvements of note in each release. +.. include:: v0.10.2.txt + .. include:: v0.10.1.txt .. include:: v0.10.0.txt diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 0d7006f08111b..40c8cabe3cb9a 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -12,23 +12,35 @@ cimport util from libc.stdlib cimport malloc, free +from numpy cimport NPY_INT8 as NPY_int8 +from numpy cimport NPY_INT16 as NPY_int16 from numpy cimport NPY_INT32 as NPY_int32 from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT16 as NPY_float16 from numpy cimport NPY_FLOAT32 as NPY_float32 from numpy cimport NPY_FLOAT64 as NPY_float64 +int8 = np.dtype(np.int8) +int16 = np.dtype(np.int16) int32 = np.dtype(np.int32) int64 = np.dtype(np.int64) +float16 = np.dtype(np.float16) float32 = np.dtype(np.float32) float64 = np.dtype(np.float64) +cdef np.int8_t MINint8 = np.iinfo(np.int8).min +cdef np.int16_t MINint16 = np.iinfo(np.int16).min cdef np.int32_t MINint32 = np.iinfo(np.int32).min cdef np.int64_t MINint64 = np.iinfo(np.int64).min +cdef np.float16_t MINfloat16 = np.NINF cdef np.float32_t MINfloat32 = np.NINF cdef np.float64_t MINfloat64 = np.NINF +cdef np.int8_t MAXint8 = np.iinfo(np.int8).max +cdef np.int16_t MAXint16 = np.iinfo(np.int16).max cdef np.int32_t MAXint32 = np.iinfo(np.int32).max cdef np.int64_t MAXint64 = np.iinfo(np.int64).max +cdef np.float16_t MAXfloat16 = np.inf cdef np.float32_t MAXfloat32 = np.inf cdef np.float64_t MAXfloat64 = np.inf @@ -615,141 +627,6 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', # return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def diff_2d_float64(ndarray[float64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.wraparound(False) -@cython.boundscheck(False) -def diff_2d_int64(ndarray[int64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in 
range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def diff_2d_int32(ndarray[int64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - # Cython implementations of rolling sum, mean, variance, skewness, # other statistical moment functions # @@ -1931,161 +1808,9 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): return result, counts # TODO: aggregate multiple columns in single pass - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_add(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_prod(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - 
nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = prodx[i, j] - #---------------------------------------------------------------------- # first, nth, last -@cython.boundscheck(False) -@cython.wraparound(False) -def group_nth(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - @cython.boundscheck(False) @cython.wraparound(False) def group_nth_object(ndarray[object, ndim=2] out, @@ -2130,52 +1855,6 @@ def group_nth_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_nth_bin(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins, int64_t rank): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] resx, nobs - - nobs = np.zeros_like(out) - resx = np.empty_like(out) - - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - @cython.boundscheck(False) @cython.wraparound(False) def group_nth_bin_object(ndarray[object, ndim=2] out, @@ -2224,47 +1903,6 @@ def group_nth_bin_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] -@cython.boundscheck(False) -@cython.wraparound(False) -def group_last(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - @cython.boundscheck(False) @cython.wraparound(False) def group_last_object(ndarray[object, ndim=2] out, @@ -2307,52 +1945,6 @@ def 
group_last_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_last_bin(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] resx, nobs - - nobs = np.zeros_like(out) - resx = np.empty_like(out) - - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - @cython.boundscheck(False) @cython.wraparound(False) def group_last_bin_object(ndarray[object, ndim=2] out, @@ -2400,183 +1992,15 @@ def group_last_bin_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] -#---------------------------------------------------------------------- -# group_min, group_max -@cython.boundscheck(False) -@cython.wraparound(False) -def group_min(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] minx, nobs - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = minx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_max(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] maxx, nobs - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = maxx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_mean(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count 
- ndarray[float64_t, ndim=2] sumx, nobs - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(len(counts)): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count - - -def group_median(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): +#---------------------------------------------------------------------- +# median + +def group_median(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' @@ -2642,497 +2066,5 @@ cdef inline float64_t _median_linear(float64_t* a, int n): return result - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_var(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, ct - ndarray[float64_t, ndim=2] nobs, sumx, sumxx - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val - else: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - sumxx[lab, 0] += val * val - - - for i in range(len(counts)): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) -# add passing bin edges, instead of labels - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_add_bin(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b, nbins - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - N, K = ( values).shape - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_prod_bin(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - 
ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - N, K = ( values).shape - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - prodx[b, j] *= val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - prodx[b, 0] *= val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = prodx[i, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_min_bin(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] minx, nobs - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - if val < minx[b, j]: - minx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - if val < minx[b, 0]: - minx[b, 0] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = minx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_max_bin(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] maxx, nobs - - nobs = np.zeros_like(out) - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - if val > maxx[b, j]: - maxx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - if val > maxx[b, 0]: - maxx[b, 0] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = maxx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_ohlc(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - float64_t vopen, vhigh, vlow, vclose, NA - bint got_first = 0 - - if bins[len(bins) - 1] == 
len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - NA = np.nan - - b = 0 - if K > 1: - raise NotImplementedError - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose - b += 1 - got_first = 0 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - if not got_first: - got_first = 1 - vopen = val - vlow = val - vhigh = val - else: - if val < vlow: - vlow = val - if val > vhigh: - vhigh = val - vclose = val - - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose - - -# @cython.boundscheck(False) -# @cython.wraparound(False) -def group_mean_bin(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - - for i in range(ngroups): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_var_bin(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - - cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, ct - ndarray[float64_t, ndim=2] nobs, sumx, sumxx - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) - - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 - - N, K = ( values).shape - - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - sumxx[b, j] += val * val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - sumxx[b, 0] += val * val - - for i in range(ngroups): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) - include "join.pyx" include "generated.pyx" diff --git a/pandas/core/common.py b/pandas/core/common.py index b3d996ffd0606..c99fd87f7a643 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -256,6 +256,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan): _take1d_dict = { 'float64': algos.take_1d_float64, + 'float32': algos.take_1d_float32, + 'int8': algos.take_1d_int8, 
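+    # the float32/int8/int16 entries in this dict (and the 2-d dicts below)
+    # are new in this change: take operations now dispatch directly to
+    # dtype-specialized Cython kernels for these types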
+ 'int16': algos.take_1d_int16, 'int32': algos.take_1d_int32, 'int64': algos.take_1d_int64, 'object': algos.take_1d_object, @@ -266,6 +269,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan): _take2d_axis0_dict = { 'float64': algos.take_2d_axis0_float64, + 'float32': algos.take_2d_axis0_float32, + 'int8': algos.take_2d_axis0_int8, + 'int16': algos.take_2d_axis0_int16, 'int32': algos.take_2d_axis0_int32, 'int64': algos.take_2d_axis0_int64, 'object': algos.take_2d_axis0_object, @@ -276,6 +282,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan): _take2d_axis1_dict = { 'float64': algos.take_2d_axis1_float64, + 'float32': algos.take_2d_axis1_float32, + 'int8': algos.take_2d_axis1_int8, + 'int16': algos.take_2d_axis1_int16, 'int32': algos.take_2d_axis1_int32, 'int64': algos.take_2d_axis1_int64, 'object': algos.take_2d_axis1_object, @@ -286,6 +295,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan): _take2d_multi_dict = { 'float64': algos.take_2d_multi_float64, + 'float32': algos.take_2d_multi_float32, + 'int8': algos.take_2d_multi_int8, + 'int16': algos.take_2d_multi_int16, 'int32': algos.take_2d_multi_int32, 'int64': algos.take_2d_multi_int64, 'object': algos.take_2d_multi_object, @@ -294,6 +306,8 @@ def wrapper(arr, indexer, out, fill_value=np.nan): na_override=tslib.iNaT), } +_dtypes_no_na = set(['int8','int16','int32', 'int64', 'bool']) +_dtypes_na = set(['float32', 'float64', 'object', 'datetime64[ns]']) def _get_take2d_function(dtype_str, axis=0): if axis == 0: @@ -319,7 +333,7 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan): out_passed = out is not None take_f = _take1d_dict.get(dtype_str) - if dtype_str in ('int32', 'int64', 'bool'): + if dtype_str in _dtypes_no_na: try: if out is None: out = np.empty(n, dtype=arr.dtype) @@ -337,7 +351,7 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan): out.dtype) out = _maybe_upcast(out) np.putmask(out, mask, fill_value) - elif dtype_str in ('float64', 'object', 'datetime64[ns]'): + elif dtype_str in _dtypes_na: if out is None: out = np.empty(n, dtype=arr.dtype) take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) @@ -360,7 +374,7 @@ def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None): out_shape = len(row_idx), len(col_idx) - if dtype_str in ('int32', 'int64', 'bool'): + if dtype_str in _dtypes_no_na: row_mask = row_idx == -1 col_mask = col_idx == -1 needs_masking = row_mask.any() or col_mask.any() @@ -376,7 +390,7 @@ def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None): _ensure_int64(col_idx), out=out, fill_value=fill_value) return out - elif dtype_str in ('float64', 'object', 'datetime64[ns]'): + elif dtype_str in _dtypes_na: if out is None: out = np.empty(out_shape, dtype=arr.dtype) take_f = _get_take2d_function(dtype_str, axis='multi') @@ -405,7 +419,7 @@ def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0, if not isinstance(indexer, np.ndarray): indexer = np.array(indexer, dtype=np.int64) - if dtype_str in ('int32', 'int64', 'bool'): + if dtype_str in _dtypes_no_na: if mask is None: mask = indexer == -1 needs_masking = mask.any() @@ -423,7 +437,7 @@ def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0, take_f = _get_take2d_function(dtype_str, axis=axis) take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) return out - elif dtype_str in ('float64', 'object', 'datetime64[ns]'): + elif dtype_str in _dtypes_na: if out is None: out = np.empty(out_shape, dtype=arr.dtype) take_f = _get_take2d_function(dtype_str, 
axis=axis) @@ -457,8 +471,11 @@ def mask_out_axis(arr, mask, axis, fill_value=np.nan): _diff_special = { 'float64': algos.diff_2d_float64, + 'float32': algos.diff_2d_float32, 'int64': algos.diff_2d_int64, - 'int32': algos.diff_2d_int32 + 'int32': algos.diff_2d_int32, + 'int16': algos.diff_2d_int16, + 'int8': algos.diff_2d_int8, } @@ -548,14 +565,18 @@ def wrapper(arr, mask, limit=None): def pad_1d(values, limit=None, mask=None): + + dtype = values.dtype.name + _method = None if is_float_dtype(values): - _method = algos.pad_inplace_float64 + _method = getattr(algos,'pad_inplace_%s' % dtype,None) elif is_datetime64_dtype(values): _method = _pad_1d_datetime elif values.dtype == np.object_: _method = algos.pad_inplace_object - else: # pragma: no cover - raise ValueError('Invalid dtype for padding') + + if _method is None: + raise ValueError('Invalid dtype for pad_1d [%s]' % dtype) if mask is None: mask = isnull(values) @@ -564,14 +585,18 @@ def pad_1d(values, limit=None, mask=None): def backfill_1d(values, limit=None, mask=None): + + dtype = values.dtype.name + _method = None if is_float_dtype(values): - _method = algos.backfill_inplace_float64 + _method = getattr(algos,'backfill_inplace_%s' % dtype,None) elif is_datetime64_dtype(values): _method = _backfill_1d_datetime elif values.dtype == np.object_: _method = algos.backfill_inplace_object - else: # pragma: no cover - raise ValueError('Invalid dtype for padding') + + if _method is None: + raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype) if mask is None: mask = isnull(values) @@ -581,14 +606,18 @@ def backfill_1d(values, limit=None, mask=None): def pad_2d(values, limit=None, mask=None): + + dtype = values.dtype.name + _method = None if is_float_dtype(values): - _method = algos.pad_2d_inplace_float64 + _method = getattr(algos,'pad_2d_inplace_%s' % dtype,None) elif is_datetime64_dtype(values): _method = _pad_2d_datetime elif values.dtype == np.object_: _method = algos.pad_2d_inplace_object - else: # pragma: no cover - raise ValueError('Invalid dtype for padding') + + if _method is None: + raise ValueError('Invalid dtype for pad_2d [%s]' % dtype) if mask is None: mask = isnull(values) @@ -602,14 +631,18 @@ def pad_2d(values, limit=None, mask=None): def backfill_2d(values, limit=None, mask=None): + + dtype = values.dtype.name + _method = None if is_float_dtype(values): - _method = algos.backfill_2d_inplace_float64 + _method = getattr(algos,'backfill_2d_inplace_%s' % dtype,None) elif is_datetime64_dtype(values): _method = _backfill_2d_datetime elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object - else: # pragma: no cover - raise ValueError('Invalid dtype for padding') + + if _method is None: + raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype) if mask is None: mask = isnull(values) @@ -633,10 +666,43 @@ def _consensus_name_attr(objs): # Lots of little utilities -def _possibly_cast_to_datetime(value, dtype): +def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True): + """ if we have an object dtype, try to coerce dates and/or numers """ + + if values.dtype == np.object_ and convert_dates: + + # we take an aggressive stance and convert to datetime64[ns] + if convert_dates == 'coerce': + new_values = _possibly_cast_to_datetime(values, 'M8[ns]', coerce = True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects(values, convert_datetime=convert_dates) + + if values.dtype == np.object_ 
and convert_numeric: + try: + new_values = lib.maybe_convert_numeric(values,set(),coerce_numeric=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + except: + pass + + return values + + +def _possibly_cast_to_datetime(value, dtype, coerce = False): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ - if dtype == 'M8[ns]': + if isinstance(dtype, basestring): + dtype = np.dtype(dtype) + + if dtype is not None and is_datetime64_dtype(dtype): if np.isscalar(value): if value == tslib.iNaT or isnull(value): value = tslib.iNaT @@ -650,7 +716,7 @@ def _possibly_cast_to_datetime(value, dtype): # we have an array of datetime & nulls elif np.prod(value.shape): try: - value = tslib.array_to_datetime(value) + value = tslib.array_to_datetime(value, coerce = coerce) except: pass @@ -1001,6 +1067,8 @@ def _is_int_or_datetime_dtype(arr_or_dtype): def is_datetime64_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): tipo = arr_or_dtype.type + elif isinstance(arr_or_dtype, type): + tipo = np.dtype(arr_or_dtype).type else: tipo = arr_or_dtype.dtype.type return issubclass(tipo, np.datetime64) @@ -1026,13 +1094,17 @@ def _is_sequence(x): return False _ensure_float64 = algos.ensure_float64 +_ensure_float32 = algos.ensure_float32 _ensure_int64 = algos.ensure_int64 _ensure_int32 = algos.ensure_int32 +_ensure_int16 = algos.ensure_int16 +_ensure_int8 = algos.ensure_int8 _ensure_platform_int = algos.ensure_platform_int _ensure_object = algos.ensure_object -def _astype_nansafe(arr, dtype): +def _astype_nansafe(arr, dtype, copy = True): + """ return a view if copy is False """ if not isinstance(dtype, np.dtype): dtype = np.dtype(dtype) @@ -1048,7 +1120,9 @@ def _astype_nansafe(arr, dtype): # work around NumPy brokenness, #1987 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) - return arr.astype(dtype) + if copy: + return arr.astype(dtype) + return arr.view(dtype) def _clean_fill_method(method): diff --git a/pandas/core/format.py b/pandas/core/format.py index 7fc9fbccced04..88b729349ca60 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -66,10 +66,11 @@ class SeriesFormatter(object): def __init__(self, series, buf=None, header=True, length=True, - na_rep='NaN', name=False, float_format=None): + na_rep='NaN', name=False, float_format=None, dtype=True): self.series = series self.buf = buf if buf is not None else StringIO(u"") self.name = name + self.dtype = dtype self.na_rep = na_rep self.length = length self.header = header @@ -98,6 +99,12 @@ def _get_footer(self): footer += ', ' footer += 'Length: %d' % len(self.series) + if self.dtype: + if getattr(self.series.dtype,'name',None): + if footer: + footer += ', ' + footer += 'Dtype: %s' % com.pprint_thing(self.series.dtype.name) + return unicode(footer) def _get_formatted_index(self): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 508d54c496dc8..6c96317a645f7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -312,7 +312,10 @@ def f(self, other): elif isinstance(other, Series): return self._combine_series_infer(other, func) else: - return self._combine_const(other, func) + + # straight boolean comparisions we want to allow all columns + # (regardless of dtype to pass thru) + return self._combine_const(other, func, raise_on_error = False).fillna(True).astype(bool) f.__name__ = name @@ -327,6 +330,7 @@ class DataFrame(NDFrame): _auto_consolidate = True _verbose_info = True _het_axis = 1 + _info_axis = 
'columns'
     _col_klass = Series
 
     _AXIS_NUMBERS = {
@@ -1004,6 +1008,12 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
                     arr_columns.append(k)
                     arrays.append(v)
 
+                # reorder according to the columns
+                if len(columns) and len(arr_columns):
+                    indexer = _ensure_index(arr_columns).get_indexer(columns)
+                    arr_columns = _ensure_index([ arr_columns[i] for i in indexer ])
+                    arrays = [ arrays[i] for i in indexer ]
+
         elif isinstance(data, (np.ndarray, DataFrame)):
             arrays, columns = _to_arrays(data, columns)
             if columns is not None:
@@ -1649,38 +1659,25 @@ def info(self, verbose=True, buf=None, max_cols=None):
     def dtypes(self):
         return self.apply(lambda x: x.dtype)
 
-    def convert_objects(self, convert_dates=True):
+    def convert_objects(self, convert_dates=True, convert_numeric=True):
         """
         Attempt to infer better dtype for object columns
+        Always returns a copy (even if there are no object columns)
+
+        Parameters
+        ----------
+        convert_dates : if True, attempt to soft-convert dates; if 'coerce',
+            force conversion (and non-convertibles get NaT)
+        convert_numeric : if True, attempt to coerce to numbers (including
+            strings); non-convertibles get NaN
 
         Returns
         -------
         converted : DataFrame
         """
-        new_data = {}
-        convert_f = lambda x: lib.maybe_convert_objects(
-            x, convert_datetime=convert_dates)
-
-        # TODO: could be more efficient taking advantage of the block
-        for col, s in self.iteritems():
-            if s.dtype == np.object_:
-                new_data[col] = convert_f(s)
-            else:
-                new_data[col] = s
-
-        return DataFrame(new_data, index=self.index, columns=self.columns)
+        return self._constructor(self._data.convert(convert_dates=convert_dates, convert_numeric=convert_numeric))
 
     def get_dtype_counts(self):
-        counts = {}
-        for i in range(len(self.columns)):
-            series = self.icol(i)
-            # endianness can cause dtypes to look different
-            dtype_str = str(series.dtype)
-            if dtype_str in counts:
-                counts[dtype_str] += 1
-            else:
-                counts[dtype_str] = 1
-        return Series(counts)
+        """ return the counts of dtypes in this frame """
+        return Series(dict([ (dtype, len(df.columns)) for dtype, df in self.blocks.iteritems() ]))
 
     #----------------------------------------------------------------------
     # properties for index and columns
@@ -1694,6 +1691,14 @@ def as_matrix(self, columns=None):
         are presented in sorted order unless a specific list of columns is
         provided.
 
+        NOTE: the dtype will be a lower-common-denominator dtype (implicit upcasting);
+        that is to say if the dtypes (even of numeric types) are mixed, the one that
+        accommodates all of them will be chosen; use this with care if you are not
+        dealing with the blocks
+
+        e.g. if the dtypes are float16,float32         -> float32
+                                float16,float32,float64 -> float64
+                                int32,uint8             -> int32
+
         Parameters
         ----------
         columns : array-like
@@ -1710,6 +1715,33 @@ def as_matrix(self, columns=None):
 
     values = property(fget=as_matrix)
 
+    def as_blocks(self, columns=None):
+        """
+        Convert the frame to a dict of dtype -> DataFrames, each of which has a
+        homogeneous dtype. Columns are presented in sorted order unless a
+        specific list of columns is provided.
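+        For example, a frame holding float64 and int64 data comes back (roughly) as
+        ``{'float64': <frame of the float64 columns>, 'int64': <frame of the int64 columns>}``.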
+ + NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in as_matrix) + + Parameters + ---------- + columns : array-like + Specific column order + + Returns + ------- + values : a list of DataFrames + """ + self._consolidate_inplace() + + bd = dict() + for b in self._data.blocks: + b = b.reindex_items_from(columns or b.items) + bd[str(b.dtype)] = DataFrame(BlockManager([ b ], [ b.items, self.index ])) + return bd + + blocks = property(fget=as_blocks) + def transpose(self): """ Returns a DataFrame with the rows/columns switched. If the DataFrame is @@ -1963,7 +1995,7 @@ def __getitem__(self, key): return self._getitem_multilevel(key) elif isinstance(key, DataFrame): if key.values.dtype == bool: - return self.where(key) + return self.where(key, try_cast = False) else: raise ValueError('Cannot index using non-boolean DataFrame') else: @@ -3334,17 +3366,12 @@ def fillna(self, value=None, method=None, axis=0, inplace=False, raise NotImplementedError() return self.T.fillna(method=method, limit=limit).T - new_blocks = [] method = com._clean_fill_method(method) - for block in self._data.blocks: - if block._can_hold_na: - newb = block.interpolate(method, axis=axis, - limit=limit, inplace=inplace) - else: - newb = block if inplace else block.copy() - new_blocks.append(newb) - - new_data = BlockManager(new_blocks, self._data.axes) + new_data = self._data.interpolate(method = method, + axis = axis, + limit = limit, + inplace = inplace, + coerce = True) else: if method is not None: raise ValueError('cannot specify both a fill method and value') @@ -3447,8 +3474,8 @@ def replace(self, to_replace, value=None, method='pad', axis=0, 'in length. Expecting %d got %d ' % (len(to_replace), len(value))) - new_data = self._data if inplace else self.copy()._data - new_data._replace_list(to_replace, value) + new_data = self._data.replace_list(to_replace, value, + inplace=inplace) else: # [NA, ''] -> 0 new_data = self._data.replace(to_replace, value, @@ -3493,13 +3520,13 @@ def _interpolate(self, to_replace, method, axis, inplace, limit): return rs if not inplace else None else: - new_blocks = [] - for block in self._data.blocks: - newb = block.interpolate(method, axis=axis, - limit=limit, inplace=inplace, - missing=to_replace) - new_blocks.append(newb) - new_data = BlockManager(new_blocks, self._data.axes) + + new_data = self._data.interpolate(method = method, + axis = axis, + limit = limit, + inplace = inplace, + missing = to_replace, + coerce = False) if inplace: self._data = new_data @@ -3672,22 +3699,15 @@ def _combine_match_columns(self, other, func, fill_value=None): if fill_value is not None: raise NotImplementedError - return self._constructor(func(left.values, right.values), - index=self.index, - columns=left.columns, copy=False) + new_data = left._data.where(func, right, axes = [left.columns, self.index]) + return self._constructor(new_data) - def _combine_const(self, other, func): + def _combine_const(self, other, func, raise_on_error = True): if self.empty: return self - result_values = func(self.values, other) - - if not isinstance(result_values, np.ndarray): - raise TypeError('Could not compare %s with DataFrame values' - % repr(other)) - - return self._constructor(result_values, index=self.index, - columns=self.columns, copy=False) + new_data = self._data.where(func, other, raise_on_error=raise_on_error) + return self._constructor(new_data) def _compare_frame(self, other, func): if not self._indexed_same(other): @@ -4016,8 +4036,7 @@ def diff(self, periods=1): ------- diffed : DataFrame 
""" - new_blocks = [b.diff(periods) for b in self._data.blocks] - new_data = BlockManager(new_blocks, [self.columns, self.index]) + new_data = self._data.diff(periods) return self._constructor(new_data) def shift(self, periods=1, freq=None, **kwds): @@ -4051,21 +4070,9 @@ def shift(self, periods=1, freq=None, **kwds): if isinstance(offset, basestring): offset = datetools.to_offset(offset) - def _shift_block(blk, indexer): - new_values = blk.values.take(indexer, axis=1) - # convert integer to float if necessary. need to do a lot more than - # that, handle boolean etc also - new_values = com._maybe_upcast(new_values) - if periods > 0: - new_values[:, :periods] = NA - else: - new_values[:, periods:] = NA - return make_block(new_values, blk.items, blk.ref_items) - if offset is None: indexer = com._shift_indexer(len(self), periods) - new_blocks = [_shift_block(b, indexer) for b in self._data.blocks] - new_data = BlockManager(new_blocks, [self.columns, self.index]) + new_data = self._data.shift(indexer, periods) elif isinstance(self.index, PeriodIndex): orig_offset = datetools.to_offset(self.index.freq) if offset == orig_offset: @@ -5215,7 +5222,7 @@ def combineMult(self, other): """ return self.mul(other, fill_value=1.) - def where(self, cond, other=NA, inplace=False): + def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=True): """ Return a DataFrame with the same shape as self and whose corresponding entries are from self where cond is True and otherwise are from other. @@ -5224,6 +5231,10 @@ def where(self, cond, other=NA, inplace=False): ---------- cond: boolean DataFrame or array other: scalar or DataFrame + inplace: perform the operation in place on the data + try_cast: try to cast the result back to the input type (if possible), defaults to False + raise_on_error: should I raise on invalid data types (e.g. 
trying to where on strings), + defaults to True Returns ------- @@ -5235,7 +5246,7 @@ def where(self, cond, other=NA, inplace=False): if isinstance(cond, np.ndarray): if cond.shape != self.shape: - raise ValueError('Array onditional must be same shape as self') + raise ValueError('Array conditional must be same shape as self') cond = self._constructor(cond, index=self.index, columns=self.columns) @@ -5251,12 +5262,23 @@ def where(self, cond, other=NA, inplace=False): if isinstance(other, DataFrame): _, other = self.align(other, join='left', fill_value=NA) + elif isinstance(other,np.ndarray): + + if other.shape[0] != len(self.index) or other.shape[1] != len(self.columns): + raise ValueError('other must be the same shape as self when an ndarray') + other = DataFrame(other,self.index,self.columns) if inplace: - np.putmask(self.values, cond, other) + + # we may have different type blocks come out of putmask, so reconstruct the block manager + self._data = self._data.putmask(cond,other,inplace=True) + else: - rs = np.where(cond, self, other) - return self._constructor(rs, self.index, self.columns) + + func = lambda values, others, conds: np.where(conds, values, others) + new_data = self._data.where(func, other, cond, raise_on_error=raise_on_error, try_cast=try_cast) + + return self._constructor(new_data) def mask(self, cond): """ @@ -5613,7 +5635,6 @@ def _homogenize(data, index, dtype=None): return homogenized - def _from_nested_dict(data): # TODO: this should be seriously cythonized new_data = OrderedDict() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 285e53e4c396c..558b8aa6137d1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -486,19 +486,23 @@ def __init__(self, data, axes=None, copy=False, dtype=None): object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) - def astype(self, dtype): + def astype(self, dtype, copy = True, raise_on_error = True): """ Cast object to input numpy.dtype + Return a copy when copy = True (be really careful with this!) Parameters ---------- dtype : numpy.dtype or Python type + raise_on_error : raise on invalid input Returns ------- casted : type of caller """ - return self._constructor(self._data, dtype=dtype) + + mgr = self._data.astype(dtype, copy = copy, raise_on_error = raise_on_error) + return self._constructor(mgr) @property def _constructor(self): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index afce08f8d48dc..e89175ef72f43 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -425,6 +425,36 @@ def picker(arr): return np.nan return self.agg(picker) + def _try_cast(self, result, obj): + """ try to cast the result to our obj original type, + we may have roundtripped thru object in the mean-time """ + try: + if obj.ndim > 1: + dtype = obj.values.dtype + else: + dtype = obj.dtype + + if _is_numeric_dtype(dtype): + + # need to respect a non-number here (e.g. 
Decimal) + if len(result) and issubclass(type(result[0]),(np.number,float,int)): + if issubclass(dtype.type, (np.integer, np.bool_)): + + # castable back to an int/bool as we don't have nans + if com.notnull(result).all(): + result = result.astype(dtype) + else: + + result = result.astype(dtype) + + elif issubclass(dtype.type, np.datetime64): + if is_datetime64_dtype(obj.dtype): + result = result.astype(obj.dtype) + except: + pass + + return result + def _cython_agg_general(self, how, numeric_only=True): output = {} for name, obj in self._iterate_slices(): @@ -449,7 +479,7 @@ def _python_agg_general(self, func, *args, **kwargs): for name, obj in self._iterate_slices(): try: result, counts = self.grouper.agg_series(obj, f) - output[name] = result + output[name] = self._try_cast(result, obj) except TypeError: continue @@ -457,9 +487,16 @@ def _python_agg_general(self, func, *args, **kwargs): return self._python_apply_general(f) if self.grouper._filter_empty_groups: + mask = counts.ravel() > 0 for name, result in output.iteritems(): - output[name] = result[mask] + + # since we are masking, make sure that we have a float object + values = result + if _is_numeric_dtype(values.dtype): + values = com.ensure_float(values) + + output[name] = self._try_cast(values[mask],result) return self._wrap_aggregated_output(output) @@ -708,21 +745,16 @@ def get_group_levels(self): # Aggregation functions _cython_functions = { - 'add': _algos.group_add, - 'prod': _algos.group_prod, - 'min': _algos.group_min, - 'max': _algos.group_max, - 'mean': _algos.group_mean, - 'median': _algos.group_median, - 'var': _algos.group_var, - 'std': _algos.group_var, - 'first': lambda a, b, c, d: _algos.group_nth(a, b, c, d, 1), - 'last': _algos.group_last - } - - _cython_object_functions = { - 'first': lambda a, b, c, d: _algos.group_nth_object(a, b, c, d, 1), - 'last': _algos.group_last_object + 'add' : 'group_add', + 'prod' : 'group_prod', + 'min' : 'group_min', + 'max' : 'group_max', + 'mean' : 'group_mean', + 'median': dict(name = 'group_median'), + 'var' : 'group_var', + 'std' : 'group_var', + 'first': dict(name = 'group_nth', f = lambda func, a, b, c, d: func(a, b, c, d, 1)), + 'last' : 'group_last', } _cython_transforms = { @@ -737,6 +769,40 @@ def get_group_levels(self): _filter_empty_groups = True + def _get_aggregate_function(self, how, values): + + dtype_str = values.dtype.name + def get_func(fname): + + # find the function, or use the object function, or return a generic + for dt in [dtype_str,'object']: + f = getattr(_algos,"%s_%s" % (fname,dtype_str),None) + if f is not None: + return f + return getattr(_algos,fname,None) + + ftype = self._cython_functions[how] + + if isinstance(ftype,dict): + func = afunc = get_func(ftype['name']) + + # a sub-function + f = ftype.get('f') + if f is not None: + + def wrapper(*args, **kwargs): + return f(afunc, *args, **kwargs) + + # need to curry our sub-function + func = wrapper + + else: + func = get_func(ftype) + + if func is None: + raise NotImplementedError("function is not implemented for this dtype: [how->%s,dtype->%s]" % (how,dtype_str)) + return func, dtype_str + def aggregate(self, values, how, axis=0): arity = self._cython_arity.get(how, 1) @@ -796,12 +862,8 @@ def aggregate(self, values, how, axis=0): return result, names def _aggregate(self, result, counts, values, how, is_numeric): - if not is_numeric: - agg_func = self._cython_object_functions[how] - else: - agg_func = self._cython_functions[how] - - trans_func = self._cython_transforms.get(how, lambda x: x) + 
agg_func,dtype = self._get_aggregate_function(how, values) + trans_func = self._cython_transforms.get(how, lambda x: x) comp_ids, _, ngroups = self.group_info if values.ndim > 3: @@ -809,8 +871,9 @@ def _aggregate(self, result, counts, values, how, is_numeric): raise NotImplementedError elif values.ndim > 2: for i, chunk in enumerate(values.transpose(2, 0, 1)): - agg_func(result[:, :, i], counts, chunk.squeeze(), - comp_ids) + + chunk = chunk.squeeze() + agg_func(result[:, :, i], counts, chunk, comp_ids) else: agg_func(result, counts, values, comp_ids) @@ -1000,21 +1063,16 @@ def names(self): # cython aggregation _cython_functions = { - 'add': _algos.group_add_bin, - 'prod': _algos.group_prod_bin, - 'mean': _algos.group_mean_bin, - 'min': _algos.group_min_bin, - 'max': _algos.group_max_bin, - 'var': _algos.group_var_bin, - 'std': _algos.group_var_bin, - 'ohlc': _algos.group_ohlc, - 'first': lambda a, b, c, d: _algos.group_nth_bin(a, b, c, d, 1), - 'last': _algos.group_last_bin - } - - _cython_object_functions = { - 'first': lambda a, b, c, d: _algos.group_nth_bin_object(a, b, c, d, 1), - 'last': _algos.group_last_bin_object + 'add' : 'group_add_bin', + 'prod' : 'group_prod_bin', + 'mean' : 'group_mean_bin', + 'min' : 'group_min_bin', + 'max' : 'group_max_bin', + 'var' : 'group_var_bin', + 'std' : 'group_var_bin', + 'ohlc' : 'group_ohlc', + 'first': dict(name = 'group_nth_bin', f = lambda func, a, b, c, d: func(a, b, c, d, 1)), + 'last' : 'group_last_bin', } _name_functions = { @@ -1024,11 +1082,9 @@ def names(self): _filter_empty_groups = True def _aggregate(self, result, counts, values, how, is_numeric=True): - fdict = self._cython_functions - if not is_numeric: - fdict = self._cython_object_functions - agg_func = fdict[how] - trans_func = self._cython_transforms.get(how, lambda x: x) + + agg_func,dtype = self._get_aggregate_function(how, values) + trans_func = self._cython_transforms.get(how, lambda x: x) if values.ndim > 3: # punting for now @@ -1439,7 +1495,7 @@ def _aggregate_named(self, func, *args, **kwargs): output = func(group, *args, **kwargs) if isinstance(output, np.ndarray): raise Exception('Must produce aggregated value') - result[name] = output + result[name] = self._try_cast(output, group) return result @@ -1676,14 +1732,14 @@ def _aggregate_generic(self, func, *args, **kwargs): for name, data in self: # for name in self.indices: # data = self.get_group(name, obj=obj) - result[name] = func(data, *args, **kwargs) + result[name] = self._try_cast(func(data, *args, **kwargs),data) except Exception: return self._aggregate_item_by_item(func, *args, **kwargs) else: for name in self.indices: try: data = self.get_group(name, obj=obj) - result[name] = func(data, *args, **kwargs) + result[name] = self._try_cast(func(data, *args, **kwargs), data) except Exception: wrapper = lambda x: func(x, *args, **kwargs) result[name] = data.apply(wrapper, axis=axis) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e3031b58ff286..58d193a956491 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -20,6 +20,10 @@ class Block(object): Index-ignorant; let the container take care of that """ __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim'] + is_numeric = False + is_bool = False + is_object = False + _can_hold_na = False def __init__(self, values, items, ref_items, ndim=2): if issubclass(values.dtype.type, basestring): @@ -93,6 +97,10 @@ def __setstate__(self, state): def shape(self): return self.values.shape + @property + def itemsize(self): + 
return self.values.itemsize
+
     @property
     def dtype(self):
         return self.values.dtype
@@ -206,8 +214,13 @@ def split_block_at(self, item):
                      self.ref_items)
 
     def fillna(self, value, inplace=False):
-        new_values = self.values if inplace else self.values.copy()
+        if not self._can_hold_na:
+            if inplace:
+                return self
+            else:
+                return self.copy()
 
+        new_values = self.values if inplace else self.values.copy()
         mask = com.isnull(new_values)
         np.putmask(new_values, mask, value)
 
@@ -216,12 +229,43 @@ def fillna(self, value, inplace=False):
         else:
             return make_block(new_values, self.items, self.ref_items)
 
+    def astype(self, dtype, copy = True, raise_on_error = True):
+        """ coerce to the new type (if copy=True, return a new copy);
+            raise on an exception if raise_on_error == True """
+        try:
+            newb = make_block(com._astype_nansafe(self.values, dtype, copy = copy),
+                              self.items, self.ref_items)
+        except:
+            if raise_on_error is True:
+                raise
+            newb = self.copy() if copy else self
+
+        if newb.is_numeric and self.is_numeric:
+            if newb.shape != self.shape or (not copy and newb.itemsize < self.itemsize):
+                raise TypeError("cannot set astype for copy = [%s] for dtype (%s [%s]) with smaller itemsize than current (%s [%s])" % (copy,
+                                self.dtype.name,
+                                self.itemsize,
+                                newb.dtype.name,
+                                newb.itemsize))
+        return newb
+
+    def convert(self, copy = True, **kwargs):
+        """ attempt to coerce any object types to better types
+            return a copy of the block (if copy = True)
+            by definition we are not an ObjectBlock here! """
+
+        return self.copy() if copy else self
+
     def _can_hold_element(self, value):
         raise NotImplementedError()
 
     def _try_cast(self, value):
         raise NotImplementedError()
 
+    def _try_cast_result(self, result):
+        """ try to cast the result to our original type,
+            we may have roundtripped through object in the meantime """
+        return result
+
     def replace(self, to_replace, value, inplace=False):
         new_values = self.values if inplace else self.values.copy()
         if self._can_hold_element(value):
@@ -251,17 +295,58 @@ def replace(self, to_replace, value, inplace=False):
         return make_block(new_values, self.items, self.ref_items)
 
     def putmask(self, mask, new, inplace=False):
+        """ putmask the data to the block; it is possible that we may create a new dtype of block
+            return the resulting block(s) """
+
         new_values = self.values if inplace else self.values.copy()
+
+        # may need to align the new
+        if hasattr(new,'reindex_axis'):
+            axis = getattr(new,'_het_axis',0)
+            new = new.reindex_axis(self.items, axis=axis, copy=False).values.T
+
+        # may need to align the mask
+        if hasattr(mask,'reindex_axis'):
+            axis = getattr(mask,'_het_axis',0)
+            mask = mask.reindex_axis(self.items, axis=axis, copy=False).values.T
+
         if self._can_hold_element(new):
             new = self._try_cast(new)
             np.putmask(new_values, mask, new)
-        if inplace:
-            return self
+
+        # upcast me
         else:
-            return make_block(new_values, self.items, self.ref_items)
+
+            # type of the new block
+            if (isinstance(new, np.ndarray) and issubclass(new.dtype.type, np.number)) or issubclass(type(new), float):
+                typ = float
+            else:
+                typ = object
+
+            # we need to explicitly astype here to make a copy
+            new_values = new_values.astype(typ)
+
+            # we create a new block type
+            np.putmask(new_values, mask, new)
+            return [ make_block(new_values, self.items, self.ref_items) ]
+
+        if inplace:
+            return [ self ]
+
+        return [ make_block(new_values, self.items, self.ref_items) ]
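+
+    # putmask (above) and where (further below) may hand back block(s) of a
+    # new dtype rather than mutating in place; roughly (illustrative frame,
+    # the dtypes are the point here):
+    #
+    #   >>> df = DataFrame({'a': [1, 2, 3]})       # a single int64 block
+    #   >>> df.where(df > 1, other=2.5).dtypes     # int64 cannot hold 2.5
+    #   a    float64
+    #   dtype: object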
 
     def interpolate(self, method='pad', axis=0, inplace=False,
-                    limit=None, missing=None):
+                    limit=None, missing=None, coerce=False):
+
+        # if we are coercing, then don't force the conversion
+        # if the block can't hold the type
+        if coerce:
+            if not self._can_hold_na:
+                if inplace:
+                    return self
+                else:
+                    return self.copy()
+
         values = self.values if inplace else self.values.copy()
 
         if values.ndim != 2:
@@ -293,9 +378,96 @@ def get_values(self, dtype):
         return self.values
 
     def diff(self, n):
+        """ return block for the diff of the values """
        new_values = com.diff(self.values, n, axis=1)
        return make_block(new_values, self.items, self.ref_items)
 
+    def shift(self, indexer, periods):
+        """ shift the block by periods, possibly upcast """
+
+        new_values = self.values.take(indexer, axis=1)
+        # convert integer to float if necessary. need to do a lot more than
+        # that, handle boolean etc also
+        new_values = com._maybe_upcast(new_values)
+        if periods > 0:
+            new_values[:, :periods] = np.nan
+        else:
+            new_values[:, periods:] = np.nan
+        return make_block(new_values, self.items, self.ref_items)
+
+    def where(self, func, other, cond = None, raise_on_error = True, try_cast = False):
+        """
+        evaluate the block; return result block(s) from the result
+
+        Parameters
+        ----------
+        func : how to combine self, other
+        other : a ndarray/object
+        cond : the condition to respect, optional
+        raise_on_error : if True (the default), raise when the function cannot be applied to
+            the block values; if False, just return the data that we had coming in
+
+        Returns
+        -------
+        a new block, the result of the func
+        """
+
+        values = self.values
+
+        # see if we can align other
+        if hasattr(other,'reindex_axis'):
+            axis = getattr(other,'_het_axis',0)
+            other = other.reindex_axis(self.items, axis=axis, copy=True).values
+
+        # make sure that we can broadcast
+        is_transposed = False
+        if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
+            if values.ndim != other.ndim or values.shape == other.shape[::-1]:
+                values = values.T
+                is_transposed = True
+
+        # see if we can align cond
+        if cond is not None:
+            if not hasattr(cond,'shape'):
+                raise ValueError("where must have a condition that is ndarray like")
+            if hasattr(cond,'reindex_axis'):
+                axis = getattr(cond,'_het_axis',0)
+                cond = cond.reindex_axis(self.items, axis=axis, copy=True).values
+            else:
+                cond = cond.values
+
+            # may need to undo transpose of values
+            if hasattr(values, 'ndim'):
+                if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
+                    values = values.T
+                    is_transposed = not is_transposed
+
+        args = [ values, other ]
+        if cond is not None:
+            args.append(cond)
+        try:
+            result = func(*args)
+        except:
+            if raise_on_error:
+                raise TypeError('Could not operate %s with block values'
+                                % repr(other))
+            else:
+                # return the values
+                result = np.empty(values.shape,dtype='O')
+                result.fill(np.nan)
+
+        if not isinstance(result, np.ndarray):
+            raise TypeError('Could not compare %s with block values'
+                            % repr(other))
+
+        if is_transposed:
+            result = result.T
+
+        # try to cast if requested
+        if try_cast:
+            result = self._try_cast_result(result)
+
+        return [ make_block(result, self.items, self.ref_items) ]
 
 def _mask_missing(array, missing_values):
     if not isinstance(missing_values, (list, np.ndarray)):
@@ -314,11 +486,15 @@ def _mask_missing(array, missing_values):
             mask |= array == missing_values
     return mask
 
-
-class FloatBlock(Block):
+class NumericBlock(Block):
+    is_numeric = True
    _can_hold_na = True
 
+class FloatBlock(NumericBlock):
+
     def _can_hold_element(self, element):
+        if isinstance(element, np.ndarray):
+            return issubclass(element.dtype.type, (np.floating,np.integer))
         return isinstance(element, (float, int))
 
     def _try_cast(self, element):
@@ -330,11 +506,10 @@ def 
_try_cast(self, element): def should_store(self, value): # when inserting a column should not coerce integers to floats # unnecessarily - return issubclass(value.dtype.type, np.floating) + return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype -class ComplexBlock(Block): - _can_hold_na = True +class ComplexBlock(NumericBlock): def _can_hold_element(self, element): return isinstance(element, complex) @@ -349,10 +524,12 @@ def should_store(self, value): return issubclass(value.dtype.type, np.complexfloating) -class IntBlock(Block): +class IntBlock(NumericBlock): _can_hold_na = False def _can_hold_element(self, element): + if isinstance(element, np.ndarray): + return issubclass(element.dtype.type, np.integer) return com.is_integer(element) def _try_cast(self, element): @@ -361,11 +538,25 @@ def _try_cast(self, element): except: # pragma: no cover return element + def _try_cast_result(self, result): + # this is quite restrictive to convert + try: + if isinstance(result, np.ndarray) and issubclass(result.dtype.type, np.floating): + if com.notnull(result).all(): + new_result = result.astype(self.dtype) + if (new_result == result).all(): + return new_result + except: + pass + + return result + def should_store(self, value): - return com.is_integer_dtype(value) + return com.is_integer_dtype(value) and value.dtype == self.dtype class BoolBlock(Block): + is_bool = True _can_hold_na = False def _can_hold_element(self, element): @@ -382,8 +573,35 @@ def should_store(self, value): class ObjectBlock(Block): + is_object = True _can_hold_na = True + @property + def is_bool(self): + """ we can be a bool if we have only bool values but are of type object """ + return lib.is_bool_array(self.values.flatten()) + + def convert(self, convert_dates = True, convert_numeric = True, copy = True): + """ attempt to coerce any object types to better types + return a copy of the block (if copy = True) + by definition we ARE an ObjectBlock!!!!! + + can return multiple blocks! 
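+
+        e.g. (an illustrative sketch) a column of numeric strings comes back
+        numeric, with non-convertibles coerced to nan:
+
+        >>> s = Series(['1', '2.5', 'foo'], dtype=object)
+        >>> s.convert_objects(convert_numeric=True).dtype
+        dtype('float64')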
+ """ + + # attempt to create new type blocks + blocks = [] + for i, c in enumerate(self.items): + values = self.get(c) + + values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric) + values = values.reshape(((1,) + values.shape)) + items = self.items.take([i]) + newb = make_block(values, items, self.ref_items) + blocks.append(newb) + + return blocks + def _can_hold_element(self, element): return True @@ -457,8 +675,6 @@ def make_block(values, items, ref_items): elif issubclass(vtype, np.datetime64): klass = DatetimeBlock elif issubclass(vtype, np.integer): - if vtype != np.int64: - values = values.astype('i8') klass = IntBlock elif dtype == np.bool_: klass = BoolBlock @@ -611,15 +827,70 @@ def _verify_integrity(self): raise AssertionError('Number of manager items must equal union of ' 'block items') - def astype(self, dtype): - new_blocks = [] - for block in self.blocks: - newb = make_block(com._astype_nansafe(block.values, dtype), - block.items, block.ref_items) - new_blocks.append(newb) + def apply(self, f, *args, **kwargs): + """ iterate over the blocks, collect and create a new block manager """ + axes = kwargs.pop('axes',None) + result_blocks = [] + for blk in self.blocks: + if callable(f): + applied = f(blk, *args, **kwargs) + else: + applied = getattr(blk,f)(*args, **kwargs) + + if isinstance(applied,list): + result_blocks.extend(applied) + else: + result_blocks.append(applied) + bm = self.__class__(result_blocks, axes or self.axes) + bm._consolidate_inplace() + return bm + + def where(self, *args, **kwargs): + return self.apply('where', *args, **kwargs) + + def putmask(self, *args, **kwargs): + return self.apply('putmask', *args, **kwargs) + + def diff(self, *args, **kwargs): + return self.apply('diff', *args, **kwargs) + + def interpolate(self, *args, **kwargs): + return self.apply('interpolate', *args, **kwargs) + + def shift(self, *args, **kwargs): + return self.apply('shift', *args, **kwargs) + + def fillna(self, *args, **kwargs): + return self.apply('fillna', *args, **kwargs) + + def astype(self, *args, **kwargs): + return self.apply('astype', *args, **kwargs) + + def convert(self, *args, **kwargs): + return self.apply('convert', *args, **kwargs) + + def replace(self, *args, **kwargs): + return self.apply('replace', *args, **kwargs) - new_mgr = BlockManager(new_blocks, self.axes) - return new_mgr.consolidate() + def replace_list(self, src_lst, dest_lst, inplace=False): + """ do a list replace """ + if not inplace: + self = self.copy() + + sset = set(src_lst) + if any([k in sset for k in dest_lst]): + masks = {} + for s in src_lst: + masks[s] = [b.values == s for b in self.blocks] + + for s, d in zip(src_lst, dest_lst): + [b.putmask(masks[s][i], d, inplace=True) for i, b in + enumerate(self.blocks)] + else: + for s, d in zip(src_lst, dest_lst): + self.replace(s, d, inplace=True) + + return self def is_consolidated(self): """ @@ -634,7 +905,7 @@ def _consolidate_check(self): self._is_consolidated = len(dtypes) == len(set(dtypes)) self._known_consolidated = True - def get_numeric_data(self, copy=False, type_list=None): + def get_numeric_data(self, copy=False, type_list=None, as_blocks = False): """ Parameters ---------- @@ -644,15 +915,15 @@ def get_numeric_data(self, copy=False, type_list=None): Numeric types by default (Float/Complex/Int but not Datetime) """ if type_list is None: - def filter_blocks(block): - return (isinstance(block, (IntBlock, FloatBlock, ComplexBlock)) - and not isinstance(block, DatetimeBlock)) + 
filter_blocks = lambda block: block.is_numeric else: type_list = self._get_clean_block_types(type_list) filter_blocks = lambda block: isinstance(block, type_list) maybe_copy = lambda b: b.copy() if copy else b num_blocks = [maybe_copy(b) for b in self.blocks if filter_blocks(b)] + if as_blocks: + return num_blocks if len(num_blocks) == 0: return BlockManager.make_empty() @@ -686,8 +957,8 @@ def _get_clean_block_types(self, type_list): type_list = tuple([type_map.get(t, t) for t in type_list]) return type_list - def get_bool_data(self, copy=False): - return self.get_numeric_data(copy=copy, type_list=(BoolBlock,)) + def get_bool_data(self, copy=False, as_blocks=False): + return self.get_numeric_data(copy=copy, type_list=(BoolBlock,), as_blocks=as_blocks) def get_slice(self, slobj, axis=0): new_axes = list(self.axes) @@ -1255,37 +1526,6 @@ def add_suffix(self, suffix): f = ('%s' + ('%s' % suffix)).__mod__ return self.rename_items(f) - def fillna(self, value, inplace=False): - new_blocks = [b.fillna(value, inplace=inplace) - if b._can_hold_na else b - for b in self.blocks] - if inplace: - return self - return BlockManager(new_blocks, self.axes) - - def replace(self, to_replace, value, inplace=False): - new_blocks = [b.replace(to_replace, value, inplace=inplace) - for b in self.blocks] - if inplace: - return self - return BlockManager(new_blocks, self.axes) - - def _replace_list(self, src_lst, dest_lst): - sset = set(src_lst) - if any([k in sset for k in dest_lst]): - masks = {} - for s in src_lst: - masks[s] = [b.values == s for b in self.blocks] - - for s, d in zip(src_lst, dest_lst): - [b.putmask(masks[s][i], d, inplace=True) for i, b in - enumerate(self.blocks)] - else: - for s, d in zip(src_lst, dest_lst): - self.replace(s, d, inplace=True) - - return self - @property def block_id_vector(self): # TODO @@ -1359,28 +1599,28 @@ def form_blocks(arrays, names, axes): blocks = [] if len(float_items): - float_block = _simple_blockify(float_items, items, np.float64) - blocks.append(float_block) + float_blocks = _multi_blockify(float_items, items) + blocks.extend(float_blocks) if len(complex_items): - complex_block = _simple_blockify(complex_items, items, np.complex128) - blocks.append(complex_block) + complex_blocks = _simple_blockify(complex_items, items, np.complex128) + blocks.extend(complex_blocks) if len(int_items): - int_block = _simple_blockify(int_items, items, np.int64) - blocks.append(int_block) + int_blocks = _multi_blockify(int_items, items) + blocks.extend(int_blocks) if len(datetime_items): - datetime_block = _simple_blockify(datetime_items, items, _NS_DTYPE) - blocks.append(datetime_block) + datetime_blocks = _simple_blockify(datetime_items, items, _NS_DTYPE) + blocks.extend(datetime_blocks) if len(bool_items): - bool_block = _simple_blockify(bool_items, items, np.bool_) - blocks.append(bool_block) + bool_blocks = _simple_blockify(bool_items, items, np.bool_) + blocks.extend(bool_blocks) if len(object_items) > 0: - object_block = _simple_blockify(object_items, items, np.object_) - blocks.append(object_block) + object_blocks = _simple_blockify(object_items, items, np.object_) + blocks.extend(object_blocks) if len(extra_items): shape = (len(extra_items),) + tuple(len(x) for x in axes[1:]) @@ -1398,14 +1638,31 @@ def form_blocks(arrays, names, axes): def _simple_blockify(tuples, ref_items, dtype): + """ return a single array of a block that has a single dtype; if dtype is not None, coerce to this dtype """ block_items, values = _stack_arrays(tuples, ref_items, dtype) + # CHECK DTYPE? 
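+    # e.g. all float64 items are stacked into one 2-D block here, while
+    # _multi_blockify (below) first groups items by dtype so that float32
+    # and float64 land in separate blocks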
-    if values.dtype != dtype:  # pragma: no cover
+    if dtype is not None and values.dtype != dtype:  # pragma: no cover
         values = values.astype(dtype)
 
-    return make_block(values, block_items, ref_items)
+    return [ make_block(values, block_items, ref_items) ]
 
+def _multi_blockify(tuples, ref_items, dtype = None):
+    """ return an array of blocks that potentially have different dtypes """
+
+    # group by dtype
+    grouper = itertools.groupby(tuples, lambda x: x[1].dtype)
+
+    new_blocks = []
+    for dtype, tup_block in grouper:
+
+        block_items, values = _stack_arrays(list(tup_block), ref_items, dtype)
+        block = make_block(values, block_items, ref_items)
+        new_blocks.append(block)
+
+    return new_blocks
+
 def _stack_arrays(tuples, ref_items, dtype):
     from pandas.core.series import Series
@@ -1451,17 +1708,27 @@ def _blocks_to_series_dict(blocks, index=None):
 
 def _interleaved_dtype(blocks):
+    if not len(blocks): return None
+
     from collections import defaultdict
-    counts = defaultdict(lambda: 0)
+    counts = defaultdict(lambda: [])
     for x in blocks:
-        counts[type(x)] += 1
-
-    have_int = counts[IntBlock] > 0
-    have_bool = counts[BoolBlock] > 0
-    have_object = counts[ObjectBlock] > 0
-    have_float = counts[FloatBlock] > 0
-    have_complex = counts[ComplexBlock] > 0
-    have_dt64 = counts[DatetimeBlock] > 0
+        counts[type(x)].append(x)
+
+    def _lcd_dtype(l):
+        """ find the lowest dtype that can accommodate the given types """
+        m = l[0].dtype
+        for x in l[1:]:
+            if x.dtype.itemsize > m.itemsize:
+                m = x.dtype
+        return m
+
+    have_int = len(counts[IntBlock]) > 0
+    have_bool = len(counts[BoolBlock]) > 0
+    have_object = len(counts[ObjectBlock]) > 0
+    have_float = len(counts[FloatBlock]) > 0
+    have_complex = len(counts[ComplexBlock]) > 0
+    have_dt64 = len(counts[DatetimeBlock]) > 0
     have_numeric = have_float or have_complex or have_int
 
     if (have_object or
@@ -1471,13 +1738,13 @@ def _interleaved_dtype(blocks):
     elif have_bool:
         return np.dtype(bool)
     elif have_int and not have_float and not have_complex:
-        return np.dtype('i8')
+        return _lcd_dtype(counts[IntBlock])
     elif have_dt64 and not have_float and not have_complex:
         return np.dtype('M8[ns]')
     elif have_complex:
         return np.dtype('c16')
     else:
-        return np.dtype('f8')
+        return _lcd_dtype(counts[FloatBlock])
 
 
 def _consolidate(blocks, items):
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 118b8dba24682..c3ae78b1b5e1f 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -779,18 +779,23 @@ def astype(self, dtype):
         casted = com._astype_nansafe(self.values, dtype)
         return self._constructor(casted, index=self.index, name=self.name)
 
-    def convert_objects(self, convert_dates=True):
+    def convert_objects(self, convert_dates=True, convert_numeric=True):
         """
         Attempt to infer better dtype
+        Always returns a copy
+
+        Parameters
+        ----------
+        convert_dates : if True, attempt to soft convert dates; if 'coerce', force conversion (and non-convertibles become NaT)
+        convert_numeric : if True, attempt to coerce to numbers (including strings); non-convertibles become NaN
 
         Returns
         -------
         converted : Series
         """
         if self.dtype == np.object_:
-            return Series(lib.maybe_convert_objects(
-                self, convert_datetime=convert_dates), self.index)
-        return self
+            return Series(com._possibly_convert_objects(self.values,convert_dates=convert_dates,convert_numeric=convert_numeric), index=self.index, name=self.name)
+        return self.copy()
 
     def repeat(self, reps):
         """
@@ -1027,7 +1032,8 @@ def _tidy_repr(self, max_vals=20):
 
     def _repr_footer(self):
         namestr = u"Name: %s, " % com.pprint_thing(
            self.name)
if self.name is not None else "" - return u'%sLength: %d' % (namestr, len(self)) + return u'%sLength: %d, Dtype: %s' % (namestr, len(self), + com.pprint_thing(self.dtype.name)) def to_string(self, buf=None, na_rep='NaN', float_format=None, nanRep=None, length=False, name=False): @@ -2410,6 +2416,12 @@ def reindex(self, index=None, method=None, level=None, fill_value=np.nan, new_values = com.take_1d(self.values, indexer, fill_value=fill_value) return Series(new_values, index=new_index, name=self.name) + def reindex_axis(self, labels, axis=0, **kwargs): + """ for compatibility with higher dims """ + if axis != 0: + raise ValueError("cannot reindex series on non-zero axis!") + return self.reindex(index=labels,**kwargs) + def reindex_like(self, other, method=None, limit=None): """ Reindex Series to match index of another Series, optionally with diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 6801b197fa849..556bcdb93477f 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -362,11 +362,11 @@ def _check_extension_int64(self, ext): self.frame.to_excel(path, 'test1', index=False) # Test np.int64, values read come back as float - frame = DataFrame(np.random.randint(-10, 10, size=(10, 2))) + frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np.int64) frame.to_excel(path, 'test1') reader = ExcelFile(path) recons = reader.parse('test1').astype(np.int64) - tm.assert_frame_equal(frame, recons) + tm.assert_frame_equal(frame, recons, check_dtype=False) os.remove(path) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index c7897e7def4d3..9cc749d23a3a9 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -388,6 +388,10 @@ def pad_inplace_%(name)s(ndarray[%(c_type)s] values, N = len(values) + # GH 2778 + if N == 0: + return + if limit is None: lim = N else: @@ -419,6 +423,10 @@ def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, K, N = ( values).shape + # GH 2778 + if N == 0: + return + if limit is None: lim = N else: @@ -451,6 +459,10 @@ def backfill_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, K, N = ( values).shape + # GH 2778 + if N == 0: + return + if limit is None: lim = N else: @@ -483,6 +495,10 @@ def backfill_inplace_%(name)s(ndarray[%(c_type)s] values, N = len(values) + # GH 2778 + if N == 0: + return + if limit is None: lim = N else: @@ -502,6 +518,52 @@ def backfill_inplace_%(name)s(ndarray[%(c_type)s] values, val = values[i] """ + +diff_2d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_%(name)s(ndarray[%(c_type)s, ndim=2] arr, + ndarray[%(dest_type2)s, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, 
stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +""" + is_monotonic_template = """@cython.boundscheck(False) @cython.wraparound(False) def is_monotonic_%(name)s(ndarray[%(c_type)s] arr): @@ -582,6 +644,965 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): """ +group_last_template = """@cython.wraparound(False) +@cython.wraparound(False) +def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +""" + +group_last_bin_template = """@cython.wraparound(False) +@cython.wraparound(False) +def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +""" + +group_nth_bin_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +""" + +group_nth_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i 
in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +""" + +group_add_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] +""" + +group_add_bin_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b, nbins + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] +""" + +group_prod_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] +""" + +group_prod_bin_template = """@cython.boundscheck(False) 
+@cython.wraparound(False) +def group_prod_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + prodx[b, 0] *= val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] +""" + +group_var_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, ct + ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + + + for i in range(len(counts)): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) +""" + +group_var_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, ct + ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) +""" + +# add passing bin edges, instead of labels + + 
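+# Each template above is expanded once per dtype by generate_put_template
+# further below; roughly, an illustrative expansion for float32 looks like:
+#
+#   group_add_float32 = group_add_template % {
+#       'name': 'float32', 'c_type': 'float32_t',
+#       'dest_type': 'float32', 'dest_type2': 'float32_t',
+#       'dest_dtype': 'np.float32'}
+#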
+#---------------------------------------------------------------------- +# group_min, group_max + +group_min_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] +""" + +group_max_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] +""" + +group_max_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] 
+""" + + +group_min_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] +""" + + +group_mean_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count +""" + +group_mean_bin_template = """ +def group_mean_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count +""" + +group_ohlc_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + %(dest_type2)s vopen, vhigh, vlow, vclose, NA + bint 
got_first = 0 + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + NA = np.nan + + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose +""" + arrmap_template = """@cython.wraparound(False) @cython.boundscheck(False) def arrmap_%(name)s(ndarray[%(c_type)s] index, object func): @@ -1100,6 +2121,9 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, ensure_functions = [ ('float64', 'FLOAT64', 'float64'), + ('float32', 'FLOAT32', 'float32'), + ('int8', 'INT8', 'int8'), + ('int16', 'INT16', 'int16'), ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), # ('platform_int', 'INT', 'int_'), @@ -1133,26 +2157,30 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, #------------------------------------------------------------------------- # Generators -def generate_put_functions(): - function_list = [ - ('float64', 'float64_t', 'object'), - ('float64', 'float64_t', 'float64_t'), - ('object', 'object', 'object'), - ('int32', 'int32_t', 'int64_t'), - ('int32', 'int32_t', 'float64_t'), - ('int32', 'int32_t', 'object'), - ('int64', 'int64_t', 'int64_t'), - ('int64', 'int64_t', 'float64_t'), - ('int64', 'int64_t', 'object'), - ('bool', 'uint8_t', 'uint8_t'), - ('bool', 'uint8_t', 'object') - ] +def generate_put_template(template, use_ints = True, use_floats = True): + floats_list = [ + ('float64', 'float64_t', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'float32_t', 'np.float32'), + ] + ints_list = [ + ('int8', 'int8_t', 'float32_t', 'np.float32'), + ('int16', 'int16_t', 'float32_t', 'np.float32'), + ('int32', 'int32_t', 'float64_t', 'np.float64'), + ('int64', 'int64_t', 'float64_t', 'np.float64'), + ] + function_list = [] + if use_floats: + function_list.extend(floats_list) + if use_ints: + function_list.extend(ints_list) output = StringIO() - for name, c_type, dest_type in function_list: - func = put2d_template % {'name' : name, 'c_type' : c_type, - 'dest_type' : dest_type.replace('_t', ''), - 'dest_type2' : dest_type} + for name, c_type, dest_type, dest_dtype in function_list: + func = template % {'name' : name, + 'c_type' : c_type, + 'dest_type' : dest_type.replace('_t', ''), + 'dest_type2' : dest_type, + 'dest_dtype' : dest_dtype} output.write(func) return output.getvalue() @@ -1160,10 +2188,13 @@ def generate_put_functions(): # name, ctype, capable of holding NA function_list = [ ('float64', 'float64_t', 'np.float64', True), - ('object', 'object', 'object', True), + ('float32', 'float32_t', 'np.float32', True), + ('object','object', 'object', True), + ('int8', 'int8_t', 'np.int8', False), + ('int16', 'int16_t', 'np.int16', False), ('int32', 'int32_t', 'np.int32', False), ('int64', 'int64_t', 'np.int64', False), - ('bool', 
'uint8_t', 'np.bool', False) + ('bool', 'uint8_t', 'np.bool', False) ] def generate_from_template(template, ndim=1, exclude=None): @@ -1178,6 +2209,25 @@ def generate_from_template(template, ndim=1, exclude=None): output.write(func) return output.getvalue() +put_2d = [diff_2d_template] +groupbys = [group_last_template, + group_last_bin_template, + group_nth_template, + group_nth_bin_template, + group_add_template, + group_add_bin_template, + group_prod_template, + group_prod_bin_template, + group_var_template, + group_var_bin_template, + group_mean_template, + group_mean_bin_template, + group_min_template, + group_min_bin_template, + group_max_template, + group_max_bin_template, + group_ohlc_template] + templates_1d = [map_indices_template, pad_template, backfill_template, @@ -1211,6 +2261,12 @@ def generate_take_cython_file(path='generated.pyx'): for template in templates_2d: print >> f, generate_from_template(template, ndim=2) + for template in put_2d: + print >> f, generate_put_template(template) + + for template in groupbys: + print >> f, generate_put_template(template, use_ints = False) + # for template in templates_1d_datetime: # print >> f, generate_from_template_datetime(template) diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 5ecd8439a13ec..a20fb5668aec9 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -57,6 +57,36 @@ cpdef ensure_float64(object arr): return np.array(arr, dtype=np.float64) +cpdef ensure_float32(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_FLOAT32: + return arr + else: + return arr.astype(np.float32) + else: + return np.array(arr, dtype=np.float32) + + +cpdef ensure_int8(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT8: + return arr + else: + return arr.astype(np.int8) + else: + return np.array(arr, dtype=np.int8) + + +cpdef ensure_int16(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT16: + return arr + else: + return arr.astype(np.int16) + else: + return np.array(arr, dtype=np.int16) + + cpdef ensure_int32(object arr): if util.is_array(arr): if ( arr).descr.type_num == NPY_INT32: @@ -109,6 +139,28 @@ cpdef map_indices_float64(ndarray[float64_t] index): return result +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_float32(ndarray[float32_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + @cython.wraparound(False) @cython.boundscheck(False) cpdef map_indices_object(ndarray[object] index): @@ -131,6 +183,50 @@ cpdef map_indices_object(ndarray[object] index): return result +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int8(ndarray[int8_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int16(ndarray[int16_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + @cython.wraparound(False) @cython.boundscheck(False) cpdef map_indices_int32(ndarray[int32_t] index): @@ -259,6 +355,67 @@ def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, return indexer +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float32_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + @cython.boundscheck(False) @cython.wraparound(False) def pad_object(ndarray[object] old, ndarray[object] new, @@ -322,11 +479,11 @@ def pad_object(ndarray[object] old, ndarray[object] new, @cython.boundscheck(False) @cython.wraparound(False) -def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, +def pad_int8(ndarray[int8_t] old, ndarray[int8_t] new, limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, next + cdef int8_t cur, next cdef int lim, fill_count = 0 nleft = len(old) @@ -383,11 +540,11 @@ def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, @cython.boundscheck(False) @cython.wraparound(False) -def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, +def pad_int16(ndarray[int16_t] old, ndarray[int16_t] new, limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, next + cdef int16_t cur, next cdef int lim, fill_count = 0 nleft = len(old) @@ -444,11 +601,11 @@ def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, @cython.boundscheck(False) @cython.wraparound(False) -def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, +def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, next + cdef int32_t cur, next cdef int lim, fill_count = 0 nleft = len(old) @@ -503,14 +660,13 @@ def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, return indexer - @cython.boundscheck(False) @cython.wraparound(False) -def 
backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): +def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, prev + cdef int64_t cur, next cdef int lim, fill_count = 0 nleft = len(old) @@ -525,54 +681,53 @@ def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, raise ValueError('Limit must be non-negative') lim = limit - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: return indexer - i = nleft - 1 - j = nright - 1 + i = j = 0 - cur = old[nleft - 1] + cur = old[0] - while j >= 0 and new[j] > cur: - j -= 1 + while j <= nright - 1 and new[j] < cur: + j += 1 while True: - if j < 0: + if j == nright: break - if i == 0: - while j >= 0: + if i == nleft - 1: + while j < nright: if new[j] == cur: indexer[j] = i - elif new[j] < cur and fill_count < lim: + elif new[j] > cur and fill_count < lim: indexer[j] = i fill_count += 1 - j -= 1 + j += 1 break - prev = old[i - 1] + next = old[i + 1] - while j >= 0 and prev < new[j] <= cur: + while j < nright and cur <= new[j] < next: if new[j] == cur: indexer[j] = i - elif new[j] < cur and fill_count < lim: + elif fill_count < lim: indexer[j] = i fill_count += 1 - j -= 1 + j += 1 fill_count = 0 - i -= 1 - cur = prev + i += 1 + cur = next return indexer @cython.boundscheck(False) @cython.wraparound(False) -def backfill_object(ndarray[object] old, ndarray[object] new, - limit=None): +def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, prev + cdef uint8_t cur, next cdef int lim, fill_count = 0 nleft = len(old) @@ -587,54 +742,54 @@ def backfill_object(ndarray[object] old, ndarray[object] new, raise ValueError('Limit must be non-negative') lim = limit - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: return indexer - i = nleft - 1 - j = nright - 1 + i = j = 0 - cur = old[nleft - 1] + cur = old[0] - while j >= 0 and new[j] > cur: - j -= 1 + while j <= nright - 1 and new[j] < cur: + j += 1 while True: - if j < 0: + if j == nright: break - if i == 0: - while j >= 0: + if i == nleft - 1: + while j < nright: if new[j] == cur: indexer[j] = i - elif new[j] < cur and fill_count < lim: + elif new[j] > cur and fill_count < lim: indexer[j] = i fill_count += 1 - j -= 1 + j += 1 break - prev = old[i - 1] + next = old[i + 1] - while j >= 0 and prev < new[j] <= cur: + while j < nright and cur <= new[j] < next: if new[j] == cur: indexer[j] = i - elif new[j] < cur and fill_count < lim: + elif fill_count < lim: indexer[j] = i fill_count += 1 - j -= 1 + j += 1 fill_count = 0 - i -= 1 - cur = prev + i += 1 + cur = next return indexer + @cython.boundscheck(False) @cython.wraparound(False) -def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, +def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, prev + cdef float64_t cur, prev cdef int lim, fill_count = 0 nleft = len(old) @@ -692,11 +847,11 @@ def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, @cython.boundscheck(False) @cython.wraparound(False) -def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, +def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new, limit=None): cdef 
Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, prev + cdef float32_t cur, prev cdef int lim, fill_count = 0 nleft = len(old) @@ -754,11 +909,11 @@ def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, @cython.boundscheck(False) @cython.wraparound(False) -def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, +def backfill_object(ndarray[object] old, ndarray[object] new, limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, prev + cdef object cur, prev cdef int lim, fill_count = 0 nleft = len(old) @@ -814,192 +969,332 @@ def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, return indexer - @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val +def backfill_int8(ndarray[int8_t] old, ndarray[int8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int8_t cur, prev cdef int lim, fill_count = 0 - N = len(values) + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) if limit is None: - lim = N + lim = nright else: if limit < 0: raise ValueError('Limit must be non-negative') lim = limit - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 + i = nleft - 1 + j = nright - 1 - N = len(values) + cur = old[nleft - 1] - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit + while j >= 0 and new[j] > cur: + j -= 1 - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] + while True: + if j < 0: + break -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break - N = len(values) + prev = old[i - 1] - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] + fill_count = 0 + i -= 1 + cur = prev + + return indexer @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val +def backfill_int16(ndarray[int16_t] old, ndarray[int16_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + 
cdef int16_t cur, prev cdef int lim, fill_count = 0 - N = len(values) + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) if limit is None: - lim = N + lim = nright else: if limit < 0: raise ValueError('Limit must be non-negative') lim = limit - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val +def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, prev cdef int lim, fill_count = 0 - N = len(values) + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) if limit is None: - lim = N + lim = nright else: if limit < 0: raise ValueError('Limit must be non-negative') lim = limit - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val +def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, prev cdef int lim, fill_count = 0 - N = len(values) + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) if limit is None: - lim = N + lim = nright else: if limit < 0: raise ValueError('Limit must be non-negative') lim = limit - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - 
cdef object val - cdef int lim, fill_count = 0 + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 N = len(values) + # GH 2778 + if N == 0: + return + if limit is None: lim = N else: @@ -1007,8 +1302,8 @@ def backfill_inplace_object(ndarray[object] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[N - 1] - for i in range(N - 1, -1 , -1): + val = values[0] + for i in range(N): if mask[i]: if fill_count >= lim: continue @@ -1017,17 +1312,22 @@ def backfill_inplace_object(ndarray[object] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def pad_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): cdef Py_ssize_t i, N - cdef int32_t val + cdef float32_t val cdef int lim, fill_count = 0 N = len(values) + # GH 2778 + if N == 0: + return + if limit is None: lim = N else: @@ -1035,8 +1335,8 @@ def backfill_inplace_int32(ndarray[int32_t] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[N - 1] - for i in range(N - 1, -1 , -1): + val = values[0] + for i in range(N): if mask[i]: if fill_count >= lim: continue @@ -1045,17 +1345,22 @@ def backfill_inplace_int32(ndarray[int32_t] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def pad_inplace_object(ndarray[object] values, + ndarray[uint8_t, 
cast=True] mask, + limit=None): cdef Py_ssize_t i, N - cdef int64_t val + cdef object val cdef int lim, fill_count = 0 N = len(values) + # GH 2778 + if N == 0: + return + if limit is None: lim = N else: @@ -1063,8 +1368,8 @@ def backfill_inplace_int64(ndarray[int64_t] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[N - 1] - for i in range(N - 1, -1 , -1): + val = values[0] + for i in range(N): if mask[i]: if fill_count >= lim: continue @@ -1073,17 +1378,22 @@ def backfill_inplace_int64(ndarray[int64_t] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def pad_inplace_int8(ndarray[int8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): cdef Py_ssize_t i, N - cdef uint8_t val + cdef int8_t val cdef int lim, fill_count = 0 N = len(values) + # GH 2778 + if N == 0: + return + if limit is None: lim = N else: @@ -1091,8 +1401,8 @@ def backfill_inplace_bool(ndarray[uint8_t] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[N - 1] - for i in range(N - 1, -1 , -1): + val = values[0] + for i in range(N): if mask[i]: if fill_count >= lim: continue @@ -1104,14 +1414,18 @@ def backfill_inplace_bool(ndarray[uint8_t] values, @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val +def pad_inplace_int16(ndarray[int16_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int16_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1120,28 +1434,31 @@ def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val +def pad_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1150,28 +1467,31 @@ def pad_2d_inplace_object(ndarray[object, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef 
Py_ssize_t i, j, N, K - cdef int32_t val +def pad_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1180,28 +1500,31 @@ def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1210,28 +1533,32 @@ def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val +def backfill_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1240,29 +1567,30 @@ def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val +def backfill_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float32_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1271,28 +1599,30 @@ def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, raise 
ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K +def backfill_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N cdef object val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1301,28 +1631,30 @@ def backfill_2d_inplace_object(ndarray[object, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val +def backfill_inplace_int8(ndarray[int8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int8_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1331,28 +1663,30 @@ def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val +def backfill_inplace_int16(ndarray[int16_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int16_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1361,28 +1695,30 @@ def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 
1 + values[i] = val + else: + fill_count = 0 + val = values[i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val +def backfill_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val cdef int lim, fill_count = 0 - K, N = ( values).shape + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: lim = N @@ -1391,533 +1727,640 @@ def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, raise ValueError('Limit must be non-negative') lim = limit - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_float64(ndarray[float64_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - ndarray[float64_t] outbuf - float64_t fv +def backfill_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 - n = len(indexer) + N = len(values) - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + # GH 2778 + if N == 0: + return - if False and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] + if limit is None: + lim = N else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_object(ndarray[object] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - ndarray[object] outbuf - object fv +def backfill_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 - n = len(indexer) + N = len(values) - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + # GH 2778 + if N == 0: + return - if False and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] + if limit is None: + lim = N else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_int32(ndarray[int32_t] values, - 
ndarray[int64_t] indexer, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - ndarray[int32_t] outbuf - int32_t fv +def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 - n = len(indexer) + K, N = ( values).shape - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + # GH 2778 + if N == 0: + return - if True and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] + if limit is None: + lim = N else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_int64(ndarray[int64_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - ndarray[int64_t] outbuf - int64_t fv +def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val + cdef int lim, fill_count = 0 - n = len(indexer) + K, N = ( values).shape - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + # GH 2778 + if N == 0: + return - if True and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] + if limit is None: + lim = N else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_bool(ndarray[uint8_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - ndarray[uint8_t] outbuf - uint8_t fv +def pad_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 - n = len(indexer) + K, N = ( values).shape - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + # GH 2778 + if N == 0: + return - if True and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] + if limit is None: + lim = N else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] - + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) 
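+# The pad_2d_inplace_<dtype> variants forward-fill masked entries row by row,
+# in place, resetting the fill counter at every unmasked value; the `# GH 2778`
+# guard returns early on empty input so values[j, 0] is never read.
+# Illustrative: row [1, ?, ?, 4] with mask [F, T, T, F] and limit=1
+# becomes [1, 1, ?, 4].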
@cython.wraparound(False) -def is_monotonic_float64(ndarray[float64_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' - cdef: - Py_ssize_t i, n - float64_t prev, cur - bint is_unique = 1 - - n = len(arr) +def pad_2d_inplace_int8(ndarray[int8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int8_t val + cdef int lim, fill_count = 0 - if n < 2: - return True, True + K, N = ( values).shape - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_object(ndarray[object] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' - cdef: - Py_ssize_t i, n - object prev, cur - bint is_unique = 1 +def pad_2d_inplace_int16(ndarray[int16_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int16_t val + cdef int lim, fill_count = 0 - n = len(arr) + K, N = ( values).shape - if n < 2: - return True, True + # GH 2778 + if N == 0: + return - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_int32(ndarray[int32_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' - cdef: - Py_ssize_t i, n - int32_t prev, cur - bint is_unique = 1 +def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 - n = len(arr) + K, N = ( values).shape - if n < 2: - return True, True + # GH 2778 + if N == 0: + return - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_int64(ndarray[int64_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' - cdef: - Py_ssize_t i, n - int64_t prev, cur - bint is_unique = 1 +def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 - n = len(arr) + K, N = ( values).shape - if n < 2: - return True, True + # GH 2778 + if N == 0: + 
return - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_bool(ndarray[uint8_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' - cdef: - Py_ssize_t i, n - uint8_t prev, cur - bint is_unique = 1 +def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 - n = len(arr) + K, N = ( values).shape - if n < 2: - return True, True + # GH 2778 + if N == 0: + return - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] -@cython.wraparound(False) @cython.boundscheck(False) -def groupby_float64(ndarray[float64_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key +@cython.wraparound(False) +def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 - length = len(index) + K, N = ( values).shape - for i in range(length): - key = util.get_value_1d(labels, i) + # GH 2778 + if N == 0: + return - if _checknull(key): - continue + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) -def groupby_object(ndarray[object] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) +@cython.wraparound(False) +def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val + cdef int lim, fill_count = 0 - if _checknull(key): - continue + K, N = ( values).shape - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] + # GH 2778 + if N == 0: + return - return result + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit -@cython.wraparound(False) + for j in range(K): + fill_count = 0 + 
val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) -def groupby_int32(ndarray[int32_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) +@cython.wraparound(False) +def backfill_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 - if _checknull(key): - continue + K, N = ( values).shape - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] + # GH 2778 + if N == 0: + return - return result + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit -@cython.wraparound(False) + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) -def groupby_int64(ndarray[int64_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) +@cython.wraparound(False) +def backfill_2d_inplace_int8(ndarray[int8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int8_t val + cdef int lim, fill_count = 0 - if _checknull(key): - continue + K, N = ( values).shape - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] + # GH 2778 + if N == 0: + return - return result + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit -@cython.wraparound(False) + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) -def groupby_bool(ndarray[uint8_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) - - if _checknull(key): - continue +@cython.wraparound(False) +def backfill_2d_inplace_int16(ndarray[int16_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int16_t val + cdef int lim, fill_count = 0 - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] + K, N = ( values).shape - return result + # GH 2778 + if N == 0: + return + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit -@cython.wraparound(False) + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) -def arrmap_float64(ndarray[float64_t] index, object func): - cdef 
Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) +@cython.wraparound(False) +def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 - from pandas.lib import maybe_convert_objects + K, N = ( values).shape - for i in range(length): - result[i] = func(index[i]) + # GH 2778 + if N == 0: + return - return maybe_convert_objects(result) + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit -@cython.wraparound(False) + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) -def arrmap_object(ndarray[object] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) +@cython.wraparound(False) +def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 - from pandas.lib import maybe_convert_objects + K, N = ( values).shape - for i in range(length): - result[i] = func(index[i]) + # GH 2778 + if N == 0: + return - return maybe_convert_objects(result) + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int32(ndarray[int32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) -def arrmap_int64(ndarray[int64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - @cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_bool(ndarray[uint8_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) +def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 - from pandas.lib import maybe_convert_objects + K, N = ( values).shape - for i in range(length): - result[i] = func(index[i]) + # GH 2778 + if N == 0: + return - return maybe_convert_objects(result) + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if 
mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_float64(ndarray[float64_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf + Py_ssize_t i, n, idx + ndarray[float64_t] outbuf float64_t fv n = len(indexer) - k = values.shape[1] if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + outbuf = np.empty(n, dtype=values.dtype) else: outbuf = out @@ -1925,37 +2368,31 @@ def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, for i in range(n): idx = indexer[i] if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') + raise ValueError('No NA values allowed') else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] else: fv = fill_value for i in range(n): idx = indexer[i] if idx == -1: - for j in range(k): - outbuf[i, j] = fv + outbuf[i] = fv else: - for j in range(k): - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_float32(ndarray[float32_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - object fv + Py_ssize_t i, n, idx + ndarray[float32_t] outbuf + float32_t fv n = len(indexer) - k = values.shape[1] if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + outbuf = np.empty(n, dtype=values.dtype) else: outbuf = out @@ -1963,75 +2400,63 @@ def take_2d_axis0_object(ndarray[object, ndim=2] values, for i in range(n): idx = indexer[i] if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') + raise ValueError('No NA values allowed') else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] else: fv = fill_value for i in range(n): idx = indexer[i] if idx == -1: - for j in range(k): - outbuf[i, j] = fv + outbuf[i] = fv else: - for j in range(k): - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_object(ndarray[object] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - int32_t fv + Py_ssize_t i, n, idx + ndarray[object] outbuf + object fv n = len(indexer) - k = values.shape[1] if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + outbuf = np.empty(n, dtype=values.dtype) else: outbuf = out - if True and _checknan(fill_value): + if False and _checknan(fill_value): for i in range(n): idx = indexer[i] if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') + raise ValueError('No NA values allowed') else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] else: fv = fill_value for i in range(n): idx = indexer[i] if idx == -1: - for j in range(k): - outbuf[i, j] = fv + outbuf[i] = fv else: - for j in range(k): - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] @cython.wraparound(False) 
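+# take_1d_<dtype> gathers values[indexer[i]] into outbuf; an indexer entry of
+# -1 is written as fill_value.  The integer and bool variants cannot hold NA,
+# so a NaN fill_value raises ValueError as soon as a -1 entry is hit.
+# Illustrative: take_1d_float64(array([10., 20., 30.]), array([2, -1]))
+# fills outbuf with [30., nan].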
-@cython.boundscheck(False) -def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_int8(ndarray[int8_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - int64_t fv + Py_ssize_t i, n, idx + ndarray[int8_t] outbuf + int8_t fv n = len(indexer) - k = values.shape[1] if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + outbuf = np.empty(n, dtype=values.dtype) else: outbuf = out @@ -2039,37 +2464,31 @@ def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, for i in range(n): idx = indexer[i] if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') + raise ValueError('No NA values allowed') else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] else: fv = fill_value for i in range(n): idx = indexer[i] if idx == -1: - for j in range(k): - outbuf[i, j] = fv + outbuf[i] = fv else: - for j in range(k): - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_int16(ndarray[int16_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - uint8_t fv + Py_ssize_t i, n, idx + ndarray[int16_t] outbuf + int16_t fv n = len(indexer) - k = values.shape[1] if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + outbuf = np.empty(n, dtype=values.dtype) else: outbuf = out @@ -2077,647 +2496,4928 @@ def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, for i in range(n): idx = indexer[i] if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') + raise ValueError('No NA values allowed') else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + outbuf[i] = values[idx] else: fv = fill_value for i in range(n): idx = indexer[i] if idx == -1: - for j in range(k): - outbuf[i, j] = fv + outbuf[i] = fv else: - for j in range(k): - outbuf[i, j] = values[idx, j] - + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_int32(ndarray[int32_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf - float64_t fv + Py_ssize_t i, n, idx + ndarray[int32_t] outbuf + int32_t fv - n = len(values) - k = len(indexer) + n = len(indexer) if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + outbuf = np.empty(n, dtype=values.dtype) else: outbuf = out - if False and _checknan(fill_value): - for j in range(k): - idx = indexer[j] - + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] if idx == -1: - for i in range(n): - raise ValueError('No NA values allowed') + raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + outbuf[i] = values[idx] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i in range(n): + idx = indexer[i] if idx == -1: - for i in range(n): - outbuf[i, j] = fv + outbuf[i] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def 
take_2d_axis1_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - object fv +def take_1d_int64(ndarray[int64_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[int64_t] outbuf + int64_t fv - n = len(values) - k = len(indexer) + n = len(indexer) if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + outbuf = np.empty(n, dtype=values.dtype) else: outbuf = out - if False and _checknan(fill_value): - for j in range(k): - idx = indexer[j] - + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] if idx == -1: - for i in range(n): - raise ValueError('No NA values allowed') + raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + outbuf[i] = values[idx] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i in range(n): + idx = indexer[i] if idx == -1: - for i in range(n): - outbuf[i, j] = fv + outbuf[i] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_bool(ndarray[uint8_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - int32_t fv + Py_ssize_t i, n, idx + ndarray[uint8_t] outbuf + uint8_t fv - n = len(values) - k = len(indexer) + n = len(indexer) if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + outbuf = np.empty(n, dtype=values.dtype) else: outbuf = out if True and _checknan(fill_value): - for j in range(k): - idx = indexer[j] - + for i in range(n): + idx = indexer[i] if idx == -1: - for i in range(n): - raise ValueError('No NA values allowed') + raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + outbuf[i] = values[idx] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i in range(n): + idx = indexer[i] if idx == -1: - for i in range(n): - outbuf[i, j] = fv + outbuf[i] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + outbuf[i] = values[idx] + -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +@cython.wraparound(False) +def is_monotonic_float64(ndarray[float64_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - int64_t fv + Py_ssize_t i, n + float64_t prev, cur + bint is_unique = 1 - n = len(values) - k = len(indexer) + n = len(arr) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + if n < 2: + return True, True - if True and _checknan(fill_value): - for j in range(k): - idx = indexer[j] + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_float32(ndarray[float32_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + float32_t prev, cur + bint is_unique = 1 - if idx == -1: - for i in range(n): - raise ValueError('No NA values allowed') - else: - 
for i in range(n): - outbuf[i, j] = values[i, idx] - else: - fv = fill_value - for j in range(k): - idx = indexer[j] + n = len(arr) - if idx == -1: - for i in range(n): - outbuf[i, j] = fv - else: - for i in range(n): - outbuf[i, j] = values[i, idx] + if n < 2: + return True, True -@cython.wraparound(False) + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique @cython.boundscheck(False) -def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +@cython.wraparound(False) +def is_monotonic_object(ndarray[object] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' cdef: - Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - uint8_t fv - - n = len(values) - k = len(indexer) + Py_ssize_t i, n + object prev, cur + bint is_unique = 1 - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - if True and _checknan(fill_value): - for j in range(k): - idx = indexer[j] + if n < 2: + return True, True - if idx == -1: - for i in range(n): - raise ValueError('No NA values allowed') - else: - for i in range(n): - outbuf[i, j] = values[i, idx] - else: - fv = fill_value - for j in range(k): - idx = indexer[j] + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int8(ndarray[int8_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + int8_t prev, cur + bint is_unique = 1 - if idx == -1: - for i in range(n): - outbuf[i, j] = fv - else: - for i in range(n): - outbuf[i, j] = values[i, idx] + n = len(arr) + if n < 2: + return True, True -@cython.wraparound(False) + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique @cython.boundscheck(False) -def take_2d_multi_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +@cython.wraparound(False) +def is_monotonic_int16(ndarray[int16_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' cdef: - Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf - float64_t fv - - n = len(idx0) - k = len(idx1) + Py_ssize_t i, n + int16_t prev, cur + bint is_unique = 1 - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int32(ndarray[int32_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + int32_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int64(ndarray[int64_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + int64_t prev, cur + bint 
is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_bool(ndarray[uint8_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + uint8_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_float64(ndarray[float64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_float32(ndarray[float32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_object(ndarray[object] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int8(ndarray[int8_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int16(ndarray[int16_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int32(ndarray[int32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) 
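+# The groupby_<dtype> specializations share one template: build a dict
+# mapping each non-null label (checked with _checknull) to the list of
+# index values found at the same positions.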
+@cython.boundscheck(False) +def groupby_int64(ndarray[int64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_bool(ndarray[uint8_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float64(ndarray[float64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float32(ndarray[float32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_object(ndarray[object] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int8(ndarray[int8_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int16(ndarray[int16_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int32(ndarray[int32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int64(ndarray[int64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib 
import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_bool(ndarray[uint8_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + float64_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float32_t, ndim=2] outbuf + float32_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + object fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int8_t, ndim=2] outbuf + int8_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] 
= values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int16_t, ndim=2] outbuf + int16_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + int32_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + int64_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + uint8_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values, + 
ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + float64_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float32_t, ndim=2] outbuf + float32_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + object fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int8_t, ndim=2] outbuf + int8_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int16_t, ndim=2] outbuf + int16_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if 
idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + int32_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + int64_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + uint8_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + float64_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if False and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + 
for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float32_t, ndim=2] outbuf + float32_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if False and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_object(ndarray[object, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + object fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if False and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int8_t, ndim=2] outbuf + int8_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if True and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int16_t, ndim=2] outbuf + int16_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if True and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise 
ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + int32_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if True and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + int64_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if True and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + uint8_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if True and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float64(ndarray[float64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + 
out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float32(ndarray[float32_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int8(ndarray[int8_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int16(ndarray[int16_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, 
stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int32(ndarray[int32_t, ndim=2] arr,
+                  ndarray[float64_t, ndim=2] out,
+                  Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int64(ndarray[int64_t, ndim=2] arr,
+                  ndarray[float64_t, ndim=2] out,
+                  Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_last_float64(ndarray[float64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float64_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_last_float32(ndarray[float32_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float32_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_last_bin_float64(ndarray[float64_t, ndim=2] out,
+                           ndarray[int64_t] counts,
+                           ndarray[float64_t, ndim=2] values,
+                           ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx, nobs
+
+    nobs = np.zeros_like(out)
+    resx = np.empty_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_last_bin_float32(ndarray[float32_t, ndim=2] out,
+                           ndarray[int64_t] counts,
+                           ndarray[float32_t, ndim=2] values,
+                           ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx, nobs
+
+    nobs = np.zeros_like(out)
+    resx = np.empty_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_float64(ndarray[float64_t, ndim=2] out,
+                      ndarray[int64_t] counts,
+                      ndarray[float64_t, ndim=2] values,
+                      ndarray[int64_t] labels, int64_t rank):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_float32(ndarray[float32_t, ndim=2] out,
+                      ndarray[int64_t] counts,
+                      ndarray[float32_t, ndim=2] values,
+                      ndarray[int64_t] labels, int64_t rank):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if 
lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = 
labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b, nbins + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b, nbins + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + 
out[i, j] = nan + else: + out[i, j] = prodx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + prodx[b, 0] *= val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + prodx[b, 0] *= val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) 
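+    # One pass accumulates count, sum and sum of squares per group; the
+    # sample variance is then (ct * sumxx - sumx**2) / (ct * (ct - 1)),
+    # the unbiased (N - 1) estimator, with NaN for groups where ct < 2.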
+ + N, K = ( values).shape + + if K > 1: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + + + for i in range(len(counts)): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) +@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, ct + ndarray[float32_t, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + + + for i in range(len(counts)): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) +@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, ct + ndarray[float32_t, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= 
bins[b]: + b += 1 + + counts[b] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count + + +def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count + +def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] 
counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = (<object> values).shape + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count +
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = (<object> values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j]
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = (<object> values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] +
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = (<object> values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j]
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = (<object> values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] +
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = (<object> values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j]
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = (<object> values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] +
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = (<object> values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j]
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = (<object> values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] +
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + float64_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = (<object> values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + NA = np.nan + + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose
+@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + float32_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = (<object> values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + NA = np.nan + + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose +
+@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer +
+@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer +
+@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + object lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + +
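# Illustrative reference (not part of the patch): the group_*_bin kernels
# above assign rows to bins by walking the monotonic ``bins`` offsets and
# skip NaNs via the ``val == val`` self-comparison (NaN != NaN). A hedged
# NumPy sketch of the OHLC kernel's semantics, simplified to a 1-D input;
# all names here are illustrative:
import numpy as np

def group_ohlc_reference(values, bins):
    # values: 1-D float array; bins: increasing row offsets, as in the kernel
    ngroups = len(bins) if bins[-1] == len(values) else len(bins) + 1
    edges = np.concatenate(([0], bins, [len(values)]))[:ngroups + 1]
    out = np.full((ngroups, 4), np.nan)
    for b in range(ngroups):
        chunk = values[edges[b]:edges[b + 1]]
        chunk = chunk[chunk == chunk]   # drop NaNs
        if len(chunk):
            out[b] = [chunk[0], chunk.max(), chunk.min(), chunk[-1]]  # o/h/l/c
    return out

# e.g. group_ohlc_reference(np.array([1., 2., np.nan, 0.5]), np.array([2, 4]))
# gives one row per slice [0:2) and [2:4), the second ignoring the NaN.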
rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int8(ndarray[int8_t] left, + ndarray[int8_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int8_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int16(ndarray[int16_t] left, + ndarray[int16_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int16_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + + +def left_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break - if False and _checknan(fill_value): - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - raise ValueError('No NA values allowed') + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_object(ndarray[object] left, + ndarray[object] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 else: - outbuf[i, j] = values[idx, idx1[j]] - else: - fv = fill_value - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - outbuf[i, j] = fv + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + j += 1 
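# Illustrative reference (not part of the patch): every
# left_join_indexer_<dtype> above uses the same two-pass scheme over two
# monotonic arrays -- pass 1 only counts the output rows so the indexers can
# be preallocated without resizing, pass 2 repeats the walk and fills them.
# A simplified single-pass sketch that assumes unique values on the right
# (the Cython version also handles duplicate runs); names are illustrative:
import numpy as np

def left_join_indexer_reference(left, right):
    lidx, ridx, result = [], [], []
    j = 0
    for i, lval in enumerate(left):
        while j < len(right) and right[j] < lval:
            j += 1                          # skip right values below lval
        matched = j < len(right) and right[j] == lval
        lidx.append(i)
        ridx.append(j if matched else -1)   # -1 marks "no match", as above
        result.append(lval)
    return (np.asarray(result),
            np.asarray(lidx, dtype=np.int64),
            np.asarray(ridx, dtype=np.int64))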
-@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_object(ndarray[object, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): + return result, lindexer, rindexer + + +def left_join_indexer_int8(ndarray[int8_t] left, + ndarray[int8_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' cdef: - Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - object fv + Py_ssize_t i, j, k, nright, nleft, count + int8_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int8_t] result - n = len(idx0) - k = len(idx1) + nleft = len(left) + nright = len(right) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + lval = left[i] + rval = right[j] - if False and _checknan(fill_value): - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - raise ValueError('No NA values allowed') - else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 else: - outbuf[i, j] = values[idx, idx1[j]] - else: - fv = fill_value - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - outbuf[i, j] = fv + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + j += 1 -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - int32_t fv + # do it again now that result size is known - n = len(idx0) - k = len(idx1) + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int8) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + lval = left[i] + rval = right[j] - if True and _checknan(fill_value): - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - raise ValueError('No NA values allowed') - else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 else: - outbuf[i, j] = values[idx, idx1[j]] - else: - fv = fill_value - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - outbuf[i, j] = fv + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + j += 1 -@cython.wraparound(False) 
-@cython.boundscheck(False) -def take_2d_multi_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): + return result, lindexer, rindexer + + +def left_join_indexer_int16(ndarray[int16_t] left, + ndarray[int16_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - int64_t fv + Py_ssize_t i, j, k, nright, nleft, count + int16_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int16_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 - n = len(idx0) - k = len(idx1) + # do it again now that result size is known - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int16) + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break - if True and _checknan(fill_value): - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - raise ValueError('No NA values allowed') - else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 else: - outbuf[i, j] = values[idx, idx1[j]] - else: - fv = fill_value - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - outbuf[i, j] = fv + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + j += 1 -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): + return result, lindexer, rindexer + + +def left_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' cdef: - Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - uint8_t fv + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result - n = len(idx0) - k = len(idx1) + nleft = len(left) + nright = len(right) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + lval = left[i] + rval = right[j] - if True and _checknan(fill_value): - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - raise ValueError('No NA values allowed') - else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 else: - outbuf[i, j] = values[idx, idx1[j]] - else: - fv = fill_value - for i in range(n): - idx = idx0[i] - if idx == -1: - for j in range(k): - outbuf[i, j] = fv + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + j += 1 + # do it again now that result size is known -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float64_t lval, rval + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) i = 0 j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break - if j == nright: - indexer[i] = -1 - i += 1 - continue + lval = left[i] + rval = right[j] - rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + return result, lindexer, rindexer - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_object(ndarray[object] left, - ndarray[object] right): +def left_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - object lval, rval - - i = 0 - j = 0 + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + nleft = len(left) nright = len(right) - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + lval = left[i] + rval = right[j] - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + else: + j += 1 -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int32_t lval, rval + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) i = 0 j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break - if j == nright: - indexer[i] = -1 - i += 1 - continue + lval = left[i] + rval = right[j] - rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + return result, lindexer, rindexer - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique_int64(ndarray[int64_t] left, - ndarray[int64_t] right): +def outer_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int64_t lval, rval + Py_ssize_t i, j, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result - i = 0 - j = 0 nleft = len(left) nright = len(right) - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i 
== nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break - if j == nright: - indexer[i] = -1 - i += 1 - continue + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 - rval = right[j] + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + # do it again, but populate the indexers / result - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + return result, lindexer, rindexer -def left_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - ''' - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - ''' +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval + Py_ssize_t i, j, nright, nleft, count + float32_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result + ndarray[float32_t] result nleft = len(left) nright = len(right) @@ -2725,15 +7425,21 @@ def left_join_indexer_float64(ndarray[float64_t] left, i = 0 j = 0 count = 0 - if nleft > 0: - while i < nleft: + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break if j == nright: count += nleft - i break lval = left[i] rval = right[j] - if lval == rval: count += 1 if i < nleft - 1: @@ -2754,26 +7460,45 @@ def left_join_indexer_float64(ndarray[float64_t] left, count += 1 i += 1 else: + count += 1 j += 1 - # do it again now that result size is known - lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) + result = np.empty(count, dtype=np.float32) + + # do it again, but populate the indexers / result i = 0 j = 0 count = 0 - if nleft > 0: - while i < nleft: + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break if j == nright: while i < nleft: lindexer[count] = i rindexer[count] = -1 result[count] = left[i] - i += 1 count += 1 + i += 1 break lval = left[i] @@ -2801,22 +7526,24 @@ def left_join_indexer_float64(ndarray[float64_t] left, elif lval < rval: lindexer[count] = i rindexer[count] = -1 - result[count] = left[i] + result[count] = lval count += 1 i += 1 else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 j += 1 return result, lindexer, rindexer - -def left_join_indexer_object(ndarray[object] left, - ndarray[object] right): - ''' - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - ''' +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_object(ndarray[object] left, + ndarray[object] right): cdef: - Py_ssize_t i, j, k, nright, nleft, count + Py_ssize_t i, j, nright, nleft, count object lval, rval ndarray[int64_t] lindexer, rindexer ndarray[object] result @@ -2827,15 +7554,21 @@ def left_join_indexer_object(ndarray[object] left, i = 0 j = 0 count = 0 - if nleft > 0: - while i < nleft: + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break if j == nright: count += nleft - i break lval = left[i] rval = right[j] - if lval == rval: count += 1 if i < nleft - 1: @@ -2856,26 +7589,45 @@ def left_join_indexer_object(ndarray[object] left, count += 1 i += 1 else: + count += 1 j += 1 - # do it again now that result size is known - lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) result = np.empty(count, dtype=object) + # do it again, but populate the indexers / result + i = 0 j = 0 count = 0 - if nleft > 0: - while i < nleft: + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break if j == nright: while i < nleft: lindexer[count] = i rindexer[count] = -1 result[count] = left[i] - i += 1 count += 1 + i += 1 break lval = left[i] @@ -2903,25 +7655,27 @@ def left_join_indexer_object(ndarray[object] left, elif lval < rval: lindexer[count] = i rindexer[count] = -1 - result[count] = left[i] + result[count] = lval count += 1 i += 1 else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 j += 1 return result, lindexer, rindexer - -def left_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - ''' - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - ''' +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int8(ndarray[int8_t] left, + ndarray[int8_t] right): cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval + Py_ssize_t i, j, nright, nleft, count + int8_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result + ndarray[int8_t] result nleft = len(left) nright = len(right) @@ -2929,15 +7683,21 @@ def left_join_indexer_int32(ndarray[int32_t] left, i = 0 j = 0 count = 0 - if nleft > 0: - while i < nleft: + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break if j == nright: count += nleft - i break lval = left[i] rval = right[j] - if lval == rval: count += 1 if i < nleft - 1: @@ -2958,26 +7718,45 @@ def left_join_indexer_int32(ndarray[int32_t] left, count += 1 i += 1 else: + count += 1 j += 1 - # do it again now that result size is known - lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=np.int8) + + # do it again, but populate the indexers / result i = 0 j = 0 count = 0 - if nleft > 0: - while i < nleft: + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break if j == nright: while i < nleft: lindexer[count] = i rindexer[count] = -1 result[count] = left[i] - i += 1 count += 1 + i += 1 break lval = left[i] @@ -3005,25 +7784,27 @@ def left_join_indexer_int32(ndarray[int32_t] left, elif lval < rval: lindexer[count] = i rindexer[count] = -1 - result[count] = left[i] + result[count] = lval count += 1 i += 1 else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 j += 1 return result, lindexer, rindexer - -def left_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - ''' - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - ''' +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int16(ndarray[int16_t] left, + ndarray[int16_t] right): cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval + Py_ssize_t i, j, nright, nleft, count + int16_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result + ndarray[int16_t] result nleft = len(left) nright = len(right) @@ -3031,15 +7812,21 @@ def left_join_indexer_int64(ndarray[int64_t] left, i = 0 j = 0 count = 0 - if nleft > 0: - while i < nleft: + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break if j == nright: count += nleft - i break lval = left[i] rval = right[j] - if lval == rval: count += 1 if i < nleft - 1: @@ -3060,26 +7847,45 @@ def left_join_indexer_int64(ndarray[int64_t] left, count += 1 i += 1 else: + count += 1 j += 1 - # do it again now that result size is known - lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int16) + + # do it again, but populate the indexers / result i = 0 j = 0 count = 0 - if nleft > 0: - while i < nleft: + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break if j == nright: while i < nleft: lindexer[count] = i rindexer[count] = -1 result[count] = left[i] - i += 1 count += 1 + i += 1 break lval = left[i] @@ -3107,24 +7913,27 @@ def left_join_indexer_int64(ndarray[int64_t] left, elif lval < rval: lindexer[count] = i rindexer[count] = -1 - result[count] = left[i] + result[count] = lval count += 1 i += 1 else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 j += 1 return result, lindexer, rindexer - @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): +def outer_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - float64_t lval, rval + int32_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result + ndarray[int32_t] result nleft = len(left) nright = len(right) @@ -3172,7 +7981,7 @@ def outer_join_indexer_float64(ndarray[float64_t] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) + result = np.empty(count, dtype=np.int32) # do it again, but populate the indexers / result @@ -3247,13 +8056,13 @@ def outer_join_indexer_float64(ndarray[float64_t] left, @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_object(ndarray[object] left, - ndarray[object] right): +def outer_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - object lval, rval + int64_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[object] result + ndarray[int64_t] result nleft = len(left) nright = len(right) @@ -3301,7 +8110,7 @@ def outer_join_indexer_object(ndarray[object] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) 
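# Illustrative reference (not part of the patch): the
# outer_join_indexer_<dtype> kernels emit the sorted union of both inputs,
# with -1 in an indexer where that side lacks the value; the
# inner_join_indexer_<dtype> variants further below instead keep only the
# matched positions. One observation on the patch text itself: in the second
# pass, the ``elif nright == 0`` branch loops ``for i in range(nright)``
# where ``range(nleft)`` is evidently intended, so an empty right side
# leaves the preallocated output unfilled. A simplified sketch assuming
# unique values; names are illustrative:
import numpy as np

def outer_join_indexer_reference(left, right):
    union = np.union1d(left, right)           # sorted union of both sides
    lpos = {v: i for i, v in enumerate(left)}
    rpos = {v: i for i, v in enumerate(right)}
    lidx = np.array([lpos.get(v, -1) for v in union], dtype=np.int64)
    ridx = np.array([rpos.get(v, -1) for v in union], dtype=np.int64)
    return union, lidx, ridx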
+ result = np.empty(count, dtype=np.int64) # do it again, but populate the indexers / result @@ -3374,15 +8183,19 @@ def outer_join_indexer_object(ndarray[object] left, return result, lindexer, rindexer + @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): +def inner_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' cdef: - Py_ssize_t i, j, nright, nleft, count - int32_t lval, rval + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result + ndarray[float64_t] result nleft = len(left) nright = len(right) @@ -3390,17 +8203,11 @@ def outer_join_indexer_int32(ndarray[int32_t] left, i = 0 j = 0 count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: + if nleft > 0 and nright > 0: while True: if i == nleft: - count += nright - j break if j == nright: - count += nleft - i break lval = left[i] @@ -3422,57 +8229,32 @@ def outer_join_indexer_int32(ndarray[int32_t] left, # end of the road break elif lval < rval: - count += 1 i += 1 else: - count += 1 j += 1 + # do it again now that result size is known + lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - # do it again, but populate the indexers / result + result = np.empty(count, dtype=np.float64) i = 0 j = 0 count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nright): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: + if nleft > 0 and nright > 0: while True: if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 break if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 break lval = left[i] rval = right[j] - if lval == rval: lindexer[count] = i rindexer[count] = j - result[count] = lval + result[count] = rval count += 1 if i < nleft - 1: if j < nright - 1 and right[j + 1] == rval: @@ -3489,29 +8271,24 @@ def outer_join_indexer_int32(ndarray[int32_t] left, # end of the road break elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 i += 1 else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 j += 1 return result, lindexer, rindexer @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): +def inner_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' cdef: - Py_ssize_t i, j, nright, nleft, count - int64_t lval, rval + Py_ssize_t i, j, k, nright, nleft, count + float32_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result + ndarray[float32_t] result nleft = len(left) nright = len(right) @@ -3519,17 +8296,11 @@ def outer_join_indexer_int64(ndarray[int64_t] left, i = 0 j = 0 count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: + if nleft > 0 and nright > 0: while True: if i == nleft: - count += nright - j break if j == nright: - count += nleft - i break lval = left[i] @@ -3551,57 +8322,32 @@ def outer_join_indexer_int64(ndarray[int64_t] left, # end of the road break elif lval < rval: - count += 1 i += 1 else: - count += 1 j += 1 + # do it again now that result size is known + lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - # do it again, but populate the indexers / result + result = np.empty(count, dtype=np.float32) i = 0 j = 0 count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nright): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: + if nleft > 0 and nright > 0: while True: if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 break if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 break lval = left[i] rval = right[j] - if lval == rval: lindexer[count] = i rindexer[count] = j - result[count] = lval + result[count] = rval count += 1 if i < nleft - 1: if j < nright - 1 and right[j + 1] == rval: @@ -3618,33 +8364,117 @@ def outer_join_indexer_int64(ndarray[int64_t] left, # end of the road break elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_object(ndarray[object] left, + ndarray[object] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: i += 1 else: - lindexer[count] = -1 + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i rindexer[count] = j result[count] = rval count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: j += 1 return result, lindexer, rindexer - @cython.wraparound(False) @cython.boundscheck(False) -def inner_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): +def inner_join_indexer_int8(ndarray[int8_t] left, + ndarray[int8_t] right): ''' Two-pass algorithm for monotonic indexes. Handles many-to-one merges ''' cdef: Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval + int8_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result + ndarray[int8_t] result nleft = len(left) nright = len(right) @@ -3686,7 +8516,7 @@ def inner_join_indexer_float64(ndarray[float64_t] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) + result = np.empty(count, dtype=np.int8) i = 0 j = 0 @@ -3728,16 +8558,16 @@ def inner_join_indexer_float64(ndarray[float64_t] left, @cython.wraparound(False) @cython.boundscheck(False) -def inner_join_indexer_object(ndarray[object] left, - ndarray[object] right): +def inner_join_indexer_int16(ndarray[int16_t] left, + ndarray[int16_t] right): ''' Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges ''' cdef: Py_ssize_t i, j, k, nright, nleft, count - object lval, rval + int16_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[object] result + ndarray[int16_t] result nleft = len(left) nright = len(right) @@ -3779,7 +8609,7 @@ def inner_join_indexer_object(ndarray[object] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) + result = np.empty(count, dtype=np.int16) i = 0 j = 0 diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 41ac1b3f3480f..ea14245e10731 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -302,7 +302,7 @@ cdef double fINT64_MAX = INT64_MAX cdef double fINT64_MIN = INT64_MIN def maybe_convert_numeric(ndarray[object] values, set na_values, - convert_empty=True): + convert_empty=True, coerce_numeric=False): ''' Type inference function-- convert strings to numeric (potentially) and convert to proper dtype array @@ -346,17 +346,25 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, complexes[i] = val seen_complex = 1 else: - status = floatify(val, &fval) - floats[i] = fval - if not seen_float: - if '.' in val or fval == INF or fval == NEGINF: - seen_float = 1 - elif 'inf' in val: # special case to handle +/-inf - seen_float = 1 - elif fval < fINT64_MAX and fval > fINT64_MIN: - ints[i] = fval - else: - seen_float = 1 + try: + status = floatify(val, &fval) + floats[i] = fval + if not seen_float: + if '.' in val or fval == INF or fval == NEGINF: + seen_float = 1 + elif 'inf' in val: # special case to handle +/-inf + seen_float = 1 + elif fval < fINT64_MAX and fval > fINT64_MIN: + ints[i] = fval + else: + seen_float = 1 + except: + if not coerce_numeric: + raise + + floats[i] = nan + seen_float = 1 + if seen_complex: return complexes diff --git a/pandas/src/numpy.pxd b/pandas/src/numpy.pxd index 45c2fc184a911..b005a716e7d5f 100644 --- a/pandas/src/numpy.pxd +++ b/pandas/src/numpy.pxd @@ -326,6 +326,7 @@ cdef extern from "numpy/arrayobject.h": ctypedef unsigned long long npy_uint96 ctypedef unsigned long long npy_uint128 + ctypedef float npy_float16 ctypedef float npy_float32 ctypedef double npy_float64 ctypedef long double npy_float80 @@ -735,6 +736,7 @@ ctypedef npy_uint64 uint64_t #ctypedef npy_uint96 uint96_t #ctypedef npy_uint128 uint128_t +ctypedef npy_float16 float16_t ctypedef npy_float32 float32_t ctypedef npy_float64 float64_t #ctypedef npy_float80 float80_t diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 5b1d6c31403cb..1017f9cd7c503 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -406,8 +406,8 @@ def test_2d_float32(self): expected[[2, 4]] = np.nan tm.assert_almost_equal(result, expected) - # test with float64 out buffer - out = np.empty((len(indexer), arr.shape[1]), dtype='f8') + #### this now accepts a float32! # test with float64 out buffer + out = np.empty((len(indexer), arr.shape[1]), dtype='float32') com.take_2d(arr, indexer, out=out) # it works! 
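# Illustrative sketch (not the real Cython signature): the coerce_numeric
# flag added to maybe_convert_numeric in the inference.pyx hunk above keeps
# the old behaviour by default (unparseable values raise) but converts
# non-convertibles to NaN when coercing; names below are illustrative:
import numpy as np

def maybe_convert_numeric_sketch(values, coerce_numeric=False):
    out = []
    for val in values:
        try:
            out.append(float(val))
        except (TypeError, ValueError):
            if not coerce_numeric:
                raise               # default: propagate the parse error
            out.append(np.nan)      # coerce: non-convertible -> NaN
    return np.asarray(out)

# e.g. maybe_convert_numeric_sketch(['1', 'foo', '3.5'], coerce_numeric=True)
# -> array([1. , nan, 3.5])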
# axis=1 diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index a14d6027361cc..0e3134d940c99 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -140,7 +140,8 @@ def test_to_string_repr_unicode(self): line = line.decode(get_option("display.encoding")) except: pass - self.assert_(len(line) == line_len) + if not line.startswith('Dtype:'): + self.assert_(len(line) == line_len) # it works even if sys.stdin in None _stdin= sys.stdin @@ -1056,6 +1057,8 @@ def test_float_trim_zeros(self): 2.03954217305e+10, 5.59897817305e+10] skip = True for line in repr(DataFrame({'A': vals})).split('\n'): + if line.startswith('Dtype:'): + continue if _three_digit_exp(): self.assert_(('+010' in line) or skip) else: @@ -1101,7 +1104,7 @@ def test_to_string(self): format = '%.4f'.__mod__ result = self.ts.to_string(float_format=format) result = [x.split()[1] for x in result.split('\n')] - expected = [format(x) for x in self.ts] + expected = [format(x) for x in self.ts] + [u'float64'] self.assertEqual(result, expected) # empty string @@ -1116,7 +1119,7 @@ def test_to_string(self): cp.name = 'foo' result = cp.to_string(length=True, name=True) last_line = result.split('\n')[-1].strip() - self.assertEqual(last_line, "Freq: B, Name: foo, Length: %d" % len(cp)) + self.assertEqual(last_line, "Freq: B, Name: foo, Length: %d, Dtype: float64" % len(cp)) def test_freq_name_separation(self): s = Series(np.random.randn(10), @@ -1131,7 +1134,8 @@ def test_to_string_mixed(self): expected = (u'0 foo\n' u'1 NaN\n' u'2 -1.23\n' - u'3 4.56') + u'3 4.56\n' + u'Dtype: object') self.assertEqual(result, expected) # but don't count NAs as floats @@ -1140,7 +1144,8 @@ def test_to_string_mixed(self): expected = (u'0 foo\n' '1 NaN\n' '2 bar\n' - '3 baz') + '3 baz\n' + u'Dtype: object') self.assertEqual(result, expected) s = Series(['foo', 5, 'bar', 'baz']) @@ -1148,7 +1153,8 @@ def test_to_string_mixed(self): expected = (u'0 foo\n' '1 5\n' '2 bar\n' - '3 baz') + '3 baz\n' + u'Dtype: object') self.assertEqual(result, expected) def test_to_string_float_na_spacing(self): @@ -1160,7 +1166,8 @@ def test_to_string_float_na_spacing(self): '1 1.5678\n' '2 NaN\n' '3 -3.0000\n' - '4 NaN') + '4 NaN\n' + u'Dtype: float64') self.assertEqual(result, expected) def test_unicode_name_in_footer(self): @@ -1172,6 +1179,8 @@ def test_float_trim_zeros(self): vals = [2.08430917305e+10, 3.52205017305e+10, 2.30674817305e+10, 2.03954217305e+10, 5.59897817305e+10] for line in repr(Series(vals)).split('\n'): + if line.startswith('Dtype:'): + continue if _three_digit_exp(): self.assert_('+010' in line) else: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 09747ba3f09f0..03fdd53ce19af 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -46,7 +46,38 @@ def _skip_if_no_scipy(): # DataFrame test cases JOIN_TYPES = ['inner', 'outer', 'left', 'right'] - +MIXED_FLOAT_DTYPES = ['float16','float32','float64'] +MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64'] + +def _check_mixed_float(df, dtype = None): + dtypes = dict(A = 'float32', B = 'float32', C = 'float16', D = 'float64') + if isinstance(dtype, basestring): + dtypes = dict([ (k,dtype) for k, v in dtypes.items() ]) + elif isinstance(dtype, dict): + dtypes.update(dtype) + if dtypes.get('A'): + assert(df.dtypes['A'] == dtypes['A']) + if dtypes.get('B'): + assert(df.dtypes['B'] == dtypes['B']) + if dtypes.get('C'): + assert(df.dtypes['C'] == dtypes['C']) + if dtypes.get('D'): + 
+
+def _check_mixed_int(df, dtype = None):
+    dtypes = dict(A = 'int32', B = 'uint64', C = 'uint8', D = 'int64')
+    if isinstance(dtype, basestring):
+        dtypes = dict([ (k,dtype) for k, v in dtypes.items() ])
+    elif isinstance(dtype, dict):
+        dtypes.update(dtype)
+    if dtypes.get('A'):
+        assert(df.dtypes['A'] == dtypes['A'])
+    if dtypes.get('B'):
+        assert(df.dtypes['B'] == dtypes['B'])
+    if dtypes.get('C'):
+        assert(df.dtypes['C'] == dtypes['C'])
+    if dtypes.get('D'):
+        assert(df.dtypes['D'] == dtypes['D'])

class CheckIndexing(object):
@@ -121,6 +152,7 @@ def test_getitem_list(self):
        self.assertEqual(result.columns.name, 'sth')

    def test_setitem_list(self):
+        self.frame['E'] = 'foo'
        data = self.frame[['A', 'B']]
        self.frame[['B', 'A']] = data

@@ -128,11 +160,11 @@ def test_setitem_list(self):
        assert_series_equal(self.frame['B'], data['A'])
        assert_series_equal(self.frame['A'], data['B'])

-        df = DataFrame(0, range(3), ['tt1', 'tt2'])
+        df = DataFrame(0, range(3), ['tt1', 'tt2'], dtype=np.int_)
        df.ix[1, ['tt1', 'tt2']] = [1, 2]

        result = df.ix[1, ['tt1', 'tt2']]
-        expected = Series([1, 2], df.columns)
+        expected = Series([1, 2], df.columns, dtype=np.int_)
        assert_series_equal(result, expected)

        df['tt1'] = df['tt2'] = '0'
@@ -171,14 +203,43 @@ def test_getitem_boolean(self):
        self.assertRaises(ValueError, self.tsframe.__getitem__,
                          self.tsframe)

-        # test df[df >0] works
-        bif = self.tsframe[self.tsframe > 0]
-        bifw = DataFrame(np.where(self.tsframe > 0, self.tsframe, np.nan),
-                         index=self.tsframe.index, columns=self.tsframe.columns)
-        self.assert_(isinstance(bif, DataFrame))
-        self.assert_(bif.shape == self.tsframe.shape)
-        assert_frame_equal(bif, bifw)
+        # test df[df > 0]
+        for df in [ self.tsframe, self.mixed_frame, self.mixed_float, self.mixed_int ]:
+
+            data = df._get_numeric_data()
+            bif = df[df > 0]
+            bifw = DataFrame(dict([ (c,np.where(data[c] > 0, data[c], np.nan)) for c in data.columns ]),
+                             index=data.index, columns=data.columns)
+
+            # add back other columns to compare
+            for c in df.columns:
+                if c not in bifw:
+                    bifw[c] = df[c]
+            bifw = bifw.reindex(columns = df.columns)
+
+            assert_frame_equal(bif, bifw, check_dtype=False)
+            for c in df.columns:
+                if bif[c].dtype != bifw[c].dtype:
+                    self.assert_(bif[c].dtype == df[c].dtype)
+
+    def test_getitem_boolean_casting(self):
+
+        #### this currently disabled ###
+
+        # don't upcast if we don't need to
+        df = self.tsframe.copy()
+        df['E'] = 1
+        df['E'] = df['E'].astype('int32')
+        df['F'] = 1
+        df['F'] = df['F'].astype('int64')
+        casted = df[df>0]
+        result = casted.get_dtype_counts()
+        #expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1})
+        expected = Series({'float64': 6 })
+        assert_series_equal(result, expected)

    def test_getitem_boolean_list(self):
        df = DataFrame(np.arange(12).reshape(3, 4))
@@ -194,9 +255,9 @@ def _checkit(lst):

    def test_getitem_boolean_iadd(self):
        arr = randn(5, 5)

-        df = DataFrame(arr.copy())
-        df[df < 0] += 1
+        df = DataFrame(arr.copy(), columns = ['A','B','C','D','E'])

+        df[df < 0] += 1
        arr[arr < 0] += 1

        assert_almost_equal(df.values, arr)
@@ -341,7 +402,7 @@ def test_setitem_cast(self):
        # #669, should not cast?
        self.frame['B'] = 0
-        self.assert_(self.frame['B'].dtype == np.float64)
+        self.assert_(self.frame['B'].dtype == np.float_)

        # cast if pass array of course
        self.frame['B'] = np.arange(len(self.frame))
@@ -349,18 +410,18 @@ def test_setitem_cast(self):

        self.frame['foo'] = 'bar'
        self.frame['foo'] = 0
-        self.assert_(self.frame['foo'].dtype == np.int64)
+        self.assert_(self.frame['foo'].dtype == np.int_)

        self.frame['foo'] = 'bar'
        self.frame['foo'] = 2.5
-        self.assert_(self.frame['foo'].dtype == np.float64)
+        self.assert_(self.frame['foo'].dtype == np.float_)

        self.frame['something'] = 0
-        self.assert_(self.frame['something'].dtype == np.int64)
+        self.assert_(self.frame['something'].dtype == np.int_)
        self.frame['something'] = 2
-        self.assert_(self.frame['something'].dtype == np.int64)
+        self.assert_(self.frame['something'].dtype == np.int_)
        self.frame['something'] = 2.5
-        self.assert_(self.frame['something'].dtype == np.float64)
+        self.assert_(self.frame['something'].dtype == np.float_)

    def test_setitem_boolean_column(self):
        expected = self.frame.copy()
@@ -395,7 +456,7 @@ def test_setitem_corner(self):
        self.assertEqual(dm.values.dtype, np.object_)

        dm['C'] = 1
-        self.assertEqual(dm['C'].dtype, np.int64)
+        self.assertEqual(dm['C'].dtype, np.int_)

        # set existing column
        dm['A'] = 'bar'
@@ -1114,10 +1175,6 @@ def test_setitem_single_column_mixed_datetime(self):
        self.assertRaises(
            Exception, df.ix.__setitem__, ('d', 'timestamp'), [nan])

-        # prior to 0.10.1 this failed
-        # self.assertRaises(TypeError, df.ix.__setitem__, ('c','timestamp'),
-        # nan)
-
    def test_setitem_frame(self):
        piece = self.frame.ix[:2, ['A', 'B']]
        self.frame.ix[-2:, ['A', 'B']] = piece.values
@@ -1562,10 +1619,30 @@ def setUp(self):
        self.frame = _frame.copy()
        self.frame2 = _frame2.copy()
-        self.intframe = _intframe.copy()
+
+        # force these all to int64 to avoid platform testing issues
+        self.intframe = DataFrame(dict([ (c,s) for c,s in _intframe.iteritems() ]), dtype = np.int64)
        self.tsframe = _tsframe.copy()
        self.mixed_frame = _mixed_frame.copy()
-
+        self.mixed_float  = DataFrame({ 'A': _frame['A'].copy().astype('float32'),
+                                        'B': _frame['B'].copy().astype('float32'),
+                                        'C': _frame['C'].copy().astype('float16'),
+                                        'D': _frame['D'].copy().astype('float64') })
+        self.mixed_float2 = DataFrame({ 'A': _frame2['A'].copy().astype('float32'),
+                                        'B': _frame2['B'].copy().astype('float32'),
+                                        'C': _frame2['C'].copy().astype('float16'),
+                                        'D': _frame2['D'].copy().astype('float64') })
+        self.mixed_int = DataFrame({ 'A': _intframe['A'].copy().astype('int32'),
+                                     'B': np.ones(len(_intframe['B']),dtype='uint64'),
+                                     'C': _intframe['C'].copy().astype('uint8'),
+                                     'D': _intframe['D'].copy().astype('int64') })
+        self.all_mixed = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float32' : np.array([1.]*10,dtype='float32'),
+                                    'int32' : np.array([1]*10,dtype='int32'),
+                                    }, index=np.arange(10))
+        #self.all_mixed = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float32' : np.array([1.]*10,dtype='float32'),
+        #                            'int32' : np.array([1]*10,dtype='int32'), 'timestamp' : Timestamp('20010101'),
+        #                            }, index=np.arange(10))

        self.ts1 = tm.makeTimeSeries()
        self.ts2 = tm.makeTimeSeries()[5:]
        self.ts3 = tm.makeTimeSeries()[-5:]
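The ``mixed_float``/``mixed_int`` fixtures above drive most of the new dtype tests. A
self-contained sketch of the same idea, assuming the float16 column support this patch adds
(column names and values are illustrative)::

    import numpy as np
    from pandas import DataFrame, Series

    df = DataFrame({'A': Series(np.random.randn(10), dtype='float32'),
                    'B': Series(np.random.randn(10), dtype='float32'),
                    'C': Series(np.random.randn(10), dtype='float16'),
                    'D': Series(np.random.randn(10), dtype='float64')})

    # each column keeps the dtype it was constructed with
    assert df.dtypes['C'] == np.float16
    assert df.dtypes['D'] == np.float64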
@@ -1806,6 +1883,44 @@ def test_constructor_dtype_list_data(self):
        self.assert_(df.ix[1, 0] is None)
        self.assert_(df.ix[0, 1] == '2')

+    def test_constructor_mixed_dtypes(self):
+
+        def _make_mixed_dtypes_df(typ, ad = None):
+
+            if typ == 'int':
+                dtypes = MIXED_INT_DTYPES
+                arrays = [ np.array(np.random.rand(10), dtype = d) for d in dtypes ]
+            elif typ == 'float':
+                dtypes = MIXED_FLOAT_DTYPES
+                arrays = [ np.array(np.random.randint(10, size=10), dtype = d) for d in dtypes ]
+
+            zipper = zip(dtypes,arrays)
+            for d,a in zipper:
+                assert(a.dtype == d)
+            if ad is None:
+                ad = dict()
+            ad.update(dict([ (d,a) for d,a in zipper ]))
+            return DataFrame(ad)
+
+        def _check_mixed_dtypes(df, dtypes = None):
+            if dtypes is None:
+                dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES
+            for d in dtypes:
+                if d in df:
+                    assert(df.dtypes[d] == d)
+
+        # mixed floating and integer coexist in the same frame
+        df = _make_mixed_dtypes_df('float')
+        _check_mixed_dtypes(df)
+
+        # add lots of types
+        df = _make_mixed_dtypes_df('float', dict(A = 1, B = 'foo', C = 'bar'))
+        _check_mixed_dtypes(df)
+
+        # GH 622
+        df = _make_mixed_dtypes_df('int')
+        _check_mixed_dtypes(df)
+
    def test_constructor_rec(self):
        rec = self.frame.to_records(index=False)
@@ -1975,7 +2090,7 @@ def test_constructor_dict_of_tuples(self):
        result = DataFrame(data)
        expected = DataFrame(dict((k, list(v)) for k, v in data.iteritems()))
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_dtype=False)

    def test_constructor_ndarray(self):
        mat = np.zeros((2, 3), dtype=float)
@@ -1988,7 +2103,7 @@ def test_constructor_ndarray(self):

        # cast type
        frame = DataFrame(mat, columns=['A', 'B', 'C'],
-                          index=[1, 2], dtype=int)
+                          index=[1, 2], dtype=np.int64)
        self.assert_(frame.values.dtype == np.int64)

        # 1-D input
@@ -2040,7 +2155,7 @@ def test_constructor_maskedarray(self):

        # cast type
        frame = DataFrame(mat, columns=['A', 'B', 'C'],
-                          index=[1, 2], dtype=int)
+                          index=[1, 2], dtype=np.int64)
        self.assert_(frame.values.dtype == np.int64)

        # Check non-masked values
@@ -2098,7 +2213,7 @@ def test_constructor_maskedarray_nonfloat(self):

        # cast type
        frame = DataFrame(mat, columns=['A', 'B', 'C'],
-                          index=[1, 2], dtype=float)
+                          index=[1, 2], dtype=np.float64)
        self.assert_(frame.values.dtype == np.float64)

        # Check non-masked values
@@ -2174,9 +2289,9 @@ def test_constructor_scalar_inference(self):
                'float': 3., 'complex': 4j, 'object': 'foo'}
        df = DataFrame(data, index=np.arange(10))

-        self.assert_(df['int'].dtype == np.int64)
+        self.assert_(df['int'].dtype == np.int_)
        self.assert_(df['bool'].dtype == np.bool_)
-        self.assert_(df['float'].dtype == np.float64)
+        self.assert_(df['float'].dtype == np.float_)
        self.assert_(df['complex'].dtype == np.complex128)
        self.assert_(df['object'].dtype == np.object_)
@@ -2192,7 +2307,7 @@ def test_constructor_DataFrame(self):
        df = DataFrame(self.frame)
        assert_frame_equal(df, self.frame)

-        df_casted = DataFrame(self.frame, dtype=int)
+        df_casted = DataFrame(self.frame, dtype=np.int64)
        self.assert_(df_casted.values.dtype == np.int64)

    def test_constructor_more(self):
@@ -2229,7 +2344,7 @@ def test_constructor_more(self):

        # int cast
        dm = DataFrame({'A': np.ones(10, dtype=int),
-                        'B': np.ones(10, dtype=float)},
+                        'B': np.ones(10, dtype=np.float64)},
                       index=np.arange(10))

        self.assertEqual(len(dm.columns), 2)
@@ -2339,7 +2454,7 @@ def test_constructor_scalar(self):
        idx = Index(range(3))
        df = DataFrame({"a": 0}, index=idx)
        expected = DataFrame({"a": [0, 0, 0]}, index=idx)
-        assert_frame_equal(df, expected)
+        assert_frame_equal(df, expected, check_dtype=False)

    def test_constructor_Series_copy_bug(self):
        df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A'])
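Construction from per-dtype ndarrays now preserves each input dtype rather than upcasting
everything to ``int64``/``float64`` (GH622/GH797); a minimal sketch::

    import numpy as np
    from pandas import DataFrame

    arrays = dict(a=np.array([1, 2, 3], dtype='int8'),
                  b=np.array([1, 2, 3], dtype='uint16'),
                  c=np.array([1., 2., 3.], dtype='float32'))
    df = DataFrame(arrays)

    for name, arr in arrays.items():
        # no silent upcast on construction
        assert df[name].dtype == arr.dtype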
@@ -2523,6 +2638,12 @@ def test_astype(self):
                             columns=self.frame.columns)
        assert_frame_equal(casted, expected)

+        casted = self.frame.astype(np.int32)
+        expected = DataFrame(self.frame.values.astype(np.int32),
+                             index=self.frame.index,
+                             columns=self.frame.columns)
+        assert_frame_equal(casted, expected)
+
        self.frame['foo'] = '5'
        casted = self.frame.astype(int)
        expected = DataFrame(self.frame.values.astype(int),
@@ -2530,6 +2651,81 @@ def test_astype(self):
                             columns=self.frame.columns)
        assert_frame_equal(casted, expected)

+        # mixed casting
+        def _check_cast(df, v):
+            self.assert_(list(set([ s.dtype.name for _, s in df.iteritems() ]))[0] == v)
+
+        mn = self.all_mixed._get_numeric_data().copy()
+        mn['little_float'] = np.array(12345.,dtype='float16')
+        mn['big_float'] = np.array(123456789101112.,dtype='float64')
+
+        casted = mn.astype('float64')
+        _check_cast(casted, 'float64')
+
+        casted = mn.astype('int64')
+        _check_cast(casted, 'int64')
+
+        casted = self.mixed_float.reindex(columns = ['A','B']).astype('float32')
+        _check_cast(casted, 'float32')
+
+        casted = mn.reindex(columns = ['little_float']).astype('float16')
+        _check_cast(casted, 'float16')
+
+        casted = self.mixed_float.reindex(columns = ['A','B']).astype('float16')
+        _check_cast(casted, 'float16')
+
+        casted = mn.astype('float32')
+        _check_cast(casted, 'float32')
+
+        casted = mn.astype('int32')
+        _check_cast(casted, 'int32')
+
+        # to object
+        casted = mn.astype('O')
+        _check_cast(casted, 'object')
+
+    def test_astype_with_exclude_string(self):
+        df = self.frame.copy()
+        expected = self.frame.astype(int)
+        df['string'] = 'foo'
+        casted = df.astype(int, raise_on_error = False)
+
+        expected['string'] = 'foo'
+        assert_frame_equal(casted, expected)
+
+        df = self.frame.copy()
+        expected = self.frame.astype(np.int32)
+        df['string'] = 'foo'
+        casted = df.astype(np.int32, raise_on_error = False)
+
+        expected['string'] = 'foo'
+        assert_frame_equal(casted, expected)
+
+    def test_astype_with_view(self):
+
+        tf = self.mixed_float.reindex(columns = ['A','B','C'])
+
+        self.assertRaises(TypeError, self.frame.astype, np.int32, copy = False)
+
+        self.assertRaises(TypeError, tf, np.int32, copy = False)
+
+        self.assertRaises(TypeError, tf, np.int64, copy = False)
+        casted = tf.astype(np.int64)
+
+        self.assertRaises(TypeError, tf, np.float32, copy = False)
+        casted = tf.astype(np.float32)
+
+        # this is the only real reason to do it this way
+        tf = np.round(self.frame).astype(np.int32)
+        casted = tf.astype(np.float32, copy = False)
+        #self.assert_(casted.values.data == tf.values.data)
+
+        tf = self.frame.astype(np.float64)
+        casted = tf.astype(np.int64, copy = False)
+        #self.assert_(casted.values.data == tf.values.data)
+
+        # can't view to an object array
+        self.assertRaises(Exception, self.frame.astype, 'O', copy = False)
+
    def test_astype_cast_nan_int(self):
        df = DataFrame(data={"Values": [1.0, 2.0, 3.0, np.nan]})
        self.assertRaises(ValueError, df.astype, np.int64)
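``astype`` always returns a copy, and with the ``raise_on_error`` keyword exercised above,
non-convertible (e.g. string) columns can be passed through untouched. A sketch assuming
those semantics::

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'A': [1., 2.], 'B': [3., 4.]})

    casted = df.astype(np.int32)              # always a copy
    assert (casted.dtypes == np.int32).all()

    df['string'] = 'foo'
    # the string column is left as object instead of raising
    casted = df.astype(np.int32, raise_on_error=False)
    assert casted['string'].dtype == np.object_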
@@ -2634,7 +2830,7 @@ def _check_all_orients(df, dtype=None):

        # dtypes
        _check_all_orients(DataFrame(biggie, dtype=np.float64),
                           dtype=np.float64)
-        _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int)
+        _check_all_orients(DataFrame(biggie, dtype=np.int64), dtype=np.int64)
        _check_all_orients(DataFrame(biggie, dtype='

        cond = df > 0

-        other1 = df + 1
-        rs = df.where(cond, other1)
-        rs2 = df.where(cond.values, other1)
-        for k, v in rs.iteritems():
-            assert_series_equal(v, np.where(cond[k], df[k], other1[k]))
-        assert_frame_equal(rs, rs2)
-
-        # it works!
-        rs = df.where(cond[1:], other1)
-
-        other2 = (df + 1).values
-        rs = df.where(cond, other2)
-        for k, v in rs.iteritems():
-            assert_series_equal(v, np.where(cond[k], df[k], other2[:, k]))
-
-        other5 = np.nan
-        rs = df.where(cond, other5)
-        for k, v in rs.iteritems():
-            assert_series_equal(v, np.where(cond[k], df[k], other5))
+        default_frame = DataFrame(np.random.randn(5, 3),columns=['A','B','C'])
+
+        def _safe_add(df):
+            # only add to the numeric items
+            return DataFrame(dict([ (c,s+1) if issubclass(s.dtype.type, (np.integer,np.floating)) else (c,s) for c, s in df.iteritems() ]))
+
+        def _check_get(df, cond, check_dtypes = True):
+            other1 = _safe_add(df)
+            rs = df.where(cond, other1)
+            rs2 = df.where(cond.values, other1)
+            for k, v in rs.iteritems():
+                assert_series_equal(v, np.where(cond[k], df[k], other1[k]))
+            assert_frame_equal(rs, rs2)
+
+            # dtypes
+            if check_dtypes:
+                self.assert_((rs.dtypes == df.dtypes).all() == True)
+
+        # check getting
+        for df in [ default_frame, self.mixed_frame, self.mixed_float, self.mixed_int ]:
+            cond = df > 0
+            _check_get(df, cond)
+
+        # aligning
+        def _check_align(df, cond, other, check_dtypes = True):
+            rs = df.where(cond, other)
+            for i, k in enumerate(rs.columns):
+                v = rs[k]
+                d = df[k].values
+                c = cond[k].reindex(df[k].index).fillna(False).values
+
+                if np.isscalar(other):
+                    o = other
+                else:
+                    if isinstance(other,np.ndarray):
+                        o = Series(other[:,i],index=v.index).values
+                    else:
+                        o = other[k].values
+
+                assert_series_equal(v, Series(np.where(c, d, o),index=v.index))
+
+            # dtypes
+            # can't check dtype when other is an ndarray
+            if check_dtypes and not isinstance(other,np.ndarray):
+                self.assert_((rs.dtypes == df.dtypes).all() == True)
+
+        for df in [ self.mixed_frame, self.mixed_float, self.mixed_int ]:
+
+            # other is a frame
+            cond = (df > 0)[1:]
+            _check_align(df, cond, _safe_add(df))
+
+            # check other is ndarray
+            cond = df > 0
+            _check_align(df, cond, (_safe_add(df).values))
+
+            # integers are upcast, so don't check the dtypes
+            cond = df > 0
+            check_dtypes = all([ not issubclass(s.type,np.integer) for s in df.dtypes ])
+            _check_align(df, cond, np.nan, check_dtypes = check_dtypes)

+        # invalid conditions
+        df = default_frame
        err1 = (df + 1).values[0:2, :]
        self.assertRaises(ValueError, df.where, cond, err1)

        err2 = cond.ix[:2, :].values
+        other1 = _safe_add(df)
        self.assertRaises(ValueError, df.where, err2, other1)

-        # invalid conditions
        self.assertRaises(ValueError, df.mask, True)
        self.assertRaises(ValueError, df.mask, 0)

        # where inplace
-        df = DataFrame(np.random.randn(5, 3))
+        def _check_set(df, cond, check_dtypes = True):
+            dfi = df.copy()
+            econd = cond.reindex_like(df).fillna(True)
+            expected = dfi.mask(~econd)

-        expected = df.mask(df < 0)
-        df.where(df >= 0, np.nan, inplace=True)
-        assert_frame_equal(df, expected)
+            dfi.where(cond, np.nan, inplace=True)
+            assert_frame_equal(dfi, expected)
+
+            # dtypes (and confirm upcasts)
+            if check_dtypes:
+                for k, v in df.dtypes.iteritems():
+                    if issubclass(v.type,np.integer):
+                        v = np.dtype('float64')
+                    self.assert_(dfi[k].dtype == v)
+
+        for df in [ default_frame, self.mixed_frame, self.mixed_float, self.mixed_int ]:
+
+            cond = df > 0
+            _check_set(df, cond)
+
+            cond = df >= 0
+            _check_set(df, cond)
+
+            # aligning
+            cond = (df >= 0)[1:]
+            _check_set(df, cond)
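As the GH2793 test below exercises, integer blocks are upcast when ``where`` has to hold
``NaN``; a minimal sketch::

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({'a': np.array([1, 2, 3, 4], dtype='int16'),
                    'b': [4.0, 3.0, 2.0, 1.0]})

    result = df.where(df > 2, np.nan)
    # int16 must become float64 to hold NaN
    assert result['a'].dtype == np.float64
    assert np.isnan(result['a'][0])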
+    def test_where_bug(self):
+
+        # GH 2793
+
+        df = DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [4.0, 3.0, 2.0, 1.0]}, dtype = 'float64')
+        expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [4.0, 3.0, np.nan, np.nan]}, dtype = 'float64')
+        result = df.where(df > 2, np.nan)
+        assert_frame_equal(result, expected)
+
+        result = df.copy()
+        result.where(result > 2, np.nan, inplace=True)
+        assert_frame_equal(result, expected)
+
+        # mixed
+        for dtype in ['int16','int8','int32','int64']:
+            df = DataFrame({'a': np.array([1, 2, 3, 4],dtype=dtype), 'b': np.array([4.0, 3.0, 2.0, 1.0], dtype = 'float64') })
+            expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [4.0, 3.0, np.nan, np.nan]}, dtype = 'float64')
+            result = df.where(df > 2, np.nan)
+            assert_frame_equal(result, expected)
+
+            result = df.copy()
+            result.where(result > 2, np.nan, inplace=True)
+            assert_frame_equal(result, expected)

    def test_mask(self):
        df = DataFrame(np.random.randn(5, 3))
@@ -5568,6 +6084,13 @@ def test_diff(self):
        rs = DataFrame({'s': s}).diff()
        self.assertEqual(rs.s[1], 1)

+        # mixed numeric
+        tf = self.tsframe.astype('float32')
+        the_diff = tf.diff(1)
+        assert_series_equal(the_diff['A'],
+                            tf['A'] - tf['A'].shift(1))
+
    def test_diff_mixed_dtype(self):
        df = DataFrame(np.random.randn(5, 3))
        df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)
@@ -5938,7 +6461,7 @@ def test_apply_convert_objects(self):
                          'F': np.random.randn(11)})

        result = data.apply(lambda x: x, axis=1)
-        assert_frame_equal(result, data)
+        assert_frame_equal(result.convert_objects(), data)

    def test_apply_attach_name(self):
        result = self.frame.apply(lambda x: x.name)
@@ -6484,11 +7007,16 @@ def test_get_X_columns(self):
                                    ['a', 'e']))

    def test_get_numeric_data(self):
-        df = DataFrame({'a': 1., 'b': 2, 'c': 'foo'},
+
+        #df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'd' : np.array(1.*10.,dtype='float32'), 'e' : np.array(1*10,dtype='int32')},
+        #               index=np.arange(10))
+        df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'd' : np.array([1.]*10,dtype='float32'),
+                        'e' : np.array([1]*10,dtype='int32'),
+                        'f' : np.array([1]*10,dtype='int16')},
                       index=np.arange(10))

        result = df._get_numeric_data()
-        expected = df.ix[:, ['a', 'b']]
+        expected = df.ix[:, ['a', 'b','d','e','f']]
        assert_frame_equal(result, expected)

        only_obj = df.ix[:, ['c']]
@@ -6500,7 +7028,8 @@ def test_count(self):
        f = lambda s: notnull(s).sum()

        self._check_stat_op('count', f,
                            has_skipna=False,
-                            has_numeric_only=True)
+                            has_numeric_only=True,
+                            check_dtypes=False)

        # corner case
        frame = DataFrame()
@@ -6529,6 +7058,11 @@ def test_count(self):
    def test_sum(self):
        self._check_stat_op('sum', np.sum, has_numeric_only=True)

+    def test_sum_mixed_numeric(self):
+        raise nose.SkipTest
+        # mixed types
+        self._check_stat_op('sum', np.sum, frame = self.mixed_float, has_numeric_only=True)
+
    def test_stat_operators_attempt_obj_array(self):
        data = {
            'a': [-0.00049987540199591344, -0.0016467257772919831,
@@ -6679,7 +7213,7 @@ def alt(x):
        assert_series_equal(df.kurt(), df.kurt(level=0).xs('bar'))

    def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
-                       has_numeric_only=False):
+                       has_numeric_only=False, check_dtypes=True):
        if frame is None:
            frame = self.frame
            # set some NAs
@@ -6713,6 +7247,12 @@ def wrapper(x):
            assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
                                check_dtype=False)

+        # check dtypes
+        if check_dtypes:
+            lcd_dtype = frame.values.dtype
+            self.assert_(lcd_dtype == result0.dtype)
+            self.assert_(lcd_dtype == result1.dtype)
+
        # result = f(axis=1)
        # comp = frame.apply(alternative, axis=1).reindex(result.index)
        # assert_series_equal(result, comp)
@@ -6788,7 +7328,7 @@ def wrapper(x):
                return np.nan
            return np.median(x)

-        self._check_stat_op('median', wrapper, frame=self.intframe)
+        self._check_stat_op('median', wrapper, frame=self.intframe, check_dtypes=False)
    def test_quantile(self):
        from pandas.compat.scipy import scoreatpercentile
@@ -6856,6 +7396,11 @@ def test_cumprod(self):
        df.cumprod(0)
        df.cumprod(1)

+        # ints32
+        df = self.tsframe.fillna(0).astype(np.int32)
+        df.cumprod(0)
+        df.cumprod(1)
+
    def test_rank(self):
        from pandas.compat.scipy import rankdata
@@ -7367,6 +7912,40 @@ def test_as_matrix_numeric_cols(self):
        values = self.frame.as_matrix(['A', 'B', 'C', 'D'])
        self.assert_(values.dtype == np.float64)

+    def test_as_matrix_lcd(self):
+
+        # mixed lcd
+        values = self.mixed_float.as_matrix(['A', 'B', 'C', 'D'])
+        self.assert_(values.dtype == np.float64)
+
+        values = self.mixed_float.as_matrix(['A', 'B', 'C' ])
+        self.assert_(values.dtype == np.float32)
+
+        values = self.mixed_float.as_matrix(['C'])
+        self.assert_(values.dtype == np.float16)
+
+        values = self.mixed_int.as_matrix(['A','B','C','D'])
+        self.assert_(values.dtype == np.uint64)
+
+        values = self.mixed_int.as_matrix(['A','D'])
+        self.assert_(values.dtype == np.int64)
+
+        # guess all ints are cast to uints....
+        values = self.mixed_int.as_matrix(['A','B','C'])
+        self.assert_(values.dtype == np.uint64)
+
+        values = self.mixed_int.as_matrix(['A','C'])
+        self.assert_(values.dtype == np.int32)
+
+        values = self.mixed_int.as_matrix(['C','D'])
+        self.assert_(values.dtype == np.int64)
+
+        values = self.mixed_int.as_matrix(['A'])
+        self.assert_(values.dtype == np.int32)
+
+        values = self.mixed_int.as_matrix(['C'])
+        self.assert_(values.dtype == np.uint8)
+
    def test_constructor_frame_copy(self):
        cop = DataFrame(self.frame, copy=True)
        cop['A'] = 5
@@ -7404,6 +7983,10 @@ def test_cast_internals(self):
        expected = DataFrame(self.frame._series, dtype=int)
        assert_frame_equal(casted, expected)

+        casted = DataFrame(self.frame._data, dtype=np.int32)
+        expected = DataFrame(self.frame._series, dtype=np.int32)
+        assert_frame_equal(casted, expected)
+
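``as_matrix`` (and hence ``.values``) now picks the lowest common dtype of the selected
columns instead of always going to ``float64``/``object``. A sketch mirroring the fixture
layout above, assuming the float16 support this patch adds::

    import numpy as np
    from pandas import DataFrame, Series

    df = DataFrame({'A': Series([1.] * 4, dtype='float32'),
                    'C': Series([1.] * 4, dtype='float16'),
                    'D': Series([1.] * 4, dtype='float64')})

    assert df.as_matrix(['A', 'C']).dtype == np.float32  # float16 + float32
    assert df.as_matrix(['A', 'D']).dtype == np.float64  # float32 + float64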
    def test_consolidate(self):
        self.frame['E'] = 7.
        consolidated = self.frame.consolidate()
@@ -7475,7 +8058,7 @@ def test_xs_view(self):

    def test_boolean_indexing(self):
        idx = range(3)
-        cols = range(3)
+        cols = ['A','B','C']
        df1 = DataFrame(index=idx, columns=cols,
                        data=np.array([[0.0, 0.5, 1.0],
                                       [1.5, 2.0, 2.5],
@@ -7512,15 +8095,29 @@ def test_take(self):
        # mixed-dtype
        #----------------------------------------
        order = [4, 1, 2, 0, 3]
+        for df in [self.mixed_frame]:

-        result = self.mixed_frame.take(order, axis=0)
-        expected = self.mixed_frame.reindex(self.mixed_frame.index.take(order))
-        assert_frame_equal(result, expected)
+            result = df.take(order, axis=0)
+            expected = df.reindex(df.index.take(order))
+            assert_frame_equal(result, expected)
+
+            # axis = 1
+            result = df.take(order, axis=1)
+            expected = df.ix[:, ['foo', 'B', 'C', 'A', 'D']]
+            assert_frame_equal(result, expected)

-        # axis = 1
-        result = self.mixed_frame.take(order, axis=1)
-        expected = self.mixed_frame.ix[:, ['foo', 'B', 'C', 'A', 'D']]
-        assert_frame_equal(result, expected)
+        # by dtype
+        order = [1, 2, 0, 3]
+        for df in [self.mixed_float,self.mixed_int]:
+
+            result = df.take(order, axis=0)
+            expected = df.reindex(df.index.take(order))
+            assert_frame_equal(result, expected)
+
+            # axis = 1
+            result = df.take(order, axis=1)
+            expected = df.ix[:, ['B', 'C', 'A', 'D']]
+            assert_frame_equal(result, expected)

    def test_iterkv_names(self):
        for k, v in self.mixed_frame.iterkv():
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 114697bc5c8cd..54d29263b2308 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -62,6 +62,13 @@ def setUp(self):
                             'C': np.random.randn(8),
                             'D': np.random.randn(8)})

+        self.df_mixed_floats = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                                                'foo', 'bar', 'foo', 'foo'],
+                                          'B': ['one', 'one', 'two', 'three',
+                                                'two', 'two', 'one', 'three'],
+                                          'C': np.random.randn(8),
+                                          'D': np.array(np.random.randn(8),dtype='float32')})
+
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
@@ -155,6 +162,25 @@ def test_first_last_nth(self):
        self.assert_(com.isnull(grouped['B'].last()['foo']))
        self.assert_(com.isnull(grouped['B'].nth(0)['foo']))

+    def test_first_last_nth_dtypes(self):
+        # tests for first / last / nth
+
+        grouped = self.df_mixed_floats.groupby('A')
+        first = grouped.first()
+        expected = self.df_mixed_floats.ix[[1, 0], ['B', 'C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(first, expected)
+
+        last = grouped.last()
+        expected = self.df_mixed_floats.ix[[5, 7], ['B', 'C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(last, expected)
+
+        nth = grouped.nth(1)
+        expected = self.df_mixed_floats.ix[[3, 2], ['B', 'C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(nth, expected)
+
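Groupby reductions such as ``first``/``last``/``nth`` now preserve float dtypes instead of
upcasting to ``float64``; a minimal sketch of what the new test asserts::

    import numpy as np
    from pandas import DataFrame, Series

    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar'],
                    'D': Series(np.random.randn(4), dtype='float32')})

    first = df.groupby('A').first()
    assert first['D'].dtype == np.float32  # not upcast to float64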
    def test_grouper_iter(self):
        self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo'])
@@ -478,16 +504,30 @@ def test_transform_function_aliases(self):

    def test_with_na(self):
        index = Index(np.arange(10))
-        values = Series(np.ones(10), index)
-        labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
-                         'bar', nan, 'foo'], index=index)

-        grouped = values.groupby(labels)
-        agged = grouped.agg(len)
-        expected = Series([4, 2], index=['bar', 'foo'])
+        for dtype in ['float64','float32','int64','int32','int16','int8']:
+            values = Series(np.ones(10), index, dtype=dtype)
+            labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
+                             'bar', nan, 'foo'], index=index)
+
+            # this SHOULD be an int
+            grouped = values.groupby(labels)
+            agged = grouped.agg(len)
+            expected = Series([4, 2], index=['bar', 'foo'])
+
+            assert_series_equal(agged, expected, check_dtype=False)
+            #self.assert_(issubclass(agged.dtype.type, np.integer))

-        assert_series_equal(agged, expected, check_dtype=False)
-        self.assert_(issubclass(agged.dtype.type, np.integer))
+            # explicitly return a float from my function
+            def f(x):
+                return float(len(x))
+
+            agged = grouped.agg(f)
+            expected = Series([4, 2], index=['bar', 'foo'])
+
+            assert_series_equal(agged, expected, check_dtype=False)
+            self.assert_(issubclass(agged.dtype.type, np.dtype(dtype).type))

    def test_attr_wrapper(self):
        grouped = self.ts.groupby(lambda x: x.weekday())
@@ -1596,6 +1636,7 @@ def test_series_grouper_noncontig_index(self):
        grouped.agg(f)

    def test_convert_objects_leave_decimal_alone(self):
+        from decimal import Decimal

        s = Series(range(5))
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index 9deddb802d1bf..f39a6d3b3feec 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -20,20 +20,20 @@ def assert_block_equal(left, right):
    assert(left.ref_items.equals(right.ref_items))

-def get_float_mat(n, k):
-    return np.repeat(np.atleast_2d(np.arange(k, dtype=float)), n, axis=0)
+def get_float_mat(n, k, dtype):
+    return np.repeat(np.atleast_2d(np.arange(k, dtype=dtype)), n, axis=0)

TEST_COLS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
N = 10

-def get_float_ex(cols=['a', 'c', 'e']):
-    floats = get_float_mat(N, len(cols)).T
+def get_float_ex(cols=['a', 'c', 'e'], dtype = np.float_):
+    floats = get_float_mat(N, len(cols), dtype = dtype).T
    return make_block(floats, cols, TEST_COLS)

def get_complex_ex(cols=['h']):
-    complexes = (get_float_mat(N, 1).T * 1j).astype(np.complex128)
+    complexes = (get_float_mat(N, 1, dtype = np.float_).T * 1j).astype(np.complex128)
    return make_block(complexes, cols, TEST_COLS)
@@ -49,13 +49,8 @@ def get_bool_ex(cols=['f']):
    return make_block(mat.T, cols, TEST_COLS)

-def get_int_ex(cols=['g']):
-    mat = randn(N, 1).astype(int)
-    return make_block(mat.T, cols, TEST_COLS)
-
-
-def get_int32_ex(cols):
-    mat = randn(N, 1).astype(np.int32)
+def get_int_ex(cols=['g'], dtype = np.int_):
+    mat = randn(N, 1).astype(dtype)
    return make_block(mat.T, cols, TEST_COLS)

@@ -63,6 +58,16 @@ def get_dt_ex(cols=['h']):
    mat = randn(N, 1).astype(int).astype('M8[ns]')
    return make_block(mat.T, cols, TEST_COLS)

+def create_blockmanager(blocks):
+    l = []
+    for b in blocks:
+        l.extend(b.items)
+    items = Index(l)
+    for b in blocks:
+        b.ref_items = items
+
+    index_sz = blocks[0].values.shape[1]
+    return BlockManager(blocks, [items, np.arange(index_sz)])

class TestBlock(unittest.TestCase):
@@ -76,8 +81,8 @@ def setUp(self):
        self.int_block = get_int_ex()

    def test_constructor(self):
-        int32block = get_int32_ex(['a'])
-        self.assert_(int32block.dtype == np.int64)
+        int32block = get_int_ex(['a'],dtype = np.int32)
+        self.assert_(int32block.dtype == np.int32)

    def test_pickle(self):
        import pickle
@@ -235,12 +240,7 @@ def test_attrs(self):
    def test_is_mixed_dtype(self):
        self.assert_(self.mgr.is_mixed_dtype())

-        items = Index(['a', 'b'])
-        blocks = [get_bool_ex(['a']), get_bool_ex(['b'])]
-        for b in blocks:
-            b.ref_items = items
-
-        mgr = BlockManager(blocks, [items, np.arange(N)])
+        mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])])
        self.assert_(not mgr.is_mixed_dtype())

    def test_is_indexed_like(self):
@@ -254,9 +254,12 @@ def test_block_id_vector_item_dtypes(self):
        assert_almost_equal(expected, result)

        result = self.mgr.item_dtypes
+
+        # as the platform may not exactly match this, pseudo match
        expected = ['float64', 'object', 'float64', 'object', 'float64',
                    'bool', 'int64', 'complex128']
-        self.assert_(np.array_equal(result, expected))
+        for e, r in zip(expected, result):
+            np.dtype(e).kind == np.dtype(r).kind

    def test_duplicate_item_failure(self):
        items = Index(['a', 'a'])
@@ -315,7 +318,7 @@ def test_set_change_dtype(self):
        self.assert_(mgr2.get('baz').dtype == np.object_)

        mgr2.set('quux', randn(N).astype(int))
-        self.assert_(mgr2.get('quux').dtype == np.int64)
+        self.assert_(mgr2.get('quux').dtype == np.int_)

        mgr2.set('quux', randn(N))
        self.assert_(mgr2.get('quux').dtype == np.float_)
@@ -326,36 +329,110 @@ def test_copy(self):
        for cp_blk, blk in zip(shallow.blocks, self.mgr.blocks):
            self.assert_(cp_blk.values is blk.values)

-    def test_as_matrix(self):
-        pass
+    def test_as_matrix_float(self):
+
+        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16), get_float_ex(['e'],np.float64)])
+        self.assert_(mgr.as_matrix().dtype == np.float64)
+
+        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16)])
+        self.assert_(mgr.as_matrix().dtype == np.float32)

    def test_as_matrix_int_bool(self):
-        items = Index(['a', 'b'])
-        blocks = [get_bool_ex(['a']), get_bool_ex(['b'])]
-        for b in blocks:
-            b.ref_items = items

-        index_sz = blocks[0].values.shape[1]
-        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
+        mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])])
        self.assert_(mgr.as_matrix().dtype == np.bool_)

-        blocks = [get_int_ex(['a']), get_int_ex(['b'])]
-        for b in blocks:
-            b.ref_items = items
-
-        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
+        mgr = create_blockmanager([get_int_ex(['a'],np.int64), get_int_ex(['b'],np.int64), get_int_ex(['c'],np.int32), get_int_ex(['d'],np.int16), get_int_ex(['e'],np.uint8) ])
        self.assert_(mgr.as_matrix().dtype == np.int64)

-    def test_as_matrix_datetime(self):
-        items = Index(['h', 'g'])
-        blocks = [get_dt_ex(['h']), get_dt_ex(['g'])]
-        for b in blocks:
-            b.ref_items = items
+        mgr = create_blockmanager([get_int_ex(['c'],np.int32), get_int_ex(['d'],np.int16), get_int_ex(['e'],np.uint8) ])
+        self.assert_(mgr.as_matrix().dtype == np.int32)

-        index_sz = blocks[0].values.shape[1]
-        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
+    def test_as_matrix_datetime(self):
+        mgr = create_blockmanager([get_dt_ex(['h']), get_dt_ex(['g'])])
        self.assert_(mgr.as_matrix().dtype == 'M8[ns]')

+    def test_astype(self):
+
+        # coerce all
+        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16), get_float_ex(['e'],np.float64)])
+
+        for t in ['float16','float32','float64','int32','int64']:
+            tmgr = mgr.astype(t)
+            self.assert_(tmgr.as_matrix().dtype == np.dtype(t))
+
+        # mixed
+        mgr = create_blockmanager([get_obj_ex(['a','b']),get_bool_ex(['c']),get_dt_ex(['d']),get_float_ex(['e'],np.float32), get_float_ex(['f'],np.float16), get_float_ex(['g'],np.float64)])
+        for t in ['float16','float32','float64','int32','int64']:
+            tmgr = mgr.astype(t, raise_on_error = False).get_numeric_data()
+            self.assert_(tmgr.as_matrix().dtype == np.dtype(t))
+
+    def test_convert(self):
+
+        def _compare(old_mgr, new_mgr):
+            """ compare the blocks, numeric compare ==, object don't """
+            old_blocks = set(old_mgr.blocks)
+            new_blocks = set(new_mgr.blocks)
+            self.assert_(len(old_blocks) == len(new_blocks))
+
+            # compare non-numeric
+            for b in old_blocks:
+                found = False
+                for nb in new_blocks:
+                    if (b.values == nb.values).all():
+                        found = True
+                        break
+                self.assert_(found == True)
+
+            for b in new_blocks:
+                found = False
+                for ob in old_blocks:
+                    if (b.values == ob.values).all():
+                        found = True
+                        break
+                self.assert_(found == True)
+
+        # noops
+        mgr = create_blockmanager([get_int_ex(['f']), get_float_ex(['g'])])
+        new_mgr = mgr.convert()
+        _compare(mgr,new_mgr)
+
+        mgr = create_blockmanager([get_obj_ex(['a','b']), get_int_ex(['f']), get_float_ex(['g'])])
+        new_mgr = mgr.convert()
+        _compare(mgr,new_mgr)
+
+        # there could actually be multiple dtypes resulting
+        def _check(new_mgr,block_type, citems):
+            items = set()
+            for b in new_mgr.blocks:
+                if isinstance(b,block_type):
+                    for i in list(b.items):
+                        items.add(i)
+            self.assert_(items == set(citems))
+
+        # convert
+        mat = np.empty((N, 3), dtype=object)
+        mat[:, 0] = '1'
+        mat[:, 1] = '2.'
+        mat[:, 2] = 'foo'
+        b = make_block(mat.T, ['a','b','foo'], TEST_COLS)
+
+        mgr = create_blockmanager([b, get_int_ex(['f']), get_float_ex(['g'])])
+        new_mgr = mgr.convert(convert_numeric = True)
+
+        _check(new_mgr,FloatBlock,['b','g'])
+        _check(new_mgr,IntBlock,['a','f'])
+
+        mgr = create_blockmanager([b, get_int_ex(['f'],np.int32), get_bool_ex(['bool']), get_dt_ex(['dt']),
+                                   get_int_ex(['i'],np.int64), get_float_ex(['g'],np.float64), get_float_ex(['h'],np.float16)])
+        new_mgr = mgr.convert(convert_numeric = True)
+
+        _check(new_mgr,FloatBlock,['b','g','h'])
+        _check(new_mgr,IntBlock,['a','f','i'])
+        _check(new_mgr,ObjectBlock,['foo'])
+        _check(new_mgr,BoolBlock,['bool'])
+        _check(new_mgr,DatetimeBlock,['dt'])
+
    def test_xs(self):
        pass
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index a4df141fefef9..87b820faa3dc8 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -616,12 +616,22 @@ def test_sortlevel(self):
        assert_frame_equal(rs, self.frame.sortlevel(0))

    def test_sortlevel_large_cardinality(self):
-        # #2684
+
+        # #2684 (int64)
+        index = MultiIndex.from_arrays([np.arange(4000)]*3)
+        df = DataFrame(np.random.randn(4000), index=index, dtype = np.int64)
+
+        # it works!
+        result = df.sortlevel(0)
+        self.assertTrue(result.index.lexsort_depth == 3)
+
+        # #2684 (int32)
        index = MultiIndex.from_arrays([np.arange(4000)]*3)
-        df = DataFrame(np.random.randn(4000), index=index)
+        df = DataFrame(np.random.randn(4000), index=index, dtype = np.int32)

        # it works!
        result = df.sortlevel(0)
+        self.assert_((result.dtypes.values == df.dtypes.values).all() == True)
        self.assertTrue(result.index.lexsort_depth == 3)

    def test_delevel_infer_dtype(self):
@@ -723,7 +733,7 @@ def test_count_level_corner(self):
        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0],
-                             columns=df.columns).fillna(0).astype(int)
+                             columns=df.columns).fillna(0).astype(np.int64)
        assert_frame_equal(result, expected)

    def test_unstack(self):
@@ -734,6 +744,9 @@ def test_unstack(self):
        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

+        # test that int32 work
+        unstacked = self.ymd.astype(np.int32).unstack()
+
    def test_unstack_multiple_no_empty_columns(self):
        index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0),
                                        (1, 'baz', 1), (1, 'qux', 1)])
diff --git a/pandas/tests/test_ndframe.py b/pandas/tests/test_ndframe.py
index e017bf07039d7..0c004884c5559 100644
--- a/pandas/tests/test_ndframe.py
+++ b/pandas/tests/test_ndframe.py
@@ -24,7 +24,10 @@ def test_ndim(self):

    def test_astype(self):
        casted = self.ndf.astype(int)
-        self.assert_(casted.values.dtype == np.int64)
+        self.assert_(casted.values.dtype == np.int_)
+
+        casted = self.ndf.astype(np.int32)
+        self.assert_(casted.values.dtype == np.int32)

if __name__ == '__main__':
    import nose
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 235b3e153574c..07a02f18d8337 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -418,7 +418,7 @@ def test_setitem(self):
        # scalar
        self.panel['ItemG'] = 1
        self.panel['ItemE'] = True
-        self.assert_(self.panel['ItemG'].values.dtype == np.int64)
+        self.assert_(self.panel['ItemG'].values.dtype == np.int_)
        self.assert_(self.panel['ItemE'].values.dtype == np.bool_)

        # object dtype
@@ -782,6 +782,13 @@ def test_constructor_cast(self):
        assert_almost_equal(casted.values, exp_values)
        assert_almost_equal(casted2.values, exp_values)

+        casted = Panel(zero_filled._data, dtype=np.int32)
+        casted2 = Panel(zero_filled.values, dtype=np.int32)
+
+        exp_values = zero_filled.values.astype(np.int32)
+        assert_almost_equal(casted.values, exp_values)
+        assert_almost_equal(casted2.values, exp_values)
+
        # can't cast
        data = [[['foo', 'bar', 'baz']]]
        self.assertRaises(ValueError, Panel, data, dtype=float)
@@ -798,6 +805,30 @@ def test_constructor_observe_dtype(self):
                      minor_axis=range(3), dtype='O')
        self.assert_(panel.values.dtype == np.object_)

+    def test_constructor_dtypes(self):
+        # GH #797
+
+        def _check_dtype(panel, dtype):
+            for i in panel.items:
+                self.assert_(panel[i].values.dtype.name == dtype)
+
+        # only nan holding types allowed here
+        for dtype in ['float64','float32','object']:
+            panel = Panel(items=range(2),major_axis=range(10),minor_axis=range(5),dtype=dtype)
+            _check_dtype(panel,dtype)
+
+        for dtype in ['float64','float32','int64','int32','object']:
+            panel = Panel(np.array(np.random.randn(2,10,5),dtype=dtype),items=range(2),major_axis=range(10),minor_axis=range(5),dtype=dtype)
+            _check_dtype(panel,dtype)
+
+        for dtype in ['float64','float32','int64','int32','object']:
+            panel = Panel(np.array(np.random.randn(2,10,5),dtype='O'),items=range(2),major_axis=range(10),minor_axis=range(5),dtype=dtype)
+            _check_dtype(panel,dtype)
+
+        for dtype in ['float64','float32','int64','int32','object']:
+            panel = Panel(np.random.randn(2,10,5),items=range(2),major_axis=range(10),minor_axis=range(5),dtype=dtype)
+            _check_dtype(panel,dtype)
+
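Panel construction now honors an explicit ``dtype`` across all items (GH797); a minimal
sketch of what the test above checks::

    import numpy as np
    from pandas import Panel

    p = Panel(np.random.randn(2, 10, 5), items=range(2),
              major_axis=range(10), minor_axis=range(5), dtype='float32')

    for item in p.items:
        assert p[item].values.dtype == np.float32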
    def test_consolidate(self):
        self.assert_(self.panel._data.is_consolidated())
@@ -845,6 +876,11 @@ def test_ctor_dict(self):
                                 for k, v in dcasted.iteritems()))
        assert_panel_equal(result, expected)

+        result = Panel(dcasted, dtype=np.int32)
+        expected = Panel(dict((k, v.astype(np.int32))
+                              for k, v in dcasted.iteritems()))
+        assert_panel_equal(result, expected)
+
    def test_constructor_dict_mixed(self):
        data = dict((k, v.values) for k, v in self.panel.iterkv())
        result = Panel(data)
diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py
index e0180f475ca45..87bfba7c55cce 100644
--- a/pandas/tests/test_panel4d.py
+++ b/pandas/tests/test_panel4d.py
@@ -358,7 +358,7 @@ def test_setitem(self):
        # scalar
        self.panel4d['lG'] = 1
        self.panel4d['lE'] = True
-        self.assert_(self.panel4d['lG'].values.dtype == np.int64)
+        self.assert_(self.panel4d['lG'].values.dtype == np.int_)
        self.assert_(self.panel4d['lE'].values.dtype == np.bool_)

        # object dtype
@@ -592,6 +592,13 @@ def test_constructor_cast(self):
        assert_almost_equal(casted.values, exp_values)
        assert_almost_equal(casted2.values, exp_values)

+        casted = Panel4D(zero_filled._data, dtype=np.int32)
+        casted2 = Panel4D(zero_filled.values, dtype=np.int32)
+
+        exp_values = zero_filled.values.astype(np.int32)
+        assert_almost_equal(casted.values, exp_values)
+        assert_almost_equal(casted2.values, exp_values)
+
        # can't cast
        data = [[['foo', 'bar', 'baz']]]
        self.assertRaises(ValueError, Panel, data, dtype=float)
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 1ac61ea276cac..03708f6dc396d 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -142,7 +142,7 @@ def test_multilevel_name_print(self):
                    "qux   one      7",
                    "      two      8",
                    "      three    9",
-                    "Name: sth"]
+                    "Name: sth, Dtype: int64"]
        expected = "\n".join(expected)
        self.assertEquals(repr(s), expected)
@@ -2875,6 +2875,64 @@ def test_apply_dont_convert_dtype(self):
        result = s.apply(f, convert_dtype=False)
        self.assert_(result.dtype == object)

+    def test_convert_objects(self):
+
+        s = Series([1., 2, 3],index=['a','b','c'])
+        result = s.convert_objects(convert_dates=False,convert_numeric=True)
+        assert_series_equal(s,result)
+
+        # force numeric conversion
+        r = s.copy().astype('O')
+        r['a'] = '1'
+        result = r.convert_objects(convert_dates=False,convert_numeric=True)
+        assert_series_equal(s,result)
+
+        r = s.copy().astype('O')
+        r['a'] = '1.'
+        result = r.convert_objects(convert_dates=False,convert_numeric=True)
+        assert_series_equal(s,result)
+
+        r = s.copy().astype('O')
+        r['a'] = 'garbled'
+        expected = s.copy()
+        expected['a'] = np.nan
+        result = r.convert_objects(convert_dates=False,convert_numeric=True)
+        assert_series_equal(expected,result)
+
+        # dates
+        s = Series([datetime(2001,1,1,0,0), datetime(2001,1,2,0,0), datetime(2001,1,3,0,0) ])
+        s2 = Series([datetime(2001,1,1,0,0), datetime(2001,1,2,0,0), datetime(2001,1,3,0,0), 'foo', 1.0, 1, Timestamp('20010104'), '20010105'],dtype='O')
+
+        result = s.convert_objects(convert_dates=True,convert_numeric=False)
+        expected = Series([Timestamp('20010101'),Timestamp('20010102'),Timestamp('20010103')],dtype='M8[ns]')
+        assert_series_equal(expected,result)
+
+        result = s.convert_objects(convert_dates='coerce',convert_numeric=False)
+        assert_series_equal(expected,result)
+        result = s.convert_objects(convert_dates='coerce',convert_numeric=True)
+        assert_series_equal(expected,result)
+
+        expected = Series([Timestamp('20010101'),Timestamp('20010102'),Timestamp('20010103'),lib.NaT,lib.NaT,lib.NaT,Timestamp('20010104'),Timestamp('20010105')],dtype='M8[ns]')
+        result = s2.convert_objects(convert_dates='coerce',convert_numeric=False)
+        assert_series_equal(expected,result)
+        result = s2.convert_objects(convert_dates='coerce',convert_numeric=True)
+        assert_series_equal(expected,result)
+
+        # preserve all-nans (if convert_dates='coerce')
+        s = Series(['foo','bar',1,1.0],dtype='O')
+        result = s.convert_objects(convert_dates='coerce',convert_numeric=False)
+        assert_series_equal(result,s)
+
+        # preserve if non-object
+        s = Series([1],dtype='float32')
+        result = s.convert_objects(convert_dates='coerce',convert_numeric=False)
+        assert_series_equal(result,s)
+
+        #r = s.copy()
+        #r[0] = np.nan
+        #result = r.convert_objects(convert_dates=True,convert_numeric=False)
+        #self.assert_(result.dtype == 'M8[ns]')
+
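``convert_dates='coerce'`` converts datetime-likes to ``datetime64[ns]`` and sets anything
non-convertible to ``NaT``, while leaving all-non-date object Series alone; a condensed
sketch of the test above::

    from datetime import datetime
    import numpy as np
    from pandas import Series, Timestamp

    s = Series([datetime(2001, 1, 1), 'foo', 1.0,
                Timestamp('20010104'), '20010105'], dtype='O')

    result = s.convert_objects(convert_dates='coerce', convert_numeric=False)
    assert result.dtype == np.dtype('M8[ns]')  # 'foo' and 1.0 are now NaT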
    def test_apply_args(self):
        s = Series(['foo,bar'])
diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
index 7e5341fd5b311..eaeb3325685ec 100644
--- a/pandas/tests/test_tseries.py
+++ b/pandas/tests/test_tseries.py
@@ -458,7 +458,9 @@ def test_generate_bins(self):
                          values, [-3, -1], 'right')

    def test_group_bin_functions(self):
-        funcs = ['add', 'mean', 'prod', 'min', 'max', 'var']
+
+        dtypes = ['float32','float64']
+        funcs = ['add', 'mean', 'prod', 'min', 'max', 'var']

        np_funcs = {
            'add': np.sum,
@@ -470,71 +472,82 @@ def test_group_bin_functions(self):
        }

        for fname in funcs:
-            args = [getattr(algos, 'group_%s' % fname),
-                    getattr(algos, 'group_%s_bin' % fname),
-                    np_funcs[fname]]
-            self._check_versions(*args)
-
-    def _check_versions(self, irr_func, bin_func, np_func):
-        obj = self.obj
+            for d in dtypes:
+                check_less_precise = False
+                if d == 'float32':
+                    check_less_precise = True
+                args = [getattr(algos, 'group_%s_%s' % (fname,d)),
+                        getattr(algos, 'group_%s_bin_%s' % (fname,d)),
+                        np_funcs[fname],
+                        d,
+                        check_less_precise]
+                self._check_versions(*args)
+
+    def _check_versions(self, irr_func, bin_func, np_func, dtype, check_less_precise):
+        obj = self.obj.astype(dtype)

        cts = np.zeros(3, dtype=np.int64)
-        exp = np.zeros((3, 1), np.float64)
+        exp = np.zeros((3, 1), dtype)
        irr_func(exp, cts, obj, self.labels)

        # bin-based version
        bins = np.array([3, 6], dtype=np.int64)
-        out = np.zeros((3, 1), np.float64)
+        out = np.zeros((3, 1), dtype)
        counts = np.zeros(len(out), dtype=np.int64)
        bin_func(out, counts, obj, bins)

-        assert_almost_equal(out, exp)
+        assert_almost_equal(out, exp, check_less_precise=check_less_precise)

        bins = np.array([3, 9, 10], dtype=np.int64)
-        out = np.zeros((3, 1), np.float64)
+        out = np.zeros((3, 1), dtype)
        counts = np.zeros(len(out), dtype=np.int64)
        bin_func(out, counts, obj, bins)
        exp = np.array([np_func(obj[:3]), np_func(obj[3:9]),
                        np_func(obj[9:])],
-                       dtype=np.float64)
-        assert_almost_equal(out.squeeze(), exp)
+                       dtype=dtype)
+        assert_almost_equal(out.squeeze(), exp, check_less_precise=check_less_precise)

        # duplicate bins
        bins = np.array([3, 6, 10, 10], dtype=np.int64)
-        out = np.zeros((4, 1), np.float64)
+        out = np.zeros((4, 1), dtype)
        counts = np.zeros(len(out), dtype=np.int64)
        bin_func(out, counts, obj, bins)
        exp = np.array([np_func(obj[:3]), np_func(obj[3:6]),
                        np_func(obj[6:10]), np.nan],
-                       dtype=np.float64)
-        assert_almost_equal(out.squeeze(), exp)
+                       dtype=dtype)
+        assert_almost_equal(out.squeeze(), exp, check_less_precise=check_less_precise)

def test_group_ohlc():
-    obj = np.random.randn(20)

-    bins = np.array([6, 12], dtype=np.int64)
-    out = np.zeros((3, 4), np.float64)
-    counts = np.zeros(len(out), dtype=np.int64)
+    def _check(dtype):
+        obj = np.array(np.random.randn(20),dtype=dtype)

-    algos.group_ohlc(out, counts, obj[:, None], bins)
+        bins = np.array([6, 12], dtype=np.int64)
+        out = np.zeros((3, 4), dtype)
+        counts = np.zeros(len(out), dtype=np.int64)
+
+        func = getattr(algos,'group_ohlc_%s' % dtype)
+        func(out, counts, obj[:, None], bins)

-    def _ohlc(group):
-        if isnull(group).all():
-            return np.repeat(nan, 4)
-        return [group[0], group.max(), group.min(), group[-1]]
+        def _ohlc(group):
+            if isnull(group).all():
+                return np.repeat(nan, 4)
+            return [group[0], group.max(), group.min(), group[-1]]

-    expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]),
-                         _ohlc(obj[12:])])
+        expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]),
+                             _ohlc(obj[12:])])

-    assert_almost_equal(out, expected)
-    assert_almost_equal(counts, [6, 6, 8])
+        assert_almost_equal(out, expected)
+        assert_almost_equal(counts, [6, 6, 8])

-    obj[:6] = nan
-    algos.group_ohlc(out, counts, obj[:, None], bins)
-    expected[0] = nan
-    assert_almost_equal(out, expected)
+        obj[:6] = nan
+        func(out, counts, obj[:, None], bins)
+        expected[0] = nan
+        assert_almost_equal(out, expected)

+    _check('float32')
+    _check('float64')

def test_try_parse_dates():
    from dateutil.parser import parse
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index c058580ab0f45..3adfb38e6144b 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -2,6 +2,7 @@
SQL-style merge routines
"""

+import itertools
import numpy as np

from pandas.core.categorical import Factor
@@ -658,7 +659,7 @@ def _prepare_blocks(self):
            join_blocks = unit.get_upcasted_blocks()
            type_map = {}
            for blk in join_blocks:
-                type_map.setdefault(type(blk), []).append(blk)
+                type_map.setdefault(blk.dtype, []).append(blk)
            blockmaps.append((unit, type_map))
        return blockmaps
@@ -985,7 +986,8 @@ def _prepare_blocks(self):
        blockmaps = []
        for data in reindexed_data:
            data = data.consolidate()
-            type_map = dict((type(blk), blk) for blk in data.blocks)
+
+            type_map = dict((blk.dtype, blk) for blk in data.blocks)
            blockmaps.append(type_map)
        return blockmaps, reindexed_data
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 47ab02d892c3f..8820d43975885 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -287,7 +287,7 @@ def test_join_index_mixed(self):
        df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                        index=np.arange(10),
                        columns=['A', 'B', 'C', 'D'])
-        self.assert_(df1['B'].dtype == np.int64)
+        self.assert_(df1['B'].dtype == np.int)
        self.assert_(df1['D'].dtype == np.bool_)

        df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
@@ -422,23 +422,27 @@ def test_join_hierarchical_mixed(self):
        self.assertTrue('b' in result)

    def test_join_float64_float32(self):
-        a = DataFrame(randn(10, 2), columns=['a', 'b'])
-        b = DataFrame(randn(10, 1), columns=['c']).astype(np.float32)
-        joined = a.join(b)
-        expected = a.join(b.astype('f8'))
-        assert_frame_equal(joined, expected)

-        joined = b.join(a)
-        assert_frame_equal(expected, joined.reindex(columns=['a', 'b', 'c']))
+        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype = np.float64)
+        b = DataFrame(randn(10, 1), columns=['c'], dtype = np.float32)
+        joined = a.join(b)
+        self.assert_(joined.dtypes['a'] == 'float64')
+        self.assert_(joined.dtypes['b'] == 'float64')
+        self.assert_(joined.dtypes['c'] == 'float32')

-        a = np.random.randint(0, 5, 100)
-        b = np.random.random(100).astype('Float64')
-        c = np.random.random(100).astype('Float32')
+        a = np.random.randint(0, 5, 100).astype('int64')
+        b = np.random.random(100).astype('float64')
+        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
-        xpdf = DataFrame({'a': a, 'b': b, 'c': c.astype('Float64')})
-        s = DataFrame(np.random.random(5).astype('f'), columns=['md'])
+        xpdf = DataFrame({'a': a, 'b': b, 'c': c })
+        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
-        xp = xpdf.merge(s.astype('f8'), left_on='a', right_index=True)
+        self.assert_(rs.dtypes['a'] == 'int64')
+        self.assert_(rs.dtypes['b'] == 'float64')
+        self.assert_(rs.dtypes['c'] == 'float32')
+        self.assert_(rs.dtypes['md'] == 'float32')
+
+        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
@@ -591,7 +595,7 @@ def test_intelligently_handle_join_key(self):
                                               np.nan, np.nan]),
                              'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
                             columns=['value', 'key', 'rvalue'])
-        assert_frame_equal(joined, expected)
+        assert_frame_equal(joined, expected, check_dtype=False)

        self.assert_(joined._data.is_consolidated())
@@ -801,7 +805,25 @@ def test_left_join_index_preserve_order(self):

        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
-                          'v': np.arange(24)})
+                          'v': np.array(np.arange(24),dtype=np.int64) })
+
+        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
+        right = DataFrame({'v2': [5, 7]}, index=index)
+
+        result = left.join(right, on=['k1', 'k2'])
+
+        expected = left.copy()
+        expected['v2'] = np.nan
+        expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
+        expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7
+
+        tm.assert_frame_equal(result, expected)
+
+        # test join with multi dtypes blocks
+        left = DataFrame({'k1': [0, 1, 2] * 8,
+                          'k2': ['foo', 'bar'] * 12,
+                          'k3' : np.array([0, 1, 2]*8, dtype=np.float32),
+                          'v': np.array(np.arange(24),dtype=np.int32) })

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)
@@ -820,6 +842,33 @@ def test_left_join_index_preserve_order(self):
                           right_on=['k1', 'k2'], how='right')
        tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
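With blocks keyed by dtype rather than block class in ``merge.py`` above, joins and merges
keep ``float32`` columns as ``float32``; a minimal sketch::

    import numpy as np
    from numpy.random import randn
    from pandas import DataFrame

    a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
    b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)

    joined = a.join(b)
    assert joined.dtypes['a'] == np.float64
    assert joined.dtypes['c'] == np.float32  # no upcast to float64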
+    def test_join_multi_dtypes(self):
+
+        # test with multi dtypes in the join index
+        def _test(dtype1,dtype2):
+            left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
+                              'k2': ['foo', 'bar'] * 12,
+                              'v': np.array(np.arange(24),dtype=np.int64) })
+
+            index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
+            right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)
+
+            result = left.join(right, on=['k1', 'k2'])
+
+            expected = left.copy()
+
+            if dtype2.kind == 'i':
+                dtype2 = np.dtype('float64')
+            expected['v2'] = np.array(np.nan,dtype=dtype2)
+            expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
+            expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7
+
+            tm.assert_frame_equal(result, expected)
+
+        for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]:
+            for d2 in [np.int64,np.float64,np.float32,np.float16]:
+                _test(np.dtype(d1),np.dtype(d2))
+
    def test_left_merge_na_buglet(self):
        left = DataFrame({'id': list('abcde'), 'v1': randn(5),
                          'v2': randn(5), 'dummy': list('abcde'),
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 4d81119bd4a34..29b844d330af2 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -550,17 +550,20 @@ def test_resample_not_monotonic(self):
        assert_series_equal(result, exp)

    def test_resample_median_bug_1688(self):
-        df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0),
-                                      datetime(2012, 1, 1, 0, 5, 0)])
-
-        result = df.resample("T", how=lambda x: x.mean())
-        exp = df.asfreq('T')
-        tm.assert_frame_equal(result, exp)
-
-        result = df.resample("T", how="median")
-        exp = df.asfreq('T')
-        tm.assert_frame_equal(result, exp)
+
+        for dtype in ['int64','int32','float64','float32']:
+            df = DataFrame([1, 2],
+                           index=[datetime(2012, 1, 1, 0, 0, 0),
+                                  datetime(2012, 1, 1, 0, 5, 0)],
+                           dtype = dtype)
+
+            result = df.resample("T", how=lambda x: x.mean())
+            exp = df.asfreq('T')
+            tm.assert_frame_equal(result, exp)
+
+            result = df.resample("T", how="median")
+            exp = df.asfreq('T')
+            tm.assert_frame_equal(result, exp)

    def test_how_lambda_functions(self):
        ts = _simple_ts('1/1/2000', '4/1/2000')
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index aa12d6142d6d8..861a8aa9d3a95 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -567,7 +567,8 @@ def test_series_repr_nat(self):
        expected = ('0   1970-01-01 00:00:00\n'
                    '1   1970-01-01 00:00:00.000001\n'
                    '2   1970-01-01 00:00:00.000002\n'
-                    '3                       NaT')
+                    '3                       NaT\n'
+                    'Dtype: datetime64[ns]')
        self.assertEquals(result, expected)

    def test_fillna_nat(self):
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index 98d1d81b9aabc..8cd710283464a 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -774,7 +774,7 @@ def datetime_to_datetime64(ndarray[object] values):

def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
-                      format=None, utc=None):
+                      format=None, utc=None, coerce=False):
    cdef:
        Py_ssize_t i, n = len(values)
        object val
@@ -813,14 +813,16 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
                _check_dts_bounds(iresult[i], &dts)
            elif util.is_datetime64_object(val):
                iresult[i] = _get_datetime64_nanos(val)
-            elif util.is_integer_object(val):
+
+            # if we are coercing, don't allow integers
+            elif util.is_integer_object(val) and not coerce:
                iresult[i] = val
            else:
-                if len(val) == 0:
-                    iresult[i] = iNaT
-                    continue
-
                try:
+                    if len(val) == 0:
+                        iresult[i] = iNaT
+                        continue
+
                    _string_to_dts(val, &dts)
                    iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns,
                                                                   &dts)
@@ -829,10 +831,19 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
                    try:
                        result[i] = parse(val, dayfirst=dayfirst)
                    except Exception:
+                        if coerce:
+                            iresult[i] = iNaT
+                            continue
                        raise TypeError
                    pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns, &dts)
                    _check_dts_bounds(iresult[i], &dts)
+                except:
+                    if coerce:
+                        iresult[i] = iNaT
+                        continue
+                    raise
+
        return result
    except TypeError:
        oresult = np.empty(n, dtype=object)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index a9a6bab893ac1..702ae7d5c72ef 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -88,7 +88,7 @@ def isiterable(obj):
    return hasattr(obj, '__iter__')

-def assert_almost_equal(a, b):
+def assert_almost_equal(a, b, check_less_precise = False):
    if isinstance(a, dict) or isinstance(b, dict):
        return assert_dict_equal(a, b)
@@ -103,7 +103,7 @@ def assert_almost_equal(a, b):
            return True
        else:
            for i in xrange(len(a)):
-                assert_almost_equal(a[i], b[i])
+                assert_almost_equal(a[i], b[i], check_less_precise)
            return True

    err_msg = lambda a, b: 'expected %.5f but got %.5f' % (a, b)
@@ -112,16 +112,29 @@ def assert_almost_equal(a, b):
        np.testing.assert_(isnull(b))
        return

-    if isinstance(a, (bool, float, int)):
+    if isinstance(a, (bool, float, int, np.float32)):
+        decimal = 5
+
+        # deal with differing dtypes
+        if check_less_precise:
+            dtype_a = np.dtype(a)
+            dtype_b = np.dtype(b)
+            if dtype_a.kind == 'i' and dtype_b == 'i':
+                pass
+            if dtype_a.kind == 'f' and dtype_b == 'f':
+                if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4:
+                    decimal = 3
+
        if np.isinf(a):
            assert np.isinf(b), err_msg(a, b)

+        # case for zero
        elif abs(a) < 1e-5:
            np.testing.assert_almost_equal(
-                a, b, decimal=5, err_msg=err_msg(a, b), verbose=False)
+                a, b, decimal=decimal, err_msg=err_msg(a, b), verbose=False)
        else:
            np.testing.assert_almost_equal(
-                1, a / b, decimal=5, err_msg=err_msg(a, b), verbose=False)
+                1, a / b, decimal=decimal, err_msg=err_msg(a, b), verbose=False)
    else:
        assert(a == b)
@@ -144,10 +157,11 @@ def assert_dict_equal(a, b, compare_keys=True):

def assert_series_equal(left, right, check_dtype=True,
                        check_index_type=False,
                        check_index_freq=False,
-                        check_series_type=False):
+                        check_series_type=False,
+                        check_less_precise=False):
    if check_series_type:
        assert(type(left) == type(right))
-    assert_almost_equal(left.values, right.values)
+    assert_almost_equal(left.values, right.values, check_less_precise)
    if check_dtype:
        assert(left.dtype == right.dtype)
    assert(left.index.equals(right.index))
@@ -160,9 +174,11 @@ def assert_series_equal(left, right, check_dtype=True,
               getattr(right, 'freqstr', None))

-def assert_frame_equal(left, right, check_index_type=False,
+def assert_frame_equal(left, right, check_dtype=True,
+                       check_index_type=False,
                       check_column_type=False,
-                       check_frame_type=False):
+                       check_frame_type=False,
+                       check_less_precise=False):
    if check_frame_type:
        assert(type(left) == type(right))
    assert(isinstance(left, DataFrame))
@@ -175,7 +191,10 @@ def assert_frame_equal(left, right, check_index_type=False,
        assert(col in right)
        lcol = left.icol(i)
        rcol = right.icol(i)
-        assert_series_equal(lcol, rcol)
+        assert_series_equal(lcol, rcol,
+                            check_dtype=check_dtype,
+                            check_index_type=check_index_type,
+                            check_less_precise=check_less_precise)

    if check_index_type:
        assert(type(left.index) == type(right.index))
@@ -187,7 +206,9 @@ def assert_frame_equal(left, right, check_index_type=False,
        assert(left.columns.inferred_type == right.columns.inferred_type)

-def assert_panel_equal(left, right, check_panel_type=False):
+def assert_panel_equal(left, right,
+                       check_panel_type=False,
+                       check_less_precise=False):
    if check_panel_type:
        assert(type(left) == type(right))
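With ``check_less_precise=True``, two 4-byte floats compare at 3 decimals instead of 5, so
float32 results can be checked against float64 math; a sketch assuming the signatures added
above::

    import numpy as np
    import pandas.util.testing as tm
    from pandas import Series

    left = Series(np.array([0.123456, 1.234567], dtype='float32'))
    # simulate a small float32 rounding discrepancy
    right = (left.astype('float64') * (1 + 5e-4)).astype('float32')

    # passes at 3 decimals; would fail at the default 5
    tm.assert_series_equal(left, right, check_less_precise=True)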
@@ -197,13 +218,14 @@ def assert_panel_equal(left, right, check_panel_type=False):

    for col, series in left.iterkv():
        assert(col in right)
-        assert_frame_equal(series, right[col])
+        assert_frame_equal(series, right[col], check_less_precise=check_less_precise)

    for col in right:
        assert(col in left)

-def assert_panel4d_equal(left, right):
+def assert_panel4d_equal(left, right,
+                         check_less_precise=False):
    assert(left.labels.equals(right.labels))
    assert(left.items.equals(right.items))
    assert(left.major_axis.equals(right.major_axis))
@@ -211,7 +233,7 @@ def assert_panel4d_equal(left, right):

    for col, series in left.iterkv():
        assert(col in right)
-        assert_panel_equal(series, right[col])
+        assert_panel_equal(series, right[col], check_less_precise=check_less_precise)

    for col in right:
        assert(col in left)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index 502752d9ec6a6..caa09c219a866 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -177,15 +177,24 @@ def f():
data = Series(randn(len(labels)))
data[::3] = np.nan
data[1::3] = np.nan
+data2 = Series(randn(len(labels)),dtype='float32')
+data2[::3] = np.nan
+data2[1::3] = np.nan
labels = labels.take(np.random.permutation(len(labels)))
"""

groupby_first = Benchmark('data.groupby(labels).first()', setup,
                          start_date=datetime(2012, 5, 1))
+groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup,
+                                  start_date=datetime(2013, 1, 1))
+
groupby_last = Benchmark('data.groupby(labels).last()', setup,
                         start_date=datetime(2012, 5, 1))

+groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
+                                 start_date=datetime(2013, 1, 1))
+
#----------------------------------------------------------------------
# groupby_indices replacement, chop up Series
diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py
index 2f675636ee928..acf8f6f043bad 100644
--- a/vb_suite/reindex.py
+++ b/vb_suite/reindex.py
@@ -56,6 +56,7 @@
ts = Series(np.random.randn(len(rng)), index=rng)
ts2 = ts[::2]
ts3 = ts2.reindex(ts.index)
+ts4 = ts3.astype('float32')

def pad():
    try:
@@ -81,9 +82,16 @@ def backfill():
                                    name="reindex_fillna_pad",
                                    start_date=datetime(2011, 3, 1))

+reindex_fillna_pad_float32 = Benchmark("ts4.fillna(method='pad')", setup,
+                                       name="reindex_fillna_pad_float32",
+                                       start_date=datetime(2013, 1, 1))
+
reindex_fillna_backfill = Benchmark("ts3.fillna(method='backfill')", setup,
                                    name="reindex_fillna_backfill",
                                    start_date=datetime(2011, 3, 1))

+reindex_fillna_backfill_float32 = Benchmark("ts4.fillna(method='backfill')", setup,
+                                            name="reindex_fillna_backfill_float32",
+                                            start_date=datetime(2013, 1, 1))

#----------------------------------------------------------------------
# align on level