diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 40c8cabe3cb9a..cac9c5ccc7a6d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -2018,7 +2018,7 @@ def group_median(ndarray[float64_t, ndim=2] out, data = np.empty((K, N), dtype=np.float64) ptr = data.data - take_2d_axis1_float64(values.T, indexer, out=data) + take_2d_axis1_float64_float64(values.T, indexer, out=data) for i in range(K): # exclude NA group diff --git a/pandas/core/common.py b/pandas/core/common.py index c99fd87f7a643..f83b218a1ae98 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1,12 +1,14 @@ """ Misc tools for implementing data structures """ +# XXX: HACK for NumPy 1.5.1 to suppress warnings try: import cPickle as pickle except ImportError: # pragma: no cover import pickle import itertools +from datetime import datetime from numpy.lib.format import read_array, write_array import numpy as np @@ -244,231 +246,333 @@ def _unpickle_array(bytes): return arr -def _view_wrapper(f, wrap_dtype, na_override=None): +def _view_wrapper(f, arr_dtype, out_dtype, fill_wrap=None): def wrapper(arr, indexer, out, fill_value=np.nan): - if na_override is not None and np.isnan(fill_value): - fill_value = na_override - view = arr.view(wrap_dtype) - outview = out.view(wrap_dtype) - f(view, indexer, outview, fill_value=fill_value) + if arr_dtype is not None: + arr = arr.view(arr_dtype) + if out_dtype is not None: + out = out.view(out_dtype) + if fill_wrap is not None: + fill_value = fill_wrap(fill_value) + f(arr, indexer, out, fill_value=fill_value) return wrapper -_take1d_dict = { - 'float64': algos.take_1d_float64, - 'float32': algos.take_1d_float32, - 'int8': algos.take_1d_int8, - 'int16': algos.take_1d_int16, - 'int32': algos.take_1d_int32, - 'int64': algos.take_1d_int64, - 'object': algos.take_1d_object, - 'bool': _view_wrapper(algos.take_1d_bool, np.uint8), - 'datetime64[ns]': _view_wrapper(algos.take_1d_int64, np.int64, - na_override=tslib.iNaT), -} +def _datetime64_fill_wrap(fill_value): + if isnull(fill_value): + return tslib.iNaT + try: + return lib.Timestamp(fill_value).value + except: + # the proper thing to do here would probably be to upcast to object + # (but numpy 1.6.1 doesn't do this properly) + return tslib.iNaT + -_take2d_axis0_dict = { - 'float64': algos.take_2d_axis0_float64, - 'float32': algos.take_2d_axis0_float32, - 'int8': algos.take_2d_axis0_int8, - 'int16': algos.take_2d_axis0_int16, - 'int32': algos.take_2d_axis0_int32, - 'int64': algos.take_2d_axis0_int64, - 'object': algos.take_2d_axis0_object, - 'bool': _view_wrapper(algos.take_2d_axis0_bool, np.uint8), - 'datetime64[ns]': _view_wrapper(algos.take_2d_axis0_int64, np.int64, - na_override=tslib.iNaT), +def _convert_wrapper(f, conv_dtype): + def wrapper(arr, indexer, out, fill_value=np.nan): + arr = arr.astype(conv_dtype) + f(arr, indexer, out, fill_value=fill_value) + return wrapper + + +def _take_2d_multi_generic(arr, indexer, out, fill_value=np.nan): + # this is not ideal, performance-wise, but it's better than + # raising an exception + if arr.shape[0] == 0 or arr.shape[1] == 0: + return + row_idx, col_idx = indexer + row_mask = row_idx == -1 + col_mask = col_idx == -1 + if fill_value is not None: + if row_mask.any(): + out[row_mask, :] = fill_value + if col_mask.any(): + out[:, col_mask] = fill_value + for i in range(len(row_idx)): + u = row_idx[i] + for j in range(len(col_idx)): + v = col_idx[j] + out[i, j] = arr[u, v] + + +def _take_nd_generic(arr, indexer, out, axis=0, fill_value=np.nan): + if arr.shape[axis] == 0: + return + mask = 
indexer == -1 + needs_masking = mask.any() + if arr.dtype != out.dtype: + arr = arr.astype(out.dtype) + ndtake(arr, indexer, axis=axis, out=out) + if needs_masking: + outindexer = [slice(None)] * arr.ndim + outindexer[axis] = mask + out[tuple(outindexer)] = fill_value + + +_take_1d_dict = { + ('int8', 'int8'): algos.take_1d_int8_int8, + ('int8', 'int32'): algos.take_1d_int8_int32, + ('int8', 'int64'): algos.take_1d_int8_int64, + ('int8', 'float64'): algos.take_1d_int8_float64, + ('int16', 'int16'): algos.take_1d_int16_int16, + ('int16', 'int32'): algos.take_1d_int16_int32, + ('int16', 'int64'): algos.take_1d_int16_int64, + ('int16', 'float64'): algos.take_1d_int16_float64, + ('int32', 'int32'): algos.take_1d_int32_int32, + ('int32', 'int64'): algos.take_1d_int32_int64, + ('int32', 'float64'): algos.take_1d_int32_float64, + ('int64', 'int64'): algos.take_1d_int64_int64, + ('int64', 'float64'): algos.take_1d_int64_float64, + ('float32', 'float32'): algos.take_1d_float32_float32, + ('float32', 'float64'): algos.take_1d_float32_float64, + ('float64', 'float64'): algos.take_1d_float64_float64, + ('object', 'object'): algos.take_1d_object_object, + ('bool', 'bool'): + _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), + ('bool', 'object'): + _view_wrapper(algos.take_1d_bool_object, np.uint8, None), + ('datetime64[ns]','datetime64[ns]'): + _view_wrapper(algos.take_1d_int64_int64, np.int64, np.int64, + fill_wrap=_datetime64_fill_wrap) } -_take2d_axis1_dict = { - 'float64': algos.take_2d_axis1_float64, - 'float32': algos.take_2d_axis1_float32, - 'int8': algos.take_2d_axis1_int8, - 'int16': algos.take_2d_axis1_int16, - 'int32': algos.take_2d_axis1_int32, - 'int64': algos.take_2d_axis1_int64, - 'object': algos.take_2d_axis1_object, - 'bool': _view_wrapper(algos.take_2d_axis1_bool, np.uint8), - 'datetime64[ns]': _view_wrapper(algos.take_2d_axis1_int64, np.int64, - na_override=tslib.iNaT), + +_take_2d_axis0_dict = { + ('int8', 'int8'): algos.take_2d_axis0_int8_int8, + ('int8', 'int32'): algos.take_2d_axis0_int8_int32, + ('int8', 'int64'): algos.take_2d_axis0_int8_int64, + ('int8', 'float64'): algos.take_2d_axis0_int8_float64, + ('int16', 'int16'): algos.take_2d_axis0_int16_int16, + ('int16', 'int32'): algos.take_2d_axis0_int16_int32, + ('int16', 'int64'): algos.take_2d_axis0_int16_int64, + ('int16', 'float64'): algos.take_2d_axis0_int16_float64, + ('int32', 'int32'): algos.take_2d_axis0_int32_int32, + ('int32', 'int64'): algos.take_2d_axis0_int32_int64, + ('int32', 'float64'): algos.take_2d_axis0_int32_float64, + ('int64', 'int64'): algos.take_2d_axis0_int64_int64, + ('int64', 'float64'): algos.take_2d_axis0_int64_float64, + ('float32', 'float32'): algos.take_2d_axis0_float32_float32, + ('float32', 'float64'): algos.take_2d_axis0_float32_float64, + ('float64', 'float64'): algos.take_2d_axis0_float64_float64, + ('object', 'object'): algos.take_2d_axis0_object_object, + ('bool', 'bool'): + _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), + ('bool', 'object'): + _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None), + ('datetime64[ns]','datetime64[ns]'): + _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64, + fill_wrap=_datetime64_fill_wrap) } -_take2d_multi_dict = { - 'float64': algos.take_2d_multi_float64, - 'float32': algos.take_2d_multi_float32, - 'int8': algos.take_2d_multi_int8, - 'int16': algos.take_2d_multi_int16, - 'int32': algos.take_2d_multi_int32, - 'int64': algos.take_2d_multi_int64, - 'object': algos.take_2d_multi_object, - 'bool': 
_view_wrapper(algos.take_2d_multi_bool, np.uint8), - 'datetime64[ns]': _view_wrapper(algos.take_2d_multi_int64, np.int64, - na_override=tslib.iNaT), + +_take_2d_axis1_dict = { + ('int8', 'int8'): algos.take_2d_axis1_int8_int8, + ('int8', 'int32'): algos.take_2d_axis1_int8_int32, + ('int8', 'int64'): algos.take_2d_axis1_int8_int64, + ('int8', 'float64'): algos.take_2d_axis1_int8_float64, + ('int16', 'int16'): algos.take_2d_axis1_int16_int16, + ('int16', 'int32'): algos.take_2d_axis1_int16_int32, + ('int16', 'int64'): algos.take_2d_axis1_int16_int64, + ('int16', 'float64'): algos.take_2d_axis1_int16_float64, + ('int32', 'int32'): algos.take_2d_axis1_int32_int32, + ('int32', 'int64'): algos.take_2d_axis1_int32_int64, + ('int32', 'float64'): algos.take_2d_axis1_int32_float64, + ('int64', 'int64'): algos.take_2d_axis1_int64_int64, + ('int64', 'float64'): algos.take_2d_axis1_int64_float64, + ('float32', 'float32'): algos.take_2d_axis1_float32_float32, + ('float32', 'float64'): algos.take_2d_axis1_float32_float64, + ('float64', 'float64'): algos.take_2d_axis1_float64_float64, + ('object', 'object'): algos.take_2d_axis1_object_object, + ('bool', 'bool'): + _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), + ('bool', 'object'): + _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), + ('datetime64[ns]','datetime64[ns]'): + _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64, + fill_wrap=_datetime64_fill_wrap) } -_dtypes_no_na = set(['int8','int16','int32', 'int64', 'bool']) -_dtypes_na = set(['float32', 'float64', 'object', 'datetime64[ns]']) -def _get_take2d_function(dtype_str, axis=0): - if axis == 0: - return _take2d_axis0_dict[dtype_str] - elif axis == 1: - return _take2d_axis1_dict[dtype_str] - elif axis == 'multi': - return _take2d_multi_dict[dtype_str] - else: # pragma: no cover - raise ValueError('bad axis: %s' % axis) +_take_2d_multi_dict = { + ('int8', 'int8'): algos.take_2d_multi_int8_int8, + ('int8', 'int32'): algos.take_2d_multi_int8_int32, + ('int8', 'int64'): algos.take_2d_multi_int8_int64, + ('int8', 'float64'): algos.take_2d_multi_int8_float64, + ('int16', 'int16'): algos.take_2d_multi_int16_int16, + ('int16', 'int32'): algos.take_2d_multi_int16_int32, + ('int16', 'int64'): algos.take_2d_multi_int16_int64, + ('int16', 'float64'): algos.take_2d_multi_int16_float64, + ('int32', 'int32'): algos.take_2d_multi_int32_int32, + ('int32', 'int64'): algos.take_2d_multi_int32_int64, + ('int32', 'float64'): algos.take_2d_multi_int32_float64, + ('int64', 'int64'): algos.take_2d_multi_int64_int64, + ('int64', 'float64'): algos.take_2d_multi_int64_float64, + ('float32', 'float32'): algos.take_2d_multi_float32_float32, + ('float32', 'float64'): algos.take_2d_multi_float32_float64, + ('float64', 'float64'): algos.take_2d_multi_float64_float64, + ('object', 'object'): algos.take_2d_multi_object_object, + ('bool', 'bool'): + _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), + ('bool', 'object'): + _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), + ('datetime64[ns]','datetime64[ns]'): + _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64, + fill_wrap=_datetime64_fill_wrap) +} -def take_1d(arr, indexer, out=None, fill_value=np.nan): - """ - Specialized Cython take which sets NaN values in one pass - """ - dtype_str = arr.dtype.name +def _get_take_1d_function(dtype, out_dtype): + try: + return _take_1d_dict[dtype.name, out_dtype.name] + except KeyError: + pass - n = len(indexer) + if dtype != out_dtype: + try: + func = 
_take_1d_dict[out_dtype.name, out_dtype.name] + return _convert_wrapper(func, out_dtype) + except KeyError: + pass - indexer = _ensure_int64(indexer) + def wrapper(arr, indexer, out, fill_value=np.nan): + return _take_nd_generic(arr, indexer, out, axis=0, + fill_value=fill_value) + return wrapper - out_passed = out is not None - take_f = _take1d_dict.get(dtype_str) - if dtype_str in _dtypes_no_na: +def _get_take_2d_function(dtype, out_dtype, axis=0): + try: + if axis == 0: + return _take_2d_axis0_dict[dtype.name, out_dtype.name] + elif axis == 1: + return _take_2d_axis1_dict[dtype.name, out_dtype.name] + elif axis == 'multi': + return _take_2d_multi_dict[dtype.name, out_dtype.name] + else: # pragma: no cover + raise ValueError('bad axis: %s' % axis) + except KeyError: + pass + + if dtype != out_dtype: try: - if out is None: - out = np.empty(n, dtype=arr.dtype) - take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) - except ValueError: - mask = indexer == -1 - if len(arr) == 0: - if not out_passed: - out = np.empty(n, dtype=arr.dtype) + if axis == 0: + func = _take_2d_axis0_dict[out_dtype.name, out_dtype.name] + elif axis == 1: + func = _take_2d_axis1_dict[out_dtype.name, out_dtype.name] else: - out = ndtake(arr, indexer, out=out) - if mask.any(): - if out_passed: - raise Exception('out with dtype %s does not support NA' % - out.dtype) - out = _maybe_upcast(out) - np.putmask(out, mask, fill_value) - elif dtype_str in _dtypes_na: - if out is None: - out = np.empty(n, dtype=arr.dtype) - take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) - else: - out = ndtake(arr, indexer, out=out) - mask = indexer == -1 - if mask.any(): - if out_passed: - raise Exception('out with dtype %s does not support NA' % - out.dtype) - out = _maybe_upcast(out) - np.putmask(out, mask, fill_value) + func = _take_2d_multi_dict[out_dtype.name, out_dtype.name] + return _convert_wrapper(func, out_dtype) + except KeyError: + pass - return out + if axis == 'multi': + return _take_2d_multi_generic + def wrapper(arr, indexer, out, fill_value=np.nan): + return _take_nd_generic(arr, indexer, out, axis=axis, + fill_value=fill_value) + return wrapper -def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None): - dtype_str = arr.dtype.name +def _get_take_nd_function(ndim, dtype, out_dtype, axis=0): + if ndim == 2: + return _get_take_2d_function(dtype, out_dtype, axis=axis) + elif ndim == 1: + if axis != 0: + raise ValueError('axis must be 0 for one dimensional array') + return _get_take_1d_function(dtype, out_dtype) + elif ndim <= 0: + raise ValueError('ndim must be >= 1') - out_shape = len(row_idx), len(col_idx) + def wrapper(arr, indexer, out, fill_value=np.nan): + return _take_nd_generic(arr, indexer, out, axis=axis, + fill_value=fill_value) + if (dtype.name, out_dtype.name) == ('datetime64[ns]','datetime64[ns]'): + wrapper = _view_wrapper(wrapper, np.int64, np.int64, + fill_wrap=_datetime64_fill_wrap) + return wrapper - if dtype_str in _dtypes_no_na: - row_mask = row_idx == -1 - col_mask = col_idx == -1 - needs_masking = row_mask.any() or col_mask.any() - if needs_masking: - return take_2d_multi(_maybe_upcast(arr), row_idx, col_idx, - fill_value=fill_value, out=out) - else: - if out is None: - out = np.empty(out_shape, dtype=arr.dtype) - take_f = _get_take2d_function(dtype_str, axis='multi') - take_f(arr, _ensure_int64(row_idx), - _ensure_int64(col_idx), out=out, - fill_value=fill_value) - return out - elif dtype_str in _dtypes_na: - if out is None: - out = np.empty(out_shape, 
dtype=arr.dtype) - take_f = _get_take2d_function(dtype_str, axis='multi') - take_f(arr, _ensure_int64(row_idx), _ensure_int64(col_idx), out=out, - fill_value=fill_value) - return out +def take_1d(arr, indexer, out=None, fill_value=np.nan): + """ + Specialized Cython take which sets NaN values in one pass + """ + if indexer is None: + indexer = np.arange(len(arr), dtype=np.int64) + dtype, fill_value = arr.dtype, arr.dtype.type() else: - if out is not None: - raise ValueError('Cannot pass out in this case') + indexer = _ensure_int64(indexer) + dtype = _maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + mask = indexer == -1 + needs_masking = mask.any() + if needs_masking: + if out is not None and out.dtype != dtype: + raise Exception('Incompatible type for fill_value') + else: + dtype, fill_value = arr.dtype, arr.dtype.type() - return take_2d(take_2d(arr, row_idx, axis=0, fill_value=fill_value), - col_idx, axis=1, fill_value=fill_value) + if out is None: + out = np.empty(len(indexer), dtype=dtype) + take_f = _get_take_1d_function(arr.dtype, out.dtype) + take_f(arr, indexer, out=out, fill_value=fill_value) + return out -def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0, - fill_value=np.nan): +def take_nd(arr, indexer, out=None, axis=0, fill_value=np.nan): """ Specialized Cython take which sets NaN values in one pass """ - dtype_str = arr.dtype.name + if indexer is None: + mask = None + needs_masking = False + fill_value = arr.dtype.type() + else: + indexer = _ensure_int64(indexer) + mask = indexer == -1 + needs_masking = mask.any() + if not needs_masking: + fill_value = arr.dtype.type() + return take_fast(arr, indexer, mask, needs_masking, axis, out, fill_value) - out_shape = list(arr.shape) - out_shape[axis] = len(indexer) - out_shape = tuple(out_shape) - if not isinstance(indexer, np.ndarray): - indexer = np.array(indexer, dtype=np.int64) +def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None): + """ + Specialized Cython take which sets NaN values in one pass + """ + if row_idx is None: + row_idx = np.arange(arr.shape[0], dtype=np.int64) + else: + row_idx = _ensure_int64(row_idx) - if dtype_str in _dtypes_no_na: - if mask is None: - mask = indexer == -1 - needs_masking = mask.any() + if col_idx is None: + col_idx = np.arange(arr.shape[1], dtype=np.int64) + else: + col_idx = _ensure_int64(col_idx) + dtype = _maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + row_mask = row_idx == -1 + col_mask = col_idx == -1 + needs_masking = row_mask.any() or col_mask.any() if needs_masking: - # upcasting may be required - result = ndtake(arr, indexer, axis=axis, out=out) - result = _maybe_mask(result, mask, needs_masking, axis=axis, - out_passed=out is not None, - fill_value=fill_value) - return result + if out is not None and out.dtype != dtype: + raise Exception('Incompatible type for fill_value') else: - if out is None: - out = np.empty(out_shape, dtype=arr.dtype) - take_f = _get_take2d_function(dtype_str, axis=axis) - take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) - return out - elif dtype_str in _dtypes_na: - if out is None: - out = np.empty(out_shape, dtype=arr.dtype) - take_f = _get_take2d_function(dtype_str, axis=axis) - take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) - return out - else: - if mask is None: - mask = indexer == -1 - needs_masking = mask.any() - - # GH #486 - if out is not None and arr.dtype != out.dtype: - arr = arr.astype(out.dtype) - - result = ndtake(arr, indexer, axis=axis, 
out=out) - result = _maybe_mask(result, mask, needs_masking, axis=axis, - out_passed=out is not None, - fill_value=fill_value) - return result + dtype, fill_value = arr.dtype, arr.dtype.type() + if out is None: + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) + take_f = _get_take_2d_function(arr.dtype, out.dtype, axis='multi') + take_f(arr, (row_idx, col_idx), out=out, fill_value=fill_value) + return out def ndtake(arr, indexer, axis=0, out=None): return arr.take(_ensure_platform_int(indexer), axis=axis, out=out) -def mask_out_axis(arr, mask, axis, fill_value=np.nan): - indexer = [slice(None)] * arr.ndim - indexer[axis] = mask - - arr[tuple(indexer)] = fill_value - _diff_special = { 'float64': algos.diff_2d_float64, 'float32': algos.diff_2d_float32, @@ -483,7 +587,7 @@ def diff(arr, n, axis=0): n = int(n) dtype = arr.dtype if issubclass(dtype.type, np.integer): - dtype = np.float64 + dtype = np.float_ elif issubclass(dtype.type, np.bool_): dtype = np.object_ @@ -512,43 +616,77 @@ def diff(arr, n, axis=0): def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, fill_value=np.nan): - if arr.ndim == 2: - return take_2d(arr, indexer, out=out, mask=mask, - needs_masking=needs_masking, - axis=axis, fill_value=fill_value) - indexer = _ensure_platform_int(indexer) - result = ndtake(arr, indexer, axis=axis, out=out) - result = _maybe_mask(result, mask, needs_masking, axis=axis, - out_passed=out is not None, fill_value=fill_value) - return result - + """ + Specialized Cython take which sets NaN values in one pass -def _maybe_mask(result, mask, needs_masking, axis=0, out_passed=False, - fill_value=np.nan): - if needs_masking: - if out_passed and _need_upcast(result): - raise Exception('incompatible type for NAs') + (equivalent to take_nd but requires mask and needs_masking + to be set appropriately already; slightly more efficient) + """ + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.int64) + dtype = arr.dtype + else: + indexer = _ensure_int64(indexer) + if needs_masking: + dtype = _maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype and out is not None and out.dtype != dtype: + raise Exception('Incompatible type for fill_value') else: - # a bit spaghettified - result = _maybe_upcast(result) - mask_out_axis(result, mask, axis, fill_value) - return result + dtype = arr.dtype + + if out is None: + out_shape = list(arr.shape) + out_shape[axis] = len(indexer) + out_shape = tuple(out_shape) + out = np.empty(out_shape, dtype=dtype) + take_f = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis) + take_f(arr, indexer, out=out, fill_value=fill_value) + return out + + +def _maybe_promote(dtype, fill_value=np.nan): + if issubclass(dtype.type, np.datetime64): + # for now: refuse to upcast + # (this is because datetime64 will not implicitly upconvert + # to object correctly as of numpy 1.6.1) + return dtype + elif is_float(fill_value): + if issubclass(dtype.type, np.bool_): + return np.object_ + elif issubclass(dtype.type, np.integer): + return np.float_ + return dtype + elif is_bool(fill_value): + if issubclass(dtype.type, np.bool_): + return dtype + return np.object_ + elif is_integer(fill_value): + if issubclass(dtype.type, np.bool_): + return np.object_ + elif issubclass(dtype.type, np.integer): + # upcast to prevent overflow + arr = np.asarray(fill_value) + if arr != arr.astype(dtype): + return arr.dtype + return dtype + return dtype + elif is_complex(fill_value): + if issubclass(dtype.type, np.bool_): + return 
np.object_ + elif issubclass(dtype.type, (np.integer, np.floating)): + return np.complex_ + return dtype + return np.object_ def _maybe_upcast(values): + # TODO: convert remaining usage of _maybe_upcast to _maybe_promote if issubclass(values.dtype.type, np.integer): - values = values.astype(float) + values = values.astype(np.float_) elif issubclass(values.dtype.type, np.bool_): - values = values.astype(object) - + values = values.astype(np.object_) return values - - -def _need_upcast(values): - if issubclass(values.dtype.type, (np.integer, np.bool_)): - return True - return False - + def _interp_wrapper(f, wrap_dtype, na_override=None): def wrapper(arr, mask, limit=None): @@ -556,6 +694,7 @@ def wrapper(arr, mask, limit=None): f(view, mask, limit=limit) return wrapper + _pad_1d_datetime = _interp_wrapper(algos.pad_inplace_int64, np.int64) _pad_2d_datetime = _interp_wrapper(algos.pad_2d_inplace_int64, np.int64) _backfill_1d_datetime = _interp_wrapper(algos.backfill_inplace_int64, @@ -728,8 +867,10 @@ def _infer_dtype(value): return np.float_ elif isinstance(value, (bool, np.bool_)): return np.bool_ - elif isinstance(value, (int, np.integer)): + elif isinstance(value, (int, long, np.integer)): return np.int_ + elif isinstance(value, (complex, np.complexfloating)): + return np.complex_ else: return np.object_ @@ -1028,6 +1169,10 @@ def _maybe_make_list(obj): return obj +def is_bool(obj): + return isinstance(obj, (bool, np.bool_)) + + def is_integer(obj): return isinstance(obj, (int, long, np.integer)) @@ -1036,13 +1181,17 @@ def is_float(obj): return isinstance(obj, (float, np.floating)) +def is_complex(obj): + return isinstance(obj, (complex, np.complexfloating)) + + def is_iterator(obj): # python 3 generators have __next__ instead of next return hasattr(obj, 'next') or hasattr(obj, '__next__') def is_number(obj): - return isinstance(obj, (np.number, int, long, float)) + return isinstance(obj, (np.number, int, long, float, complex)) def is_integer_dtype(arr_or_dtype): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6c96317a645f7..30d8313acf2fb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2107,10 +2107,6 @@ def __setitem__(self, key, value): def _boolean_set(self, key, value): if key.values.dtype != np.bool_: raise ValueError('Must pass DataFrame with boolean values only') - - if self._is_mixed_type: - raise ValueError('Cannot do boolean setting on mixed-type frame') - self.where(-key, value, inplace=True) def _set_item_multiple(self, keys, value): @@ -2928,7 +2924,7 @@ def take(self, indices, axis=0): new_columns = self.columns.take(indices) return self.reindex(columns=new_columns) else: - new_values = com.take_2d(self.values, + new_values = com.take_nd(self.values, com._ensure_int64(indices), axis=axis) if axis == 0: @@ -5229,16 +5225,19 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr Parameters ---------- - cond: boolean DataFrame or array - other: scalar or DataFrame - inplace: perform the operation in place on the data - try_cast: try to cast the result back to the input type (if possible), defaults to False - raise_on_error: should I raise on invalid data types (e.g. 
trying to where on strings), - defaults to True + cond : boolean DataFrame or array + other : scalar or DataFrame + inplace : boolean, default False + Whether to perform the operation in place on the data + try_cast : boolean, default False + try to cast the result back to the input type (if possible), + raise_on_error : boolean, default True + Whether to raise on invalid data types (e.g. trying to where on + strings) Returns ------- - wh: DataFrame + wh : DataFrame """ if not hasattr(cond, 'shape'): raise ValueError('where requires an ndarray like object for its ' @@ -5263,18 +5262,16 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr if isinstance(other, DataFrame): _, other = self.align(other, join='left', fill_value=NA) elif isinstance(other,np.ndarray): - - if other.shape[0] != len(self.index) or other.shape[1] != len(self.columns): - raise ValueError('other must be the same shape as self when an ndarray') - other = DataFrame(other,self.index,self.columns) + if other.shape != self.shape: + raise ValueError('other must be the same shape as self ' + 'when an ndarray') + other = DataFrame(other, self.index, self.columns) if inplace: - # we may have different type blocks come out of putmask, so reconstruct the block manager self._data = self._data.putmask(cond,other,inplace=True) else: - func = lambda values, others, conds: np.where(conds, values, others) new_data = self._data.where(func, other, cond, raise_on_error=raise_on_error, try_cast=try_cast) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 58d193a956491..ee024ce68b5b4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -125,15 +125,9 @@ def reindex_axis(self, indexer, mask, needs_masking, axis=0, """ Reindex using pre-computed indexer information """ - if self.values.size > 0: - new_values = com.take_fast(self.values, indexer, mask, - needs_masking, axis=axis, - fill_value=fill_value) - else: - shape = list(self.shape) - shape[axis] = len(indexer) - new_values = np.empty(shape) - new_values.fill(fill_value) + new_values = com.take_fast(self.values, indexer, + mask, needs_masking, axis=axis, + fill_value=fill_value) return make_block(new_values, self.items, self.ref_items) def reindex_items_from(self, new_ref_items, copy=True): @@ -155,12 +149,9 @@ def reindex_items_from(self, new_ref_items, copy=True): mask = indexer != -1 masked_idx = indexer[mask] - if self.values.ndim == 2: - new_values = com.take_2d(self.values, masked_idx, axis=0, - needs_masking=False) - else: - new_values = self.values.take(masked_idx, axis=0) - + new_values = com.take_fast(self.values, masked_idx, + mask=None, needs_masking=False, + axis=0) new_items = self.items.take(masked_idx) return make_block(new_values, new_items, new_ref_items) @@ -301,24 +292,23 @@ def putmask(self, mask, new, inplace=False): new_values = self.values if inplace else self.values.copy() # may need to align the new - if hasattr(new,'reindex_axis'): - axis = getattr(new,'_het_axis',0) + if hasattr(new, 'reindex_axis'): + axis = getattr(new, '_het_axis', 0) new = new.reindex_axis(self.items, axis=axis, copy=False).values.T # may need to align the mask - if hasattr(mask,'reindex_axis'): - axis = getattr(mask,'_het_axis',0) + if hasattr(mask, 'reindex_axis'): + axis = getattr(mask, '_het_axis', 0) mask = mask.reindex_axis(self.items, axis=axis, copy=False).values.T if self._can_hold_element(new): new = self._try_cast(new) np.putmask(new_values, mask, new) - # upcast me else: - # type of the new block - if 
isinstance(new,np.ndarray) and issubclass(new.dtype,np.number) or issubclass(type(new),float): + if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or + isinstance(new, float)): typ = float else: typ = object @@ -369,9 +359,8 @@ def interpolate(self, method='pad', axis=0, inplace=False, def take(self, indexer, axis=1, fill_value=np.nan): if axis < 1: raise AssertionError('axis must be at least 1, got %d' % axis) - new_values = com.take_fast(self.values, indexer, None, - None, axis=axis, - fill_value=fill_value) + new_values = com.take_fast(self.values, indexer, None, False, + axis=axis, fill_value=fill_value) return make_block(new_values, self.items, self.ref_items) def get_values(self, dtype): @@ -401,22 +390,21 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals Parameters ---------- - func : how to combine self,other + func : how to combine self, other other : a ndarray/object cond : the condition to respect, optional - raise_on_error : if True, raise when I can't perform the function, False by default (and just return - the data that we had coming in) + raise_on_error : if True, raise when I can't perform the function, + False by default (and just return the data that we had coming in) Returns ------- a new block, the result of the func """ - values = self.values # see if we can align other - if hasattr(other,'reindex_axis'): - axis = getattr(other,'_het_axis',0) + if hasattr(other, 'reindex_axis'): + axis = getattr(other, '_het_axis', 0) other = other.reindex_axis(self.items, axis=axis, copy=True).values # make sure that we can broadcast @@ -428,17 +416,20 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals # see if we can align cond if cond is not None: - if not hasattr(cond,'shape'): - raise ValueError("where must have a condition that is ndarray like") - if hasattr(cond,'reindex_axis'): - axis = getattr(cond,'_het_axis',0) - cond = cond.reindex_axis(self.items, axis=axis, copy=True).values + if not hasattr(cond, 'shape'): + raise ValueError('where must have a condition that is ndarray' + ' like') + if hasattr(cond, 'reindex_axis'): + axis = getattr(cond, '_het_axis', 0) + cond = cond.reindex_axis(self.items, axis=axis, + copy=True).values else: cond = cond.values # may need to undo transpose of values if hasattr(values, 'ndim'): - if values.ndim != cond.ndim or values.shape == cond.shape[::-1]: + if (values.ndim != cond.ndim or + values.shape == cond.shape[::-1]): values = values.T is_transposed = not is_transposed @@ -494,7 +485,7 @@ class FloatBlock(NumericBlock): def _can_hold_element(self, element): if isinstance(element, np.ndarray): - return issubclass(element.dtype.type, (np.floating,np.integer)) + return issubclass(element.dtype.type, (np.floating, np.integer)) return isinstance(element, (float, int)) def _try_cast(self, element): @@ -541,7 +532,8 @@ def _try_cast(self, element): def _try_cast_result(self, result): # this is quite restrictive to convert try: - if isinstance(result, np.ndarray) and issubclass(result.dtype.type, np.floating): + if (isinstance(result, np.ndarray) and + issubclass(result.dtype.type, np.floating)): if com.notnull(result).all(): new_result = result.astype(self.dtype) if (new_result == result).all(): @@ -958,7 +950,8 @@ def _get_clean_block_types(self, type_list): return type_list def get_bool_data(self, copy=False, as_blocks=False): - return self.get_numeric_data(copy=copy, type_list=(BoolBlock,), as_blocks=as_blocks) + return self.get_numeric_data(copy=copy, 
type_list=(BoolBlock,), + as_blocks=as_blocks) def get_slice(self, slobj, axis=0): new_axes = list(self.axes) @@ -1429,7 +1422,7 @@ def take(self, indexer, axis=1): if axis == 0: raise NotImplementedError - indexer = np.asarray(indexer, dtype='i4') + indexer = com._ensure_platform_int(indexer) n = len(self.axes[axis]) if ((indexer == -1) | (indexer >= n)).any(): @@ -1440,8 +1433,8 @@ def take(self, indexer, axis=1): new_axes[axis] = self.axes[axis].take(indexer) new_blocks = [] for blk in self.blocks: - new_values = com.take_fast(blk.values, indexer, - None, False, axis=axis) + new_values = com.take_fast(blk.values, indexer, None, False, + axis=axis) newb = make_block(new_values, blk.items, self.items) new_blocks.append(newb) @@ -1765,8 +1758,6 @@ def _consolidate(blocks, items): return new_blocks -# TODO: this could be much optimized - def _merge_blocks(blocks, items): if len(blocks) == 1: return blocks[0] diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 5ed3af4c34ee7..362215703e1f2 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -93,7 +93,7 @@ def _make_sorted_values_labels(self): indexer = algos.groupsort_indexer(comp_index, ngroups)[0] indexer = _ensure_platform_int(indexer) - self.sorted_values = com.take_2d(self.values, indexer, axis=0) + self.sorted_values = com.take_nd(self.values, indexer, axis=0) self.sorted_labels = [l.take(indexer) for l in to_sort] def _make_selectors(self): @@ -136,7 +136,7 @@ def get_result(self): # rare case, level values not observed if len(obs_ids) < self.full_shape[1]: inds = (value_mask.sum(0) > 0).nonzero()[0] - values = com.take_2d(values, inds, axis=1) + values = com.take_nd(values, inds, axis=1) columns = columns[inds] return DataFrame(values, index=index, columns=columns) diff --git a/pandas/core/series.py b/pandas/core/series.py index c3ae78b1b5e1f..bc54a1b7be0e8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -794,7 +794,9 @@ def convert_objects(self, convert_dates=True, convert_numeric=True): converted : Series """ if self.dtype == np.object_: - return Series(com._possibly_convert_objects(self.values,convert_dates=convert_dates,convert_numeric=convert_numeric), index=self.index, name=self.name) + return Series(com._possibly_convert_objects(self.values, + convert_dates=convert_dates, convert_numeric=convert_numeric), + index=self.index, name=self.name) return self.copy() def repeat(self, reps): diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 9cc749d23a3a9..c68154b27f7d1 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -52,177 +52,148 @@ take_1d_template = """@cython.wraparound(False) -def take_1d_%(name)s(ndarray[%(c_type)s] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_%(name)s_%(dest)s(ndarray[%(c_type_in)s] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx - ndarray[%(c_type)s] outbuf - %(c_type)s fv + ndarray[%(c_type_out)s] outbuf = out + %(c_type_out)s fv n = len(indexer) - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out - if %(raise_on_na)s and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: raise ValueError('No NA values allowed') else: - outbuf[i] = values[idx] + outbuf[i] = %(preval)svalues[idx]%(postval)s else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: outbuf[i] = fv else: - outbuf[i] = values[idx] + outbuf[i] = 
%(preval)svalues[idx]%(postval)s """ take_2d_axis0_template = """@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_%(name)s(ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[%(c_type)s, ndim=2] outbuf - %(c_type)s fv + ndarray[%(c_type_out)s, ndim=2] outbuf = out + %(c_type_out)s fv n = len(indexer) k = values.shape[1] - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - if %(raise_on_na)s and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: raise ValueError('No NA values allowed') else: for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + outbuf[i, j] = %(preval)svalues[idx, j]%(postval)s else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): - outbuf[i, j] = values[idx, j] + for j from 0 <= j < k: + outbuf[i, j] = %(preval)svalues[idx, j]%(postval)s """ take_2d_axis1_template = """@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_%(name)s(ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis1_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[%(c_type)s, ndim=2] outbuf - %(c_type)s fv + ndarray[%(c_type_out)s, ndim=2] outbuf = out + %(c_type_out)s fv n = len(values) k = len(indexer) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - if %(raise_on_na)s and _checknan(fill_value): - for j in range(k): + for j from 0 <= j < k: idx = indexer[j] - if idx == -1: - for i in range(n): + for i from 0 <= i < n: raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for i from 0 <= i < n: + outbuf[i, j] = %(preval)svalues[i, idx]%(postval)s else: fv = fill_value - for j in range(k): + for j from 0 <= j < k: idx = indexer[j] - if idx == -1: - for i in range(n): + for i from 0 <= i < n: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for i from 0 <= i < n: + outbuf[i, j] = %(preval)svalues[i, idx]%(postval)s """ take_2d_multi_template = """@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_multi_%(name)s(ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_multi_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, + indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[%(c_type)s, ndim=2] outbuf - %(c_type)s fv + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[%(c_type_out)s, ndim=2] outbuf = out + %(c_type_out)s fv n = len(idx0) k = len(idx1) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - if %(raise_on_na)s and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = idx0[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for j in range(k): + for j from 0 <= j < k: if idx1[j] == -1: raise ValueError('No NA values allowed') else: - outbuf[i, j] 
= values[idx, idx1[j]] + outbuf[i, j] = %(preval)svalues[idx, idx1[j]]%(postval)s else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = idx0[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): + for j from 0 <= j < k: if idx1[j] == -1: outbuf[i, j] = fv else: - outbuf[i, j] = values[idx, idx1[j]] + outbuf[i, j] = %(preval)svalues[idx, idx1[j]]%(postval)s """ -def set_na(na ="NaN"): - return "outbuf[i] = %s" % na - -def set_na_2d(na = "NaN"): - return "outbuf[i, j] = %s" % na - -raise_on_na = "raise ValueError('No NA values allowed')" ''' Backfilling logic for generating fill vector @@ -2184,20 +2155,55 @@ def generate_put_template(template, use_ints = True, use_floats = True): output.write(func) return output.getvalue() +def generate_take_template(template, exclude=None): + # name, dest, ctypein, ctypeout, preval, postval, capable of holding NA + function_list = [ + ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', False), + ('bool', 'object', 'uint8_t', 'object', + 'True if ', ' > 0 else False', True), + ('int8', 'int8', 'int8_t', 'int8_t', '', '', False), + ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), + ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), + ('int8', 'float64', 'int8_t', 'float64_t', '', '', True), + ('int16', 'int16', 'int16_t', 'int16_t', '', '', False), + ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), + ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), + ('int16', 'float64', 'int16_t', 'float64_t', '', '', True), + ('int32', 'int32', 'int32_t', 'int32_t', '', '', False), + ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), + ('int32', 'float64', 'int32_t', 'float64_t', '', '', True), + ('int64', 'int64', 'int64_t', 'int64_t', '', '', False), + ('int64', 'float64', 'int64_t', 'float64_t', '', '', True), + ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), + ('float32', 'float64', 'float32_t', 'float64_t', '', '', True), + ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), + ('object', 'object', 'object', 'object', '', '', True) + ] -# name, ctype, capable of holding NA -function_list = [ - ('float64', 'float64_t', 'np.float64', True), - ('float32', 'float32_t', 'np.float32', True), - ('object','object', 'object', True), - ('int8', 'int8_t', 'np.int8', False), - ('int16', 'int16_t', 'np.int16', False), - ('int32', 'int32_t', 'np.int32', False), - ('int64', 'int64_t', 'np.int64', False), - ('bool', 'uint8_t', 'np.bool', False) -] + output = StringIO() + for (name, dest, c_type_in, c_type_out, + preval, postval, can_hold_na) in function_list: + if exclude is not None and name in exclude: + continue + + func = template % {'name': name, 'dest': dest, + 'c_type_in': c_type_in, 'c_type_out': c_type_out, + 'preval': preval, 'postval': postval, + 'raise_on_na': 'False' if can_hold_na else 'True'} + output.write(func) + return output.getvalue() + +def generate_from_template(template, exclude=None): + # name, ctype, capable of holding NA + function_list = [ + ('float64', 'float64_t', 'np.float64', True), + ('float32', 'float32_t', 'np.float32', True), + ('object', 'object', 'object', True), + ('int32', 'int32_t', 'np.int32', False), + ('int64', 'int64_t', 'np.int64', False), + ('bool', 'uint8_t', 'np.bool', False) + ] -def generate_from_template(template, ndim=1, exclude=None): output = StringIO() for name, c_type, dtype, can_hold_na in function_list: if exclude is not None and name in exclude: @@ -2235,7 +2241,6 @@ def 
generate_from_template(template, ndim=1, exclude=None): backfill_1d_template, pad_2d_template, backfill_2d_template, - take_1d_template, is_monotonic_template, groupby_template, arrmap_template] @@ -2245,9 +2250,10 @@ def generate_from_template(template, ndim=1, exclude=None): outer_join_template2, inner_join_template] -templates_2d = [take_2d_axis0_template, - take_2d_axis1_template, - take_2d_multi_template] +take_templates = [take_1d_template, + take_2d_axis0_template, + take_2d_axis1_template, + take_2d_multi_template] def generate_take_cython_file(path='generated.pyx'): with open(path, 'w') as f: @@ -2258,8 +2264,8 @@ def generate_take_cython_file(path='generated.pyx'): for template in templates_1d: print >> f, generate_from_template(template) - for template in templates_2d: - print >> f, generate_from_template(template, ndim=2) + for template in take_templates: + print >> f, generate_take_template(template) for template in put_2d: print >> f, generate_put_template(template) diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index a20fb5668aec9..1723f2fb8b34c 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -183,50 +183,6 @@ cpdef map_indices_object(ndarray[object] index): return result -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int8(ndarray[int8_t] index): - ''' - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - ''' - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int16(ndarray[int16_t] index): - ''' - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- ''' - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - @cython.wraparound(False) @cython.boundscheck(False) cpdef map_indices_int32(ndarray[int32_t] index): @@ -477,128 +433,6 @@ def pad_object(ndarray[object] old, ndarray[object] new, return indexer -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_int8(ndarray[int8_t] old, ndarray[int8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int8_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_int16(ndarray[int16_t] old, ndarray[int16_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int16_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - @cython.boundscheck(False) @cython.wraparound(False) def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, @@ -971,11 +805,11 @@ def backfill_object(ndarray[object] old, ndarray[object] new, @cython.boundscheck(False) @cython.wraparound(False) -def backfill_int8(ndarray[int8_t] old, ndarray[int8_t] new, +def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef int8_t cur, prev + cdef int32_t cur, prev cdef int lim, fill_count = 0 nleft = len(old) @@ -1033,11 +867,11 @@ def backfill_int8(ndarray[int8_t] old, ndarray[int8_t] new, @cython.boundscheck(False) @cython.wraparound(False) -def backfill_int16(ndarray[int16_t] old, ndarray[int16_t] new, +def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef int16_t cur, prev + cdef int64_t cur, prev cdef int lim, fill_count = 0 nleft = 
len(old) @@ -1095,11 +929,11 @@ def backfill_int16(ndarray[int16_t] old, ndarray[int16_t] new, @cython.boundscheck(False) @cython.wraparound(False) -def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, +def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, limit=None): cdef Py_ssize_t i, j, nleft, nright cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, prev + cdef uint8_t cur, prev cdef int lim, fill_count = 0 nleft = len(old) @@ -1155,139 +989,81 @@ def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, return indexer + @cython.boundscheck(False) @cython.wraparound(False) -def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, prev +def pad_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val cdef int lim, fill_count = 0 - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) + N = len(values) + + # GH 2778 + if N == 0: + return if limit is None: - lim = nright + lim = N else: if limit < 0: raise ValueError('Limit must be non-negative') lim = limit - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] - i = nleft - 1 - j = nright - 1 +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float32_t val + cdef int lim, fill_count = 0 - cur = old[nleft - 1] + N = len(values) - while j >= 0 and new[j] > cur: - j -= 1 + # GH 2778 + if N == 0: + return - while True: - if j < 0: - break + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if 
new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 N = len(values) @@ -1315,11 +1091,11 @@ def pad_inplace_float64(ndarray[float64_t] values, @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_float32(ndarray[float32_t] values, +def pad_inplace_int32(ndarray[int32_t] values, ndarray[uint8_t, cast=True] mask, limit=None): cdef Py_ssize_t i, N - cdef float32_t val + cdef int32_t val cdef int lim, fill_count = 0 N = len(values) @@ -1348,11 +1124,11 @@ def pad_inplace_float32(ndarray[float32_t] values, @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_object(ndarray[object] values, +def pad_inplace_int64(ndarray[int64_t] values, ndarray[uint8_t, cast=True] mask, limit=None): cdef Py_ssize_t i, N - cdef object val + cdef int64_t val cdef int lim, fill_count = 0 N = len(values) @@ -1381,11 +1157,11 @@ def pad_inplace_object(ndarray[object] values, @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_int8(ndarray[int8_t] values, +def pad_inplace_bool(ndarray[uint8_t] values, ndarray[uint8_t, cast=True] mask, limit=None): cdef Py_ssize_t i, N - cdef int8_t val + cdef uint8_t val cdef int lim, fill_count = 0 N = len(values) @@ -1412,46 +1188,14 @@ def pad_inplace_int8(ndarray[int8_t] values, fill_count = 0 val = values[i] -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int16(ndarray[int16_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int16_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def backfill_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): cdef Py_ssize_t i, N - cdef int32_t val + cdef float64_t val cdef int lim, fill_count = 0 N = len(values) @@ -1467,8 +1211,8 @@ def pad_inplace_int32(ndarray[int32_t] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[0] - for i in range(N): + val = values[N - 1] + for i in range(N - 1, -1 , -1): if mask[i]: if fill_count >= lim: continue @@ -1477,14 +1221,13 @@ def pad_inplace_int32(ndarray[int32_t] values, else: fill_count = 0 val = values[i] - @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def backfill_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): cdef Py_ssize_t i, N - cdef int64_t val + cdef float32_t val cdef int lim, fill_count = 0 N = 
len(values) @@ -1500,8 +1243,8 @@ def pad_inplace_int64(ndarray[int64_t] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[0] - for i in range(N): + val = values[N - 1] + for i in range(N - 1, -1 , -1): if mask[i]: if fill_count >= lim: continue @@ -1510,14 +1253,13 @@ def pad_inplace_int64(ndarray[int64_t] values, else: fill_count = 0 val = values[i] - @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): +def backfill_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): cdef Py_ssize_t i, N - cdef uint8_t val + cdef object val cdef int lim, fill_count = 0 N = len(values) @@ -1533,8 +1275,8 @@ def pad_inplace_bool(ndarray[uint8_t] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[0] - for i in range(N): + val = values[N - 1] + for i in range(N - 1, -1 , -1): if mask[i]: if fill_count >= lim: continue @@ -1543,15 +1285,13 @@ def pad_inplace_bool(ndarray[uint8_t] values, else: fill_count = 0 val = values[i] - - @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_float64(ndarray[float64_t] values, +def backfill_inplace_int32(ndarray[int32_t] values, ndarray[uint8_t, cast=True] mask, limit=None): cdef Py_ssize_t i, N - cdef float64_t val + cdef int32_t val cdef int lim, fill_count = 0 N = len(values) @@ -1579,11 +1319,11 @@ def backfill_inplace_float64(ndarray[float64_t] values, val = values[i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_float32(ndarray[float32_t] values, +def backfill_inplace_int64(ndarray[int64_t] values, ndarray[uint8_t, cast=True] mask, limit=None): cdef Py_ssize_t i, N - cdef float32_t val + cdef int64_t val cdef int lim, fill_count = 0 N = len(values) @@ -1611,11 +1351,11 @@ def backfill_inplace_float32(ndarray[float32_t] values, val = values[i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_object(ndarray[object] values, +def backfill_inplace_bool(ndarray[uint8_t] values, ndarray[uint8_t, cast=True] mask, limit=None): cdef Py_ssize_t i, N - cdef object val + cdef uint8_t val cdef int lim, fill_count = 0 N = len(values) @@ -1641,16 +1381,17 @@ def backfill_inplace_object(ndarray[object] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_int8(ndarray[int8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int8_t val +def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val cdef int lim, fill_count = 0 - N = len(values) + K, N = ( values).shape # GH 2778 if N == 0: @@ -1663,26 +1404,28 @@ def backfill_inplace_int8(ndarray[int8_t] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_int16(ndarray[int16_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int16_t val 
+def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val cdef int lim, fill_count = 0 - N = len(values) + K, N = ( values).shape # GH 2778 if N == 0: @@ -1695,259 +1438,28 @@ def backfill_inplace_int16(ndarray[int16_t] values, raise ValueError('Limit must be non-negative') lim = limit - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val +def pad_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val cdef int lim, fill_count = 0 - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, 
j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int8(ndarray[int8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int16(ndarray[int16_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int16_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape + K, N = ( values).shape # GH 2778 if N == 0: @@ -2179,11 +1691,11 @@ def backfill_2d_inplace_object(ndarray[object, ndim=2] values, val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_int8(ndarray[int8_t, ndim=2] values, +def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, ndarray[uint8_t, ndim=2] mask, limit=None): cdef Py_ssize_t i, j, N, K - cdef int8_t val + cdef int32_t val cdef int lim, fill_count = 0 K, N = ( values).shape @@ -2213,11 +1725,11 @@ def backfill_2d_inplace_int8(ndarray[int8_t, ndim=2] values, val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_int16(ndarray[int16_t, ndim=2] values, +def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, ndarray[uint8_t, ndim=2] mask, limit=None): cdef Py_ssize_t i, j, N, K - cdef int16_t val + cdef int64_t val cdef int lim, fill_count = 0 K, N = ( values).shape @@ -2247,11 +1759,11 @@ def backfill_2d_inplace_int16(ndarray[int16_t, ndim=2] values, val = values[j, i] @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, +def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, ndarray[uint8_t, ndim=2] mask, limit=None): cdef Py_ssize_t i, j, N, K - cdef int32_t val + cdef uint8_t val cdef int lim, fill_count = 0 K, N = ( values).shape @@ -2279,221 +1791,425 @@ def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) 
@cython.wraparound(False) -def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 +def is_monotonic_float64(ndarray[float64_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + float64_t prev, cur + bint is_unique = 1 - K, N = ( values).shape + n = len(arr) - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit + if n < 2: + return True, True - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique @cython.boundscheck(False) @cython.wraparound(False) -def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return +def is_monotonic_float32(ndarray[float32_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + float32_t prev, cur + bint is_unique = 1 - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit + n = len(arr) - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] + if n < 2: + return True, True + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_float64(ndarray[float64_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def is_monotonic_object(ndarray[object] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' cdef: - Py_ssize_t i, n, idx - ndarray[float64_t] outbuf - float64_t fv - - n = len(indexer) + Py_ssize_t i, n + object prev, cur + bint is_unique = 1 - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + n = len(arr) - if False and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] - else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] + if n < 2: + return True, True + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_float32(ndarray[float32_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def is_monotonic_int32(ndarray[int32_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' cdef: - Py_ssize_t i, n, idx - ndarray[float32_t] outbuf - float32_t fv - - n = len(indexer) + Py_ssize_t i, n + int32_t prev, cur + bint is_unique = 1 - if out is None: - outbuf = np.empty(n, 
dtype=values.dtype) - else: - outbuf = out + n = len(arr) - if False and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] - else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] + if n < 2: + return True, True + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_object(ndarray[object] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def is_monotonic_int64(ndarray[int64_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' cdef: - Py_ssize_t i, n, idx - ndarray[object] outbuf - object fv - - n = len(indexer) + Py_ssize_t i, n + int64_t prev, cur + bint is_unique = 1 - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + n = len(arr) - if False and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] - else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] + if n < 2: + return True, True + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) @cython.wraparound(False) -def take_1d_int8(ndarray[int8_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def is_monotonic_bool(ndarray[uint8_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' cdef: - Py_ssize_t i, n, idx - ndarray[int8_t] outbuf - int8_t fv + Py_ssize_t i, n + uint8_t prev, cur + bint is_unique = 1 - n = len(indexer) + n = len(arr) - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + if n < 2: + return True, True - if True and _checknan(fill_value): - for i in range(n): - idx = indexer[i] - if idx == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i] = values[idx] - else: - fv = fill_value - for i in range(n): - idx = indexer[i] - if idx == -1: - outbuf[i] = fv - else: - outbuf[i] = values[idx] + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique @cython.wraparound(False) -def take_1d_int16(ndarray[int16_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - ndarray[int16_t] outbuf - int16_t fv - - n = len(indexer) +@cython.boundscheck(False) +def groupby_float64(ndarray[float64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_float32(ndarray[float32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = 
len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_object(ndarray[object] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int32(ndarray[int32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int64(ndarray[int64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_bool(ndarray[uint8_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float64(ndarray[float64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float32(ndarray[float32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_object(ndarray[object] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int32(ndarray[int32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, 
dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int64(ndarray[int64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_bool(ndarray[uint8_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +def take_1d_bool_bool(ndarray[uint8_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[uint8_t] outbuf = out + uint8_t fv + + n = len(indexer) if True and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: raise ValueError('No NA values allowed') @@ -2501,7 +2217,7 @@ def take_1d_int16(ndarray[int16_t] values, outbuf[i] = values[idx] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: outbuf[i] = fv @@ -2509,55 +2225,45 @@ def take_1d_int16(ndarray[int16_t] values, outbuf[i] = values[idx] @cython.wraparound(False) -def take_1d_int32(ndarray[int32_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_bool_object(ndarray[uint8_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx - ndarray[int32_t] outbuf - int32_t fv + ndarray[object] outbuf = out + object fv n = len(indexer) - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out - - if True and _checknan(fill_value): - for i in range(n): + if False and _checknan(fill_value): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: raise ValueError('No NA values allowed') else: - outbuf[i] = values[idx] + outbuf[i] = True if values[idx] > 0 else False else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: outbuf[i] = fv else: - outbuf[i] = values[idx] + outbuf[i] = True if values[idx] > 0 else False @cython.wraparound(False) -def take_1d_int64(ndarray[int64_t] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_1d_int8_int8(ndarray[int8_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx - ndarray[int64_t] outbuf - int64_t fv + ndarray[int8_t] outbuf = out + int8_t fv n = len(indexer) - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out - if True and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: raise ValueError('No NA values allowed') @@ -2565,7 +2271,7 @@ def take_1d_int64(ndarray[int64_t] values, outbuf[i] = values[idx] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: outbuf[i] = fv @@ -2573,23 +2279,18 @@ def take_1d_int64(ndarray[int64_t] values, outbuf[i] = values[idx] @cython.wraparound(False) -def take_1d_bool(ndarray[uint8_t] values, - ndarray[int64_t] indexer, - out=None, 
fill_value=np.nan): +def take_1d_int8_int32(ndarray[int8_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx - ndarray[uint8_t] outbuf - uint8_t fv + ndarray[int32_t] outbuf = out + int32_t fv n = len(indexer) - if out is None: - outbuf = np.empty(n, dtype=values.dtype) - else: - outbuf = out - if True and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: raise ValueError('No NA values allowed') @@ -2597,573 +2298,434 @@ def take_1d_bool(ndarray[uint8_t] values, outbuf[i] = values[idx] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: outbuf[i] = fv else: outbuf[i] = values[idx] - -@cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_float64(ndarray[float64_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' +def take_1d_int8_int64(ndarray[int8_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, n - float64_t prev, cur - bint is_unique = 1 + Py_ssize_t i, n, idx + ndarray[int64_t] outbuf = out + int64_t fv - n = len(arr) + n = len(indexer) - if n < 2: - return True, True + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique -@cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_float32(ndarray[float32_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' +def take_1d_int8_float64(ndarray[int8_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, n - float32_t prev, cur - bint is_unique = 1 + Py_ssize_t i, n, idx + ndarray[float64_t] outbuf = out + float64_t fv - n = len(arr) + n = len(indexer) - if n < 2: - return True, True + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique -@cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_object(ndarray[object] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' +def take_1d_int16_int16(ndarray[int16_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, n - object prev, cur - bint is_unique = 1 + Py_ssize_t i, n, idx + ndarray[int16_t] outbuf = out + int16_t fv - n = len(arr) + n = len(indexer) - if n < 2: - return True, True + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, 
is_unique -@cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_int8(ndarray[int8_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' +def take_1d_int16_int32(ndarray[int16_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, n - int8_t prev, cur - bint is_unique = 1 + Py_ssize_t i, n, idx + ndarray[int32_t] outbuf = out + int32_t fv - n = len(arr) + n = len(indexer) - if n < 2: - return True, True + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique -@cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_int16(ndarray[int16_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' +def take_1d_int16_int64(ndarray[int16_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, n - int16_t prev, cur - bint is_unique = 1 + Py_ssize_t i, n, idx + ndarray[int64_t] outbuf = out + int64_t fv - n = len(arr) + n = len(indexer) - if n < 2: - return True, True + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique -@cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_int32(ndarray[int32_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' +def take_1d_int16_float64(ndarray[int16_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, n - int32_t prev, cur - bint is_unique = 1 + Py_ssize_t i, n, idx + ndarray[float64_t] outbuf = out + float64_t fv - n = len(arr) + n = len(indexer) - if n < 2: - return True, True + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique -@cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_int64(ndarray[int64_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' +def take_1d_int32_int32(ndarray[int32_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, n - int64_t prev, cur - bint is_unique = 1 + Py_ssize_t i, n, idx + ndarray[int32_t] outbuf = out + int32_t fv - n = len(arr) + n = len(indexer) - if n < 2: - return True, True + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx 
== -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique -@cython.boundscheck(False) @cython.wraparound(False) -def is_monotonic_bool(ndarray[uint8_t] arr): - ''' - Returns - ------- - is_monotonic, is_unique - ''' +def take_1d_int32_int64(ndarray[int32_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, n - uint8_t prev, cur - bint is_unique = 1 - - n = len(arr) + Py_ssize_t i, n, idx + ndarray[int64_t] outbuf = out + int64_t fv - if n < 2: - return True, True + n = len(indexer) - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if cur < prev: - return False, None - elif cur == prev: - is_unique = 0 - prev = cur - return True, is_unique + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def groupby_float64(ndarray[float64_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key +def take_1d_int32_float64(ndarray[int32_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[float64_t] outbuf = out + float64_t fv - length = len(index) + n = len(indexer) - for i in range(length): - key = util.get_value_1d(labels, i) + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - if _checknull(key): - continue +@cython.wraparound(False) +def take_1d_int64_int64(ndarray[int64_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[int64_t] outbuf = out + int64_t fv - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] + n = len(indexer) - return result + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def groupby_float32(ndarray[float32_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key +def take_1d_int64_float64(ndarray[int64_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[float64_t] outbuf = out + float64_t fv - length = len(index) + n = len(indexer) - for i in range(length): - key = util.get_value_1d(labels, i) + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - if _checknull(key): - continue 
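A brief aside on the take_1d_<srctype>_<desttype> naming introduced by this patch: the int64 indexer uses -1 to mark missing positions, `out` is now a required argument supplied by the caller, and the destination dtype may be wider than the source (e.g. int32 values taken into a float64 output) so that a NaN fill value is representable; the integer-to-integer variants instead raise 'No NA values allowed' when handed a NaN fill. A minimal NumPy sketch of the fill semantics (not part of the patch; names are illustrative):

import numpy as np

def take_1d_sketch(values, indexer, out, fill_value=np.nan):
    # Indexer entries of -1 receive fill_value; everything else is a plain take.
    for i, idx in enumerate(indexer):
        out[i] = fill_value if idx == -1 else values[idx]

values = np.array([10, 20, 30], dtype=np.int32)
indexer = np.array([2, -1, 0], dtype=np.int64)
out = np.empty(3, dtype=np.float64)   # wider dtype so NaN can serve as the fill
take_1d_sketch(values, indexer, out)
# out is now [30.0, nan, 10.0]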
+@cython.wraparound(False) +def take_1d_float32_float32(ndarray[float32_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[float32_t] outbuf = out + float32_t fv - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] + n = len(indexer) - return result + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def groupby_object(ndarray[object] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key +def take_1d_float32_float64(ndarray[float32_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[float64_t] outbuf = out + float64_t fv - length = len(index) + n = len(indexer) - for i in range(length): - key = util.get_value_1d(labels, i) + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - if _checknull(key): - continue +@cython.wraparound(False) +def take_1d_float64_float64(ndarray[float64_t] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[float64_t] outbuf = out + float64_t fv - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] + n = len(indexer) - return result + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] @cython.wraparound(False) -@cython.boundscheck(False) -def groupby_int8(ndarray[int8_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) +def take_1d_object_object(ndarray[object] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[object] outbuf = out + object fv - if _checknull(key): - continue + n = len(indexer) - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] - return result @cython.wraparound(False) @cython.boundscheck(False) -def groupby_int16(ndarray[int16_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) +def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] 
indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf = out + uint8_t fv - if _checknull(key): - continue + n = len(indexer) + k = values.shape[1] - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_int32(ndarray[int32_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) - - if _checknull(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_int64(ndarray[int64_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) - - if _checknull(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_bool(ndarray[uint8_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - for i in range(length): - key = util.get_value_1d(labels, i) - - if _checknull(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_float64(ndarray[float64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_float32(ndarray[float32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_object(ndarray[object] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int8(ndarray[int8_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int16(ndarray[int16_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, 
dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int32(ndarray[int32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int64(ndarray[int64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_bool(ndarray[uint8_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf - float64_t fv - - n = len(indexer) - k = values.shape[1] - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - if False and _checknan(fill_value): - for i in range(n): + if True and _checknan(fill_value): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: @@ -3173,73 +2735,63 @@ def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[float32_t, ndim=2] outbuf - float32_t fv + ndarray[object, ndim=2] outbuf = out + object fv n = len(indexer) k = values.shape[1] - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - if False and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: raise ValueError('No NA values allowed') else: for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + outbuf[i, j] = True if values[idx, j] > 0 else False else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): - outbuf[i, j] = values[idx, j] + for j from 0 <= j < k: + outbuf[i, j] = True if values[idx, j] > 0 else False @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_object(ndarray[object, ndim=2] values, - 
ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - object fv + ndarray[int8_t, ndim=2] outbuf = out + int8_t fv n = len(indexer) k = values.shape[1] - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - if False and _checknan(fill_value): - for i in range(n): + if True and _checknan(fill_value): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: @@ -3249,35 +2801,30 @@ def take_2d_axis0_object(ndarray[object, ndim=2] values, outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int8_t, ndim=2] outbuf - int8_t fv + ndarray[int32_t, ndim=2] outbuf = out + int32_t fv n = len(indexer) k = values.shape[1] - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - if True and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: @@ -3287,35 +2834,30 @@ def take_2d_axis0_int8(ndarray[int8_t, ndim=2] values, outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int16_t, ndim=2] outbuf - int16_t fv + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv n = len(indexer) k = values.shape[1] - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - if True and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: @@ -3325,35 +2867,30 @@ def take_2d_axis0_int16(ndarray[int16_t, ndim=2] values, outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - int32_t fv + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv n = len(indexer) k = values.shape[1] - if out is None: - outbuf = np.empty((n, k), 
dtype=values.dtype) - else: - outbuf = out - - if True and _checknan(fill_value): - for i in range(n): + if False and _checknan(fill_value): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: @@ -3363,35 +2900,30 @@ def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - int64_t fv + ndarray[int16_t, ndim=2] outbuf = out + int16_t fv n = len(indexer) k = values.shape[1] - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - if True and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: @@ -3401,35 +2933,30 @@ def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - uint8_t fv + ndarray[int32_t, ndim=2] outbuf = out + int32_t fv n = len(indexer) k = values.shape[1] - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - if True and _checknan(fill_value): - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: for j from 0 <= j < k: @@ -3439,1775 +2966,2416 @@ def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): + for i from 0 <= i < n: idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf - float64_t fv - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv - if False and _checknan(fill_value): - for j in range(k): - idx = indexer[j] + n = len(indexer) + k = values.shape[1] + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for i in range(n): - 
outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[float32_t, ndim=2] outbuf - float32_t fv - - n = len(values) - k = len(indexer) + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(indexer) + k = values.shape[1] if False and _checknan(fill_value): - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - object fv - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + ndarray[int32_t, ndim=2] outbuf = out + int32_t fv - if False and _checknan(fill_value): - for j in range(k): - idx = indexer[j] + n = len(indexer) + k = values.shape[1] + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int8_t, ndim=2] outbuf - int8_t fv - - n = len(values) - k = len(indexer) + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(indexer) + k = values.shape[1] if True and _checknan(fill_value): - for j in range(k): - 
idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int16_t, ndim=2] outbuf - int16_t fv - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - if True and _checknan(fill_value): - for j in range(k): - idx = indexer[j] + n = len(indexer) + k = values.shape[1] + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - int32_t fv - - n = len(values) - k = len(indexer) + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(indexer) + k = values.shape[1] if True and _checknan(fill_value): - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - int64_t fv - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), 
dtype=values.dtype) - else: - outbuf = out + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - if True and _checknan(fill_value): - for j in range(k): - idx = indexer[j] + n = len(indexer) + k = values.shape[1] + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - out=None, fill_value=np.nan): +def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - uint8_t fv - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + ndarray[float32_t, ndim=2] outbuf = out + float32_t fv - if True and _checknan(fill_value): - for j in range(k): - idx = indexer[j] + n = len(indexer) + k = values.shape[1] + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for i in range(n): - outbuf[i, j] = values[i, idx] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for j in range(k): - idx = indexer[j] - + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for i in range(n): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for i in range(n): - outbuf[i, j] = values[i, idx] - + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_multi_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf + ndarray[float64_t, ndim=2] outbuf = out float64_t fv - n = len(idx0) - k = len(idx1) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - + n = len(indexer) + k = values.shape[1] if False and _checknan(fill_value): - for i in range(n): - idx = idx0[i] + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i, j] = values[idx, idx1[j]] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): - idx = idx0[i] + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def 
take_2d_multi_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[float32_t, ndim=2] outbuf - float32_t fv - - n = len(idx0) - k = len(idx1) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv + n = len(indexer) + k = values.shape[1] if False and _checknan(fill_value): - for i in range(n): - idx = idx0[i] + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i, j] = values[idx, idx1[j]] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): - idx = idx0[i] + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_multi_object(ndarray[object, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_axis0_object_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf + ndarray[object, ndim=2] outbuf = out object fv - n = len(idx0) - k = len(idx1) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - + n = len(indexer) + k = values.shape[1] if False and _checknan(fill_value): - for i in range(n): - idx = idx0[i] + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: raise ValueError('No NA values allowed') else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i, j] = values[idx, idx1[j]] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] else: fv = fill_value - for i in range(n): - idx = idx0[i] + for i from 0 <= i < n: + idx = indexer[i] if idx == -1: - for j in range(k): + for j from 0 <= j < k: outbuf[i, j] = fv else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_multi_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int8_t, ndim=2] outbuf - int8_t fv - - n = len(idx0) - k = len(idx1) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + ndarray[uint8_t, ndim=2] outbuf = out + uint8_t fv + n = len(values) + k = len(indexer) if True and _checknan(fill_value): - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: raise ValueError('No NA values allowed') else: - for j in range(k): - if idx1[j] == 
-1: - raise ValueError('No NA values allowed') - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: fv = fill_value - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: outbuf[i, j] = fv else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_multi_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int16_t, ndim=2] outbuf - int16_t fv + ndarray[object, ndim=2] outbuf = out + object fv - n = len(idx0) - k = len(idx1) + n = len(values) + k = len(indexer) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = True if values[i, idx] > 0 else False else: - outbuf = out + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv + else: + for i from 0 <= i < n: + outbuf[i, j] = True if values[i, idx] > 0 else False + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int8_t, ndim=2] outbuf = out + int8_t fv + n = len(values) + k = len(indexer) if True and _checknan(fill_value): - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: raise ValueError('No NA values allowed') else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: fv = fill_value - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: outbuf[i, j] = fv else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_multi_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf + ndarray[int32_t, ndim=2] outbuf = out int32_t fv - n = len(idx0) - k = len(idx1) + n = len(values) + k = len(indexer) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + if True and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - outbuf = out + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, 
j] = fv + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv + n = len(values) + k = len(indexer) if True and _checknan(fill_value): - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: raise ValueError('No NA values allowed') else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: fv = fill_value - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: outbuf[i, j] = fv else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_multi_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - int64_t fv + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - n = len(idx0) - k = len(idx1) + n = len(values) + k = len(indexer) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - outbuf = out + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int16_t, ndim=2] outbuf = out + int16_t fv + n = len(values) + k = len(indexer) if True and _checknan(fill_value): - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: raise ValueError('No NA values allowed') else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: fv = fill_value - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: outbuf[i, j] = fv else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_multi_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] idx0, - ndarray[int64_t] idx1, - out=None, fill_value=np.nan): +def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: Py_ssize_t i, 
j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - uint8_t fv + ndarray[int32_t, ndim=2] outbuf = out + int32_t fv - n = len(idx0) - k = len(idx1) + n = len(values) + k = len(indexer) - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) + if True and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - outbuf = out + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv + n = len(values) + k = len(indexer) if True and _checknan(fill_value): - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: raise ValueError('No NA values allowed') else: - for j in range(k): - if idx1[j] == -1: - raise ValueError('No NA values allowed') - else: - outbuf[i, j] = values[idx, idx1[j]] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: fv = fill_value - for i in range(n): - idx = idx0[i] + for j from 0 <= j < k: + idx = indexer[j] if idx == -1: - for j in range(k): + for i from 0 <= i < n: outbuf[i, j] = fv else: - for j in range(k): - if idx1[j] == -1: - outbuf[i, j] = fv - else: - outbuf[i, j] = values[idx, idx1[j]] - + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] -@cython.boundscheck(False) @cython.wraparound(False) -def diff_2d_float64(ndarray[float64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): +@cython.boundscheck(False) +def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, sx, sy + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + n = len(values) + k = len(indexer) + + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] -@cython.boundscheck(False) + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + @cython.wraparound(False) -def 
diff_2d_float32(ndarray[float32_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): +@cython.boundscheck(False) +def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, sx, sy + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf = out + int32_t fv - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + n = len(values) + k = len(indexer) + + if True and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] -@cython.boundscheck(False) + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + @cython.wraparound(False) -def diff_2d_int8(ndarray[int8_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): +@cython.boundscheck(False) +def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, sx, sy + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + n = len(values) + k = len(indexer) + + if True and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] -@cython.boundscheck(False) + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + @cython.wraparound(False) -def diff_2d_int16(ndarray[int16_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): 
+@cython.boundscheck(False) +def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, sx, sy + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + n = len(values) + k = len(indexer) + + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] -@cython.boundscheck(False) + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + @cython.wraparound(False) -def diff_2d_int32(ndarray[int32_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): +@cython.boundscheck(False) +def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, sx, sy + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + n = len(values) + k = len(indexer) + + if True and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] -@cython.boundscheck(False) + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + @cython.wraparound(False) -def diff_2d_int64(ndarray[int64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): +@cython.boundscheck(False) +def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, + 
ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, sx, sy + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy + n = len(values) + k = len(indexer) + + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] @cython.wraparound(False) -@cython.wraparound(False) -def group_last_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' +@cython.boundscheck(False) +def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] + Py_ssize_t i, j, k, n, idx + ndarray[float32_t, ndim=2] outbuf = out + float32_t fv - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + n = len(values) + k = len(indexer) - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') else: - out[i, j] = resx[i, j] -@cython.wraparound(False) + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + @cython.wraparound(False) -def group_last_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' +@cython.boundscheck(False) +def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, N, K, lab - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) + 
Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - N, K = ( values).shape + n = len(values) + k = len(indexer) - for i in range(N): - lab = labels[i] - if lab < 0: - continue + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] - counts[lab] += 1 - for j in range(K): - val = values[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + n = len(values) + k = len(indexer) - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') else: - out[i, j] = resx[i, j] + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] @cython.wraparound(False) -@cython.wraparound(False) -def group_last_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' +@cython.boundscheck(False) +def take_2d_axis1_object_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] resx, nobs + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf = out + object fv - nobs = np.zeros_like(out) - resx = np.empty_like(out) + n = len(values) + k = len(indexer) - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) + if False and _checknan(fill_value): + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] else: - ngroups = len(bins) + 1 - - N, K = ( values).shape + fv = fill_value + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = fv + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[uint8_t, ndim=2] outbuf = out + uint8_t fv - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val + n = len(idx0) + k = len(idx1) - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + 
if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') else: - out[i, j] = resx[i, j] -@cython.wraparound(False) + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + @cython.wraparound(False) -def group_last_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' +@cython.boundscheck(False) +def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values, + indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, N, K, ngroups, b - float32_t val, count - ndarray[float32_t, ndim=2] resx, nobs + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[object, ndim=2] outbuf = out + object fv - nobs = np.zeros_like(out) - resx = np.empty_like(out) + n = len(idx0) + k = len(idx1) - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = True if values[idx, idx1[j]] > 0 else False else: - ngroups = len(bins) + 1 + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = True if values[idx, idx1[j]] > 0 else False - N, K = ( values).shape - - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int8_t, ndim=2] outbuf = out + int8_t fv - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val + n = len(idx0) + k = len(idx1) - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') else: - out[i, j] = resx[i, j] + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] -@cython.boundscheck(False) @cython.wraparound(False) -def group_nth_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - ''' - Only aggregates on axis=0 - ''' +@cython.boundscheck(False) +def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values, + indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, 
N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int32_t, ndim=2] outbuf = out + int32_t fv - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + n = len(idx0) + k = len(idx1) - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') else: - out[i, j] = resx[i, j] -@cython.boundscheck(False) + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + @cython.wraparound(False) -def group_nth_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - ''' - Only aggregates on axis=0 - ''' +@cython.boundscheck(False) +def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values, + indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, N, K, lab - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv - N, K = ( values).shape + n = len(idx0) + k = len(idx1) - for i in range(N): - lab = labels[i] - if lab < 0: - continue + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] - counts[lab] += 1 - for j in range(K): - val = values[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + n = len(idx0) + k = len(idx1) - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') else: - out[i, j] = resx[i, j] + for j from 0 <= j < k: + if idx1[j] == -1: + raise 
ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] -@cython.boundscheck(False) @cython.wraparound(False) -def group_nth_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins, int64_t rank): - ''' - Only aggregates on axis=0 - ''' +@cython.boundscheck(False) +def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values, + indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] resx, nobs + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int16_t, ndim=2] outbuf = out + int16_t fv - nobs = np.zeros_like(out) - resx = np.empty_like(out) + n = len(idx0) + k = len(idx1) - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] else: - ngroups = len(bins) + 1 - - N, K = ( values).shape + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int32_t, ndim=2] outbuf = out + int32_t fv - counts[b] += 1 - for j in range(K): - val = values[i, j] + n = len(idx0) + k = len(idx1) - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv + + n = len(idx0) + k = len(idx1) + + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') else: - out[i, j] = resx[i, j] + for j from 0 <= j < k: + if idx1[j] == -1: + raise 
ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) @cython.boundscheck(False) +def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv + + n = len(idx0) + k = len(idx1) + + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + @cython.wraparound(False) -def group_nth_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins, int64_t rank): - ''' - Only aggregates on axis=0 - ''' +@cython.boundscheck(False) +def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values, + indexer, + out, fill_value=np.nan): cdef: - Py_ssize_t i, j, N, K, ngroups, b - float32_t val, count - ndarray[float32_t, ndim=2] resx, nobs + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int32_t, ndim=2] outbuf = out + int32_t fv - nobs = np.zeros_like(out) - resx = np.empty_like(out) + n = len(idx0) + k = len(idx1) - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] else: - ngroups = len(bins) + 1 + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] - N, K = ( values).shape +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + n = len(idx0) + k = len(idx1) - counts[b] += 1 - for j in range(K): - val = values[i, j] + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for 
j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan + n = len(idx0) + k = len(idx1) + + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[int64_t, ndim=2] outbuf = out + int64_t fv + + n = len(idx0) + k = len(idx1) + + if True and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv + + n = len(idx0) + k = len(idx1) + + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[float32_t, ndim=2] outbuf = out + float32_t fv + + n = len(idx0) + k = len(idx1) + + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values 
allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv + + n = len(idx0) + k = len(idx1) + + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[float64_t, ndim=2] outbuf = out + float64_t fv + + n = len(idx0) + k = len(idx1) + + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_object_object(ndarray[object, ndim=2] values, + indexer, + out, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + ndarray[object, ndim=2] outbuf = out + object fv + + n = len(idx0) + k = len(idx1) + + if False and _checknan(fill_value): + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float64(ndarray[float64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in 
range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float32(ndarray[float32_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int8(ndarray[int8_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int16(ndarray[int16_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + 
for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int32(ndarray[int32_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int64(ndarray[int64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy else: - out[i, j] = resx[i, j] + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] -@cython.boundscheck(False) @cython.wraparound(False) -def group_add_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): +@cython.wraparound(False) +def group_last_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + 
counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + nobs[lab, j] += 1 + resx[lab, j] = val for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] -@cython.boundscheck(False) + out[i, j] = resx[i, j] @cython.wraparound(False) -def group_add_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): +@cython.wraparound(False) +def group_last_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + nobs[lab, j] += 1 + resx[lab, j] = val for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] + out[i, j] = resx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) -def group_add_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): +@cython.wraparound(False) +def group_last_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, ngroups, b, nbins + Py_ssize_t i, j, N, K, ngroups, b float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs + ndarray[float64_t, ndim=2] resx, nobs nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + resx = np.empty_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 + N, K = ( values).shape b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - val = values[i, 0] + counts[b] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val + nobs[b, j] += 1 + resx[b, j] = val for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] -@cython.boundscheck(False) + out[i, j] = resx[i, j] @cython.wraparound(False) -def group_add_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): +@cython.wraparound(False) +def group_last_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + 
ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, ngroups, b, nbins + Py_ssize_t i, j, N, K, ngroups, b float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs + ndarray[float32_t, ndim=2] resx, nobs nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + resx = np.empty_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 + N, K = ( values).shape b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - val = values[i, 0] + counts[b] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val + nobs[b, j] += 1 + resx[b, j] = val for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] + out[i, j] = resx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_prod_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): +def group_nth_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs - counts[lab] += 1 - val = values[i, 0] + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = prodx[i, j] + out[i, j] = resx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_prod_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): +def group_nth_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, lab float32_t val, count - ndarray[float32_t, ndim=2] prodx, nobs + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs - nobs = np.zeros_like(out) - prodx = np.ones_like(out) + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, 
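
# -- Reference sketch for the semantics of the new group_last_* / group_nth_*
#    kernels above: per group (labels >= 0, -1 meaning "skip this row"), keep
#    the last, respectively the rank-th (1-based), non-NaN value of each
#    column; groups with no observation come back as NaN.  The names and the
#    explicit `ngroups` argument below are illustrative assumptions, not part
#    of the diff:
import numpy as np

def group_nth_ref(values, labels, rank, ngroups):
    N, K = values.shape
    nobs = np.zeros((ngroups, K), dtype=np.int64)
    out = np.full((ngroups, K), np.nan)
    for i in range(N):
        lab = labels[i]
        if lab < 0:
            continue
        for j in range(K):
            val = values[i, j]
            if val == val:                 # skip NaN, like the kernel does
                nobs[lab, j] += 1
                if nobs[lab, j] == rank:   # rank-th non-NaN value wins
                    out[lab, j] = val
    return out

def group_last_ref(values, labels, ngroups):
    # "last" is the same accumulation, but every non-NaN value overwrites
    # the previous one, so the final write is the last non-NaN per group.
    out = np.full((ngroups, values.shape[1]), np.nan)
    for i, lab in enumerate(labels):
        if lab < 0:
            continue
        row = values[i]
        mask = row == row
        out[lab, mask] = row[mask]
    return out
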
j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val for i in range(len(counts)): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = prodx[i, j] + out[i, j] = resx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_prod_bin_float64(ndarray[float64_t, ndim=2] out, +def group_nth_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): + ndarray[int64_t] bins, int64_t rank): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, ngroups, b float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs + ndarray[float64_t, ndim=2] resx, nobs nobs = np.zeros_like(out) - prodx = np.ones_like(out) + resx = np.empty_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 + N, K = ( values).shape b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - prodx[b, j] *= val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - val = values[i, 0] + counts[b] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[b, 0] += 1 - prodx[b, 0] *= val + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = prodx[i, j] + out[i, j] = resx[i, j] @cython.boundscheck(False) @cython.wraparound(False) -def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, +def group_nth_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): + ndarray[int64_t] bins, int64_t rank): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, ngroups, b float32_t val, count - ndarray[float32_t, ndim=2] prodx, nobs + ndarray[float32_t, ndim=2] resx, nobs nobs = np.zeros_like(out) - prodx = np.ones_like(out) + resx = np.empty_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 + N, K = ( values).shape b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - prodx[b, j] *= val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - val = values[i, 0] + counts[b] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[b, 0] += 1 - prodx[b, 0] *= val + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val for i in range(ngroups): for j in range(K): if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = prodx[i, j] + out[i, j] = resx[i, j] -@cython.wraparound(False) @cython.boundscheck(False) -def group_var_float64(ndarray[float64_t, ndim=2] out, +@cython.wraparound(False) +def group_add_float64(ndarray[float64_t, ndim=2] out, 
ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, lab - float64_t val, ct - ndarray[float64_t, ndim=2] nobs, sumx, sumxx + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) N, K = ( values).shape if K > 1: for i in range(N): - lab = labels[i] if lab < 0: continue counts[lab] += 1 - for j in range(K): val = values[i, j] @@ -5215,57 +5383,52 @@ def group_var_float64(ndarray[float64_t, ndim=2] out, if val == val: nobs[lab, j] += 1 sumx[lab, j] += val - sumxx[lab, j] += val * val else: for i in range(N): - lab = labels[i] if lab < 0: continue counts[lab] += 1 val = values[i, 0] + # not nan if val == val: nobs[lab, 0] += 1 sumx[lab, 0] += val - sumxx[lab, 0] += val * val - for i in range(len(counts)): for j in range(K): - ct = nobs[i, j] - if ct < 2: + if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) -@cython.wraparound(False) + out[i, j] = sumx[i, j] @cython.boundscheck(False) -def group_var_float32(ndarray[float32_t, ndim=2] out, +@cython.wraparound(False) +def group_add_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, lab - float32_t val, ct - ndarray[float32_t, ndim=2] nobs, sumx, sumxx + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) N, K = ( values).shape if K > 1: for i in range(N): - lab = labels[i] if lab < 0: continue counts[lab] += 1 - for j in range(K): val = values[i, j] @@ -5273,53 +5436,48 @@ def group_var_float32(ndarray[float32_t, ndim=2] out, if val == val: nobs[lab, j] += 1 sumx[lab, j] += val - sumxx[lab, j] += val * val else: for i in range(N): - lab = labels[i] if lab < 0: continue counts[lab] += 1 val = values[i, 0] + # not nan if val == val: nobs[lab, 0] += 1 sumx[lab, 0] += val - sumxx[lab, 0] += val * val - for i in range(len(counts)): for j in range(K): - ct = nobs[i, j] - if ct < 2: + if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + out[i, j] = sumx[i, j] -@cython.wraparound(False) @cython.boundscheck(False) -def group_var_bin_float64(ndarray[float64_t, ndim=2] out, +@cython.wraparound(False) +def group_add_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[int64_t] bins): - + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, N, K, ngroups, b - float64_t val, ct - ndarray[float64_t, ndim=2] nobs, sumx, sumxx + Py_ssize_t i, j, N, K, ngroups, b, nbins + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape b = 0 @@ -5329,7 +5487,6 @@ def group_var_bin_float64(ndarray[float64_t, ndim=2] out, b += 1 counts[b] += 1 - for j in range(K): val = values[i, j] @@ -5337,7 +5494,6 @@ def group_var_bin_float64(ndarray[float64_t, ndim=2] out, if val == val: nobs[b, j] += 1 sumx[b, j] += val - sumxx[b, j] += val * val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5350,37 +5506,34 @@ def 
group_var_bin_float64(ndarray[float64_t, ndim=2] out, if val == val: nobs[b, 0] += 1 sumx[b, 0] += val - sumxx[b, 0] += val * val for i in range(ngroups): for j in range(K): - ct = nobs[i, j] - if ct < 2: + if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) -@cython.wraparound(False) + out[i, j] = sumx[i, j] @cython.boundscheck(False) -def group_var_bin_float32(ndarray[float32_t, ndim=2] out, +@cython.wraparound(False) +def group_add_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, ndarray[int64_t] bins): - + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, N, K, ngroups, b - float32_t val, ct - ndarray[float32_t, ndim=2] nobs, sumx, sumxx + Py_ssize_t i, j, N, K, ngroups, b, nbins + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape b = 0 @@ -5390,7 +5543,6 @@ def group_var_bin_float32(ndarray[float32_t, ndim=2] out, b += 1 counts[b] += 1 - for j in range(K): val = values[i, j] @@ -5398,7 +5550,6 @@ def group_var_bin_float32(ndarray[float32_t, ndim=2] out, if val == val: nobs[b, j] += 1 sumx[b, j] += val - sumxx[b, j] += val * val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5411,30 +5562,30 @@ def group_var_bin_float32(ndarray[float32_t, ndim=2] out, if val == val: nobs[b, 0] += 1 sumx[b, 0] += val - sumxx[b, 0] += val * val for i in range(ngroups): for j in range(K): - ct = nobs[i, j] - if ct < 2: + if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + out[i, j] = sumx[i, j] -@cython.wraparound(False) @cython.boundscheck(False) -def group_mean_float64(ndarray[float64_t, ndim=2] out, +@cython.wraparound(False) +def group_prod_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, lab float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs + ndarray[float64_t, ndim=2] prodx, nobs nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + prodx = np.ones_like(out) N, K = ( values).shape @@ -5447,10 +5598,11 @@ def group_mean_float64(ndarray[float64_t, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] + # not nan if val == val: nobs[lab, j] += 1 - sumx[lab, j] += val + prodx[lab, j] *= val else: for i in range(N): lab = labels[i] @@ -5459,31 +5611,34 @@ def group_mean_float64(ndarray[float64_t, ndim=2] out, counts[lab] += 1 val = values[i, 0] + # not nan if val == val: nobs[lab, 0] += 1 - sumx[lab, 0] += val + prodx[lab, 0] *= val for i in range(len(counts)): for j in range(K): - count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] / count -@cython.wraparound(False) + out[i, j] = prodx[i, j] @cython.boundscheck(False) -def group_mean_float32(ndarray[float32_t, ndim=2] out, +@cython.wraparound(False) +def group_prod_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, lab float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs + ndarray[float32_t, ndim=2] prodx, nobs nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + prodx 
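
# -- Reference sketch for group_add_* and the *_bin_* convention used above:
#    the label-based kernels take one int64 label per row (-1 = skip), while
#    the bin-based kernels take increasing group end positions `bins`, with
#    ngroups = len(bins) when the last edge equals len(values) and
#    len(bins) + 1 otherwise.  NaNs are skipped and empty groups yield NaN.
#    Function names below are assumptions for illustration:
import numpy as np

def group_add_ref(values, labels, ngroups):
    N, K = values.shape
    nobs = np.zeros((ngroups, K))
    sumx = np.zeros((ngroups, K))
    for i in range(N):
        lab = labels[i]
        if lab < 0:
            continue
        row = values[i]
        mask = row == row                     # non-NaN entries only
        nobs[lab] += mask
        sumx[lab] += np.where(mask, row, 0.0)
    return np.where(nobs > 0, sumx, np.nan)

def bins_to_labels(bins, nvalues):
    # Translate the bin-edge convention of the *_bin_* kernels into labels:
    # group b covers rows bins[b - 1] <= i < bins[b] (group 0 starts at 0).
    ngroups = len(bins) if bins[-1] == nvalues else len(bins) + 1
    labels = np.zeros(nvalues, dtype=np.int64)
    b = 0
    for i in range(nvalues):
        while b < ngroups - 1 and i >= bins[b]:
            b += 1
        labels[i] = b
    return labels, ngroups
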
= np.ones_like(out) N, K = ( values).shape @@ -5496,10 +5651,11 @@ def group_mean_float32(ndarray[float32_t, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] + # not nan if val == val: nobs[lab, j] += 1 - sumx[lab, j] += val + prodx[lab, j] *= val else: for i in range(N): lab = labels[i] @@ -5508,37 +5664,41 @@ def group_mean_float32(ndarray[float32_t, ndim=2] out, counts[lab] += 1 val = values[i, 0] + # not nan if val == val: nobs[lab, 0] += 1 - sumx[lab, 0] += val + prodx[lab, 0] *= val for i in range(len(counts)): for j in range(K): - count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] / count - + out[i, j] = prodx[i, j] -def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, ngroups, b float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs + ndarray[float64_t, ndim=2] prodx, nobs nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + prodx = np.ones_like(out) - N, K = ( values).shape if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 + N, K = ( values).shape b = 0 if K > 1: @@ -5553,7 +5713,7 @@ def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, # not nan if val == val: nobs[b, j] += 1 - sumx[b, j] += val + prodx[b, j] *= val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5565,33 +5725,36 @@ def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, # not nan if val == val: nobs[b, 0] += 1 - sumx[b, 0] += val + prodx[b, 0] *= val for i in range(ngroups): for j in range(K): - count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] / count - -def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): + out[i, j] = prodx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' cdef: Py_ssize_t i, j, N, K, ngroups, b float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs + ndarray[float32_t, ndim=2] prodx, nobs nobs = np.zeros_like(out) - sumx = np.zeros_like(out) + prodx = np.ones_like(out) - N, K = ( values).shape if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 + N, K = ( values).shape b = 0 if K > 1: @@ -5606,7 +5769,7 @@ def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, # not nan if val == val: nobs[b, j] += 1 - sumx[b, j] += val + prodx[b, j] *= val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5618,149 +5781,147 @@ def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, # not nan if val == val: nobs[b, 0] += 1 - sumx[b, 0] += val + prodx[b, 0] *= val for i in range(ngroups): for j in range(K): - count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = sumx[i, j] / count + out[i, j] = prodx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def group_min_float64(ndarray[float64_t, ndim=2] out, +def group_var_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, 
ndarray[float64_t, ndim=2] values, ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' cdef: Py_ssize_t i, j, N, K, lab - float64_t val, count - ndarray[float64_t, ndim=2] minx, nobs + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) N, K = ( values).shape if K > 1: for i in range(N): + lab = labels[i] if lab < 0: continue counts[lab] += 1 + for j in range(K): val = values[i, j] # not nan if val == val: nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val + sumx[lab, j] += val + sumxx[lab, j] += val * val else: for i in range(N): + lab = labels[i] if lab < 0: continue counts[lab] += 1 val = values[i, 0] - # not nan if val == val: nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + for i in range(len(counts)): for j in range(K): - if nobs[i, j] == 0: + ct = nobs[i, j] + if ct < 2: out[i, j] = nan else: - out[i, j] = minx[i, j] + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) @cython.wraparound(False) @cython.boundscheck(False) -def group_min_float32(ndarray[float32_t, ndim=2] out, +def group_var_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' cdef: Py_ssize_t i, j, N, K, lab - float32_t val, count - ndarray[float32_t, ndim=2] minx, nobs + float32_t val, ct + ndarray[float32_t, ndim=2] nobs, sumx, sumxx nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) N, K = ( values).shape if K > 1: for i in range(N): + lab = labels[i] if lab < 0: continue counts[lab] += 1 + for j in range(K): val = values[i, j] # not nan if val == val: nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val + sumx[lab, j] += val + sumxx[lab, j] += val * val else: for i in range(N): + lab = labels[i] if lab < 0: continue counts[lab] += 1 val = values[i, 0] - # not nan if val == val: nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + for i in range(len(counts)): for j in range(K): - if nobs[i, j] == 0: + ct = nobs[i, j] + if ct < 2: out[i, j] = nan else: - out[i, j] = minx[i, j] + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) @cython.wraparound(False) @cython.boundscheck(False) -def group_min_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' +def group_var_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + cdef: Py_ssize_t i, j, N, K, ngroups, b - float64_t val, count - ndarray[float64_t, ndim=2] minx, nobs + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -5776,14 +5937,15 @@ def group_min_bin_float64(ndarray[float64_t, ndim=2] out, b += 1 counts[b] += 1 + for j in range(K): val = values[i, j] # not nan if val == val: nobs[b, j] += 1 - if val < minx[b, j]: - minx[b, j] = val + sumx[b, j] += val + sumxx[b, j] += val * val else: for i in 
range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5795,33 +5957,32 @@ def group_min_bin_float64(ndarray[float64_t, ndim=2] out, # not nan if val == val: nobs[b, 0] += 1 - if val < minx[b, 0]: - minx[b, 0] = val + sumx[b, 0] += val + sumxx[b, 0] += val * val for i in range(ngroups): for j in range(K): - if nobs[i, j] == 0: + ct = nobs[i, j] + if ct < 2: out[i, j] = nan else: - out[i, j] = minx[i, j] + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) @cython.wraparound(False) @cython.boundscheck(False) -def group_min_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' +def group_var_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + cdef: Py_ssize_t i, j, N, K, ngroups, b - float32_t val, count - ndarray[float32_t, ndim=2] minx, nobs + float32_t val, ct + ndarray[float32_t, ndim=2] nobs, sumx, sumxx nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -5837,14 +5998,15 @@ def group_min_bin_float32(ndarray[float32_t, ndim=2] out, b += 1 counts[b] += 1 + for j in range(K): val = values[i, j] # not nan if val == val: nobs[b, j] += 1 - if val < minx[b, j]: - minx[b, j] = val + sumx[b, j] += val + sumxx[b, j] += val * val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -5856,34 +6018,31 @@ def group_min_bin_float32(ndarray[float32_t, ndim=2] out, # not nan if val == val: nobs[b, 0] += 1 - if val < minx[b, 0]: - minx[b, 0] = val + sumx[b, 0] += val + sumxx[b, 0] += val * val for i in range(ngroups): for j in range(K): - if nobs[i, j] == 0: + ct = nobs[i, j] + if ct < 2: out[i, j] = nan else: - out[i, j] = minx[i, j] + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) @cython.wraparound(False) @cython.boundscheck(False) -def group_max_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - ''' - Only aggregates on axis=0 - ''' +def group_mean_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): cdef: Py_ssize_t i, j, N, K, lab float64_t val, count - ndarray[float64_t, ndim=2] maxx, nobs + ndarray[float64_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) + sumx = np.zeros_like(out) N, K = ( values).shape @@ -5896,12 +6055,10 @@ def group_max_float64(ndarray[float64_t, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] - # not nan if val == val: nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val + sumx[lab, j] += val else: for i in range(N): lab = labels[i] @@ -5910,37 +6067,31 @@ def group_max_float64(ndarray[float64_t, ndim=2] out, counts[lab] += 1 val = values[i, 0] - # not nan if val == val: nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val + sumx[lab, 0] += val for i in range(len(counts)): for j in range(K): + count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = maxx[i, j] + out[i, j] = sumx[i, j] / count @cython.wraparound(False) @cython.boundscheck(False) -def group_max_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - ''' 
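
# -- Reference note: the group_var_* kernels above accumulate per-group sums
#    (sumx) and sums of squares (sumxx) of the non-NaN values and then emit
#        (ct * sumxx - sumx**2) / (ct**2 - ct)
#    which is algebraically the unbiased (ddof=1) sample variance
#    sum((x - mean)**2) / (ct - 1), with NaN wherever a group has fewer than
#    two observations.  Quick numerical check of that identity:
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=11)

ct = len(x)
sumx = x.sum()
sumxx = (x * x).sum()
kernel_style = (ct * sumxx - sumx * sumx) / (ct * ct - ct)

assert np.isclose(kernel_style, x.var(ddof=1))
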
- Only aggregates on axis=0 - ''' +def group_mean_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): cdef: Py_ssize_t i, j, N, K, lab float32_t val, count - ndarray[float32_t, ndim=2] maxx, nobs + ndarray[float32_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) + sumx = np.zeros_like(out) N, K = ( values).shape @@ -5953,12 +6104,10 @@ def group_max_float32(ndarray[float32_t, ndim=2] out, counts[lab] += 1 for j in range(K): val = values[i, j] - # not nan if val == val: nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val + sumx[lab, j] += val else: for i in range(N): lab = labels[i] @@ -5967,45 +6116,38 @@ def group_max_float32(ndarray[float32_t, ndim=2] out, counts[lab] += 1 val = values[i, 0] - # not nan if val == val: nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val + sumx[lab, 0] += val for i in range(len(counts)): for j in range(K): + count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = maxx[i, j] + out[i, j] = sumx[i, j] / count -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_bin_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' + +def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): cdef: Py_ssize_t i, j, N, K, ngroups, b float64_t val, count - ndarray[float64_t, ndim=2] maxx, nobs + ndarray[float64_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) - maxx = np.empty_like(out) - maxx.fill(-np.inf) + sumx = np.zeros_like(out) + N, K = ( values).shape if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape - b = 0 if K > 1: for i in range(N): @@ -6019,8 +6161,7 @@ def group_max_bin_float64(ndarray[float64_t, ndim=2] out, # not nan if val == val: nobs[b, j] += 1 - if val > maxx[b, j]: - maxx[b, j] = val + sumx[b, j] += val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: @@ -6032,40 +6173,34 @@ def group_max_bin_float64(ndarray[float64_t, ndim=2] out, # not nan if val == val: nobs[b, 0] += 1 - if val > maxx[b, 0]: - maxx[b, 0] = val + sumx[b, 0] += val for i in range(ngroups): for j in range(K): + count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = maxx[i, j] -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_bin_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): - ''' - Only aggregates on axis=0 - ''' + out[i, j] = sumx[i, j] / count + +def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): cdef: Py_ssize_t i, j, N, K, ngroups, b float32_t val, count - ndarray[float32_t, ndim=2] maxx, nobs + ndarray[float32_t, ndim=2] sumx, nobs nobs = np.zeros_like(out) - maxx = np.empty_like(out) - maxx.fill(-np.inf) + sumx = np.zeros_like(out) + N, K = ( values).shape if bins[len(bins) - 1] == len(values): ngroups = len(bins) else: ngroups = len(bins) + 1 - N, K = ( values).shape - b = 0 if K > 1: for i in range(N): @@ -6079,8 +6214,7 @@ def group_max_bin_float32(ndarray[float32_t, ndim=2] out, # not nan if val == val: nobs[b, j] += 1 - if val > maxx[b, j]: - maxx[b, j] = val + sumx[b, j] += val else: for i in 
range(N): while b < ngroups - 1 and i >= bins[b]: @@ -6092,103 +6226,149 @@ def group_max_bin_float32(ndarray[float32_t, ndim=2] out, # not nan if val == val: nobs[b, 0] += 1 - if val > maxx[b, 0]: - maxx[b, 0] = val + sumx[b, 0] += val for i in range(ngroups): for j in range(K): + count = nobs[i, j] if nobs[i, j] == 0: out[i, j] = nan else: - out[i, j] = maxx[i, j] + out[i, j] = sumx[i, j] / count @cython.wraparound(False) @cython.boundscheck(False) -def group_ohlc_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] bins): +def group_min_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): ''' Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, ngroups, b + Py_ssize_t i, j, N, K, lab float64_t val, count - float64_t vopen, vhigh, vlow, vclose, NA - bint got_first = 0 + ndarray[float64_t, ndim=2] minx, nobs - if bins[len(bins) - 1] == len(values): - ngroups = len(bins) - else: - ngroups = len(bins) + 1 + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) N, K = ( values).shape - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - NA = np.nan + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape - b = 0 if K > 1: - raise NotImplementedError + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val else: for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose - b += 1 - got_first = 0 + lab = labels[i] + if lab < 0: + continue - counts[b] += 1 + counts[lab] += 1 val = values[i, 0] # not nan if val == val: - if not got_first: - got_first = 1 - vopen = val - vlow = val - vhigh = val - else: - if val < vlow: - vlow = val - if val > vhigh: - vhigh = val - vclose = val + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose @cython.wraparound(False) @cython.boundscheck(False) -def 
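
# -- Reference sketch: group_mean_* is the group_add_* accumulation divided by
#    the per-group count of non-NaN values, with NaN for empty groups (the
#    kernels skip NaN inputs rather than propagating them).  Names below are
#    illustrative:
import numpy as np

def group_mean_ref(values, labels, ngroups):
    N, K = values.shape
    nobs = np.zeros((ngroups, K))
    sumx = np.zeros((ngroups, K))
    for i in range(N):
        lab = labels[i]
        if lab < 0:
            continue
        row = values[i]
        mask = row == row
        nobs[lab] += mask
        sumx[lab] += np.where(mask, row, 0.0)
    out = np.full((ngroups, K), np.nan)
    np.divide(sumx, nobs, out=out, where=nobs > 0)  # NaN stays where nobs == 0
    return out
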
group_ohlc_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] bins): +def group_min_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): ''' Only aggregates on axis=0 ''' cdef: Py_ssize_t i, j, N, K, ngroups, b - float32_t val, count - float32_t vopen, vhigh, vlow, vclose, NA - bint got_first = 0 + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) if bins[len(bins) - 1] == len(values): ngroups = len(bins) @@ -6197,691 +6377,723 @@ def group_ohlc_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - NA = np.nan - b = 0 if K > 1: - raise NotImplementedError + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val else: for i in range(N): while b < ngroups - 1 and i >= bins[b]: - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose b += 1 - got_first = 0 counts[b] += 1 val = values[i, 0] # not nan if val == val: - if not got_first: - got_first = 1 - vopen = val - vlow = val - vhigh = val - else: - if val < vlow: - vlow = val - if val > vhigh: - vhigh = val - vclose = val - - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique_float64(ndarray[float64_t] left, - ndarray[float64_t] right): +def group_min_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs - rval = right[j] + nobs = np.zeros_like(out) - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + minx = np.empty_like(out) + minx.fill(np.inf) - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float32_t lval, rval + N, K = ( values).shape - i = 0 - j = 0 - nleft = len(left) - nright = len(right) + b = 0 + if K > 1: + for i in range(N): + while b 
< ngroups - 1 and i >= bins[b]: + b += 1 - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + counts[b] += 1 + for j in range(K): + val = values[i, j] - if j == nright: - indexer[i] = -1 - i += 1 - continue + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - rval = right[j] + counts[b] += 1 + val = values[i, 0] - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + # not nan + if val == val: + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique_object(ndarray[object] left, - ndarray[object] right): +def group_max_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - object lval, rval + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs - i = 0 - j = 0 - nleft = len(left) - nright = len(right) + nobs = np.zeros_like(out) - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + maxx = np.empty_like(out) + maxx.fill(-np.inf) - if j == nright: - indexer[i] = -1 - i += 1 - continue + N, K = ( values).shape - rval = right[j] + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + counts[lab] += 1 + for j in range(K): + val = values[i, j] - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique_int8(ndarray[int8_t] left, - ndarray[int8_t] right): +def group_max_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int8_t lval, rval + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] maxx, nobs - i = 0 - j = 0 - nleft = len(left) - nright = len(right) + nobs = np.zeros_like(out) - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + maxx = np.empty_like(out) + maxx.fill(-np.inf) - if j == nright: - indexer[i] = -1 - i += 1 - continue + N, K = ( values).shape - rval = right[j] + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + 
continue - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + counts[lab] += 1 + for j in range(K): + val = values[i, j] - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique_int16(ndarray[int16_t] left, - ndarray[int16_t] right): +def group_max_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int16_t lval, rval + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs - i = 0 - j = 0 - nleft = len(left) - nright = len(right) + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 - if j == nright: - indexer[i] = -1 - i += 1 - continue + N, K = ( values).shape - rval = right[j] + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + counts[b] += 1 + for j in range(K): + val = values[i, j] - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique_int32(ndarray[int32_t] left, - ndarray[int32_t] right): +def group_max_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int32_t lval, rval + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] maxx, nobs - i = 0 - j = 0 - nleft = len(left) - nright = len(right) + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 - if j == nright: - indexer[i] = -1 - i += 1 - continue + N, K = ( values).shape - rval = right[j] + b = 0 + if K > 1: + 
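
# -- Reference sketch: the group_min_* / group_max_* kernels above keep a
#    running per-group minimum (seeded with +inf) or maximum (seeded with
#    -inf) over the non-NaN values and write NaN for groups with no
#    observation.  A combined pure-NumPy reference (names are illustrative):
import numpy as np

def group_minmax_ref(values, labels, ngroups, how="min"):
    N, K = values.shape
    init = np.inf if how == "min" else -np.inf
    best = np.full((ngroups, K), init)
    nobs = np.zeros((ngroups, K), dtype=np.int64)
    op = np.minimum if how == "min" else np.maximum
    for i in range(N):
        lab = labels[i]
        if lab < 0:
            continue
        row = values[i]
        mask = row == row
        nobs[lab] += mask
        best[lab] = np.where(mask, op(best[lab], row), best[lab])
    return np.where(nobs > 0, best, np.nan)
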
for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + counts[b] += 1 + for j in range(K): + val = values[i, j] - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique_int64(ndarray[int64_t] left, - ndarray[int64_t] right): +def group_ohlc_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + float64_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 - if j == nright: - indexer[i] = -1 - i += 1 - continue + N, K = ( values).shape - rval = right[j] + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 + NA = np.nan - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + counts[b] += 1 + val = values[i, 0] + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val -def left_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose +@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): ''' - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + float32_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 - nleft = len(left) - nright = len(right) + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break + N, K = ( values).shape - lval = left[i] - rval = right[j] + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 + NA = np.nan + + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 - # do it again now that result size is known + counts[b] += 1 + val = values[i, 0] - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose - lval = left[i] - rval = right[j] +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float64_t lval, rval - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 + i = 0 + j = 0 + nleft = len(left) + nright = len(right) - return result, lindexer, rindexer + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + if j == nright: + indexer[i] = -1 + i += 1 + continue -def left_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - ''' - Two-pass algorithm for monotonic indexes. 
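
# -- Reference sketch: the group_ohlc_* kernels above walk a single column of
#    values through consecutive bins and record, per bin, the first (open),
#    largest (high), smallest (low) and last (close) non-NaN value; empty bins
#    get four NaNs and more than one input column raises NotImplementedError.
#    A label-based reference of the same semantics (names are assumptions):
import numpy as np

def group_ohlc_ref(values, labels, ngroups):
    out = np.full((ngroups, 4), np.nan)    # columns: open, high, low, close
    for i, lab in enumerate(labels):
        val = values[i]
        if lab < 0 or val != val:          # skip unlabeled rows and NaN
            continue
        if np.isnan(out[lab, 0]):          # first observation in this bin
            out[lab] = (val, val, val, val)
        else:
            out[lab, 1] = max(out[lab, 1], val)
            out[lab, 2] = min(out[lab, 2], val)
            out[lab, 3] = val
    return out
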
Handles many-to-one merges - ''' + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float32(ndarray[float32_t] left, + ndarray[float32_t] right): cdef: - Py_ssize_t i, j, k, nright, nleft, count + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result + i = 0 + j = 0 nleft = len(left) nright = len(right) - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break - lval = left[i] - rval = right[j] + if j == nright: + indexer[i] = -1 + i += 1 + continue - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 + rval = right[j] - # do it again now that result size is known + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + object lval, rval i = 0 j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break + nleft = len(left) + nright = len(right) - lval = left[i] - rval = right[j] + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 + if j == nright: + indexer[i] = -1 + i += 1 + continue - return result, lindexer, rindexer + rval = right[j] + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 -def left_join_indexer_object(ndarray[object] left, - ndarray[object] right): - ''' - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - ''' + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int32(ndarray[int32_t] left, + ndarray[int32_t] right): cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int32_t lval, rval + i = 0 + j = 0 nleft = len(left) nright = len(right) - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break - lval = left[i] - rval = right[j] + if j == nright: + indexer[i] = -1 + i += 1 + continue - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 + rval = right[j] - # do it again now that result size is known + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int64_t lval, rval i = 0 j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break + nleft = len(left) + nright = len(right) - lval = left[i] - rval = right[j] + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j i += 1 - else: - j += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer - return result, lindexer, rindexer -def left_join_indexer_int8(ndarray[int8_t] left, - ndarray[int8_t] right): +def left_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): ''' Two-pass algorithm for monotonic indexes. 
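
# -- Reference sketch: the left_join_indexer_unique_* functions above walk two
#    monotonically increasing arrays (the right side effectively acting as the
#    unique index) and return, for every element of `left`, the position of
#    the equal element in `right`, or -1 when there is no match -- i.e. the
#    indexer that realigns `right` onto `left` for a left join.  Sorted inputs
#    are assumed here, as in the Cython version; the name is illustrative:
import numpy as np

def left_join_indexer_unique_ref(left, right):
    indexer = np.empty(len(left), dtype=np.int64)
    j = 0
    for i, lval in enumerate(left):
        while j < len(right) and right[j] < lval:
            j += 1
        indexer[i] = j if j < len(right) and right[j] == lval else -1
    return indexer

# e.g. left = [1, 2, 2, 4], right = [2, 3, 4]  ->  [-1, 0, 0, 2]
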
Handles many-to-one merges ''' cdef: Py_ssize_t i, j, k, nright, nleft, count - int8_t lval, rval + float64_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int8_t] result + ndarray[float64_t] result nleft = len(left) nright = len(right) @@ -6924,7 +7136,7 @@ def left_join_indexer_int8(ndarray[int8_t] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int8) + result = np.empty(count, dtype=np.float64) i = 0 j = 0 @@ -6974,16 +7186,16 @@ def left_join_indexer_int8(ndarray[int8_t] left, return result, lindexer, rindexer -def left_join_indexer_int16(ndarray[int16_t] left, - ndarray[int16_t] right): +def left_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): ''' Two-pass algorithm for monotonic indexes. Handles many-to-one merges ''' cdef: Py_ssize_t i, j, k, nright, nleft, count - int16_t lval, rval + float32_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int16_t] result + ndarray[float32_t] result nleft = len(left) nright = len(right) @@ -7026,7 +7238,7 @@ def left_join_indexer_int16(ndarray[int16_t] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int16) + result = np.empty(count, dtype=np.float32) i = 0 j = 0 @@ -7076,16 +7288,16 @@ def left_join_indexer_int16(ndarray[int16_t] left, return result, lindexer, rindexer -def left_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): +def left_join_indexer_object(ndarray[object] left, + ndarray[object] right): ''' Two-pass algorithm for monotonic indexes. Handles many-to-one merges ''' cdef: Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval + object lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result + ndarray[object] result nleft = len(left) nright = len(right) @@ -7128,7 +7340,7 @@ def left_join_indexer_int32(ndarray[int32_t] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=object) i = 0 j = 0 @@ -7178,16 +7390,16 @@ def left_join_indexer_int32(ndarray[int32_t] left, return result, lindexer, rindexer -def left_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): +def left_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): ''' Two-pass algorithm for monotonic indexes. 
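
# -- Reference sketch: the non-"unique" left_join_indexer_* functions above use
#    a two-pass strategy on sorted inputs -- a first walk only counts how many
#    output rows the join produces, then the key array and both indexers are
#    allocated at exactly that size and filled by a second, identical walk.
#    A compact reference of the resulting output triple for the many-to-one
#    case named in the docstring (unique right keys; name is illustrative):
import numpy as np

def left_join_indexer_ref(left, right):
    lindexer, rindexer, result = [], [], []
    j = 0
    for i, lval in enumerate(left):
        while j < len(right) and right[j] < lval:
            j += 1
        lindexer.append(i)
        result.append(lval)
        if j < len(right) and right[j] == lval:
            rindexer.append(j)
        else:
            rindexer.append(-1)          # no match on the right side
    return (np.asarray(result),
            np.asarray(lindexer, dtype=np.int64),
            np.asarray(rindexer, dtype=np.int64))
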
Handles many-to-one merges ''' cdef: Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval + int32_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result + ndarray[int32_t] result nleft = len(left) nright = len(right) @@ -7230,7 +7442,7 @@ def left_join_indexer_int64(ndarray[int64_t] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) i = 0 j = 0 @@ -7280,144 +7492,16 @@ def left_join_indexer_int64(ndarray[int64_t] left, return result, lindexer, rindexer -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nright): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): +def left_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' cdef: - Py_ssize_t i, j, nright, nleft, count - float32_t lval, rval + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result + ndarray[int64_t] result nleft = len(left) nright = len(right) @@ -7425,21 +7509,15 @@ def outer_join_indexer_float32(ndarray[float32_t] left, i = 0 j = 0 count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break + if nleft > 0: + while i < nleft: if j == nright: count += nleft - i break lval = left[i] rval = right[j] + if lval == rval: count += 1 if i < nleft - 1: @@ -7460,45 +7538,26 @@ def outer_join_indexer_float32(ndarray[float32_t] left, count += 1 i += 1 else: - count += 1 j += 1 + # do it again now that result size is known + lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - # do it again, but populate the indexers / result + result = np.empty(count, dtype=np.int64) i = 0 j = 0 count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nright): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break + if nleft > 0: + while i < nleft: if j == nright: while i < nleft: lindexer[count] = i rindexer[count] = -1 result[count] = left[i] - count += 1 i += 1 + count += 1 break lval = left[i] @@ -7526,27 +7585,24 @@ def outer_join_indexer_float32(ndarray[float32_t] left, elif lval < rval: lindexer[count] = i rindexer[count] = -1 - result[count] = lval + result[count] = left[i] count += 1 i += 1 else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 j += 1 return result, lindexer, rindexer + @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_object(ndarray[object] left, - ndarray[object] right): +def outer_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - object lval, rval + float64_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[object] result + ndarray[float64_t] result nleft = len(left) nright = len(right) @@ -7594,7 +7650,7 @@ def outer_join_indexer_object(ndarray[object] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) + result = np.empty(count, dtype=np.float64) # do it again, but populate the indexers / result @@ -7669,13 +7725,13 @@ def outer_join_indexer_object(ndarray[object] left, @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_int8(ndarray[int8_t] left, - ndarray[int8_t] right): +def outer_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - int8_t lval, rval + float32_t lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int8_t] result + ndarray[float32_t] result nleft = len(left) nright = len(right) @@ -7723,7 +7779,7 @@ def outer_join_indexer_int8(ndarray[int8_t] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int8) + result = np.empty(count, dtype=np.float32) # do it again, but populate the indexers / result @@ -7798,13 +7854,13 
@@ def outer_join_indexer_int8(ndarray[int8_t] left, @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_int16(ndarray[int16_t] left, - ndarray[int16_t] right): +def outer_join_indexer_object(ndarray[object] left, + ndarray[object] right): cdef: Py_ssize_t i, j, nright, nleft, count - int16_t lval, rval + object lval, rval ndarray[int64_t] lindexer, rindexer - ndarray[int16_t] result + ndarray[object] result nleft = len(left) nright = len(right) @@ -7852,7 +7908,7 @@ def outer_join_indexer_int16(ndarray[int16_t] left, lindexer = np.empty(count, dtype=np.int64) rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int16) + result = np.empty(count, dtype=object) # do it again, but populate the indexers / result @@ -8463,192 +8519,6 @@ def inner_join_indexer_object(ndarray[object] left, return result, lindexer, rindexer -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int8(ndarray[int8_t] left, - ndarray[int8_t] right): - ''' - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - ''' - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int8_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int8_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int8) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int16(ndarray[int16_t] left, - ndarray[int16_t] right): - ''' - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - ''' - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int16_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int16_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int16) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - @cython.wraparound(False) @cython.boundscheck(False) def inner_join_indexer_int32(ndarray[int32_t] left, diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 1017f9cd7c503..e9f19e0fbad11 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -286,59 +286,295 @@ class TestTake(unittest.TestCase): _multiprocess_can_split_ = True def test_1d_with_out(self): - def _test_dtype(dtype): - out = np.empty(5, dtype=dtype) - arr = np.random.randn(10).astype(dtype) - indexer = [0, 2, 4, 7, 1] + def _test_dtype(dtype, can_hold_na): + data = np.random.randint(0, 2, 4).astype(dtype) - arr.take(indexer, out=out) - expected = arr.take(indexer) + indexer = [2, 1, 0, 1] + out = np.empty(4, dtype=dtype) + com.take_1d(data, indexer, out=out) + expected = data.take(indexer) tm.assert_almost_equal(out, expected) - _test_dtype(np.float64) - _test_dtype(np.float32) - _test_dtype(np.int32) - _test_dtype(np.int64) - _test_dtype(np.object_) - _test_dtype(np.bool) - - def test_1d_upcast_with_out(self): - def _test_dtype(dtype): + indexer = [2, 1, 0, -1] out = np.empty(4, dtype=dtype) - data = np.random.randint(0, 2, 5).astype(dtype) + if can_hold_na: + com.take_1d(data, indexer, out=out) + expected = data.take(indexer) + expected[3] = np.nan + tm.assert_almost_equal(out, expected) + else: + self.assertRaises(Exception, com.take_1d, data, + indexer, out=out) + # no exception o/w + data.take(indexer, out=out) + + _test_dtype(np.float64, True) + _test_dtype(np.float32, True) + _test_dtype(np.uint64, False) + _test_dtype(np.uint32, False) + _test_dtype(np.uint16, False) + _test_dtype(np.uint8, False) + _test_dtype(np.int64, False) + _test_dtype(np.int32, False) + _test_dtype(np.int16, False) + _test_dtype(np.int8, False) + _test_dtype(np.object_, True) + _test_dtype(np.bool, False) + + def test_1d_fill_nonna(self): + def _test_dtype(dtype, fill_value, out_dtype): + data = np.random.randint(0, 2, 4).astype(dtype) indexer = [2, 1, 0, -1] - self.assertRaises(Exception, com.take_1d, data, - indexer, out=out) - _test_dtype(np.int64) - _test_dtype(np.int32) - 
_test_dtype(np.int16) - _test_dtype(np.int8) - _test_dtype(np.bool) + result = com.take_1d(data, indexer, fill_value=fill_value) + assert((result[[0, 1, 2]] == data[[2, 1, 0]]).all()) + assert(result[3] == fill_value) + assert(result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + + result = com.take_1d(data, indexer, fill_value=fill_value) + assert((result[[0, 1, 2, 3]] == data[indexer]).all()) + assert(result.dtype == dtype) + + _test_dtype(np.int8, np.int16(127), np.int8) + _test_dtype(np.int8, np.int16(128), np.int16) + _test_dtype(np.int32, 1, np.int32) + _test_dtype(np.int32, 2.0, np.float64) + _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) + _test_dtype(np.int32, True, np.object_) + _test_dtype(np.int32, '', np.object_) + _test_dtype(np.float64, 1, np.float64) + _test_dtype(np.float64, 2.0, np.float64) + _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) + _test_dtype(np.float64, True, np.object_) + _test_dtype(np.float64, '', np.object_) + _test_dtype(np.complex128, 1, np.complex128) + _test_dtype(np.complex128, 2.0, np.complex128) + _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) + _test_dtype(np.complex128, True, np.object_) + _test_dtype(np.complex128, '', np.object_) + _test_dtype(np.bool_, 1, np.object_) + _test_dtype(np.bool_, 2.0, np.object_) + _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) + _test_dtype(np.bool_, True, np.bool_) + _test_dtype(np.bool_, '', np.object_) + + def test_2d_with_out(self): + def _test_dtype(dtype, can_hold_na): + data = np.random.randint(0, 2, (5, 3)).astype(dtype) - def test_2d_upcast_with_out(self): - def _test_dtype(dtype): + indexer = [2, 1, 0, 1] out0 = np.empty((4, 3), dtype=dtype) out1 = np.empty((5, 4), dtype=dtype) + com.take_nd(data, indexer, out=out0, axis=0) + com.take_nd(data, indexer, out=out1, axis=1) + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + indexer = [2, 1, 0, -1] + out0 = np.empty((4, 3), dtype=dtype) + out1 = np.empty((5, 4), dtype=dtype) + if can_hold_na: + com.take_nd(data, indexer, out=out0, axis=0) + com.take_nd(data, indexer, out=out1, axis=1) + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected0[3, :] = np.nan + expected1[:, 3] = np.nan + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + else: + self.assertRaises(Exception, com.take_nd, data, + indexer, out=out0, axis=0) + self.assertRaises(Exception, com.take_nd, data, + indexer, out=out1, axis=1) + # no exception o/w + data.take(indexer, out=out0, axis=0) + data.take(indexer, out=out1, axis=1) + + _test_dtype(np.float64, True) + _test_dtype(np.float32, True) + _test_dtype(np.uint64, False) + _test_dtype(np.uint32, False) + _test_dtype(np.uint16, False) + _test_dtype(np.uint8, False) + _test_dtype(np.int64, False) + _test_dtype(np.int32, False) + _test_dtype(np.int16, False) + _test_dtype(np.int8, False) + _test_dtype(np.object_, True) + _test_dtype(np.bool, False) + + def test_2d_fill_nonna(self): + def _test_dtype(dtype, fill_value, out_dtype): data = np.random.randint(0, 2, (5, 3)).astype(dtype) indexer = [2, 1, 0, -1] - self.assertRaises(Exception, com.take_2d, data, - indexer, out=out0, axis=0) - self.assertRaises(Exception, com.take_2d, data, - indexer, out=out1, axis=1) - # no exception o/w - data.take(indexer, out=out0, axis=0) - data.take(indexer, out=out1, axis=1) + result = com.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert((result[[0, 1, 
2], :] == data[[2, 1, 0], :]).all()) + assert((result[3, :] == fill_value).all()) + assert(result.dtype == out_dtype) + + result = com.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert((result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all()) + assert((result[:, 3] == fill_value).all()) + assert(result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + + result = com.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert((result[[0, 1, 2, 3], :] == data[indexer, :]).all()) + assert(result.dtype == dtype) + + result = com.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert((result[:, [0, 1, 2, 3]] == data[:, indexer]).all()) + assert(result.dtype == dtype) + + _test_dtype(np.int8, np.int16(127), np.int8) + _test_dtype(np.int8, np.int16(128), np.int16) + _test_dtype(np.int32, 1, np.int32) + _test_dtype(np.int32, 2.0, np.float64) + _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) + _test_dtype(np.int32, True, np.object_) + _test_dtype(np.int32, '', np.object_) + _test_dtype(np.float64, 1, np.float64) + _test_dtype(np.float64, 2.0, np.float64) + _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) + _test_dtype(np.float64, True, np.object_) + _test_dtype(np.float64, '', np.object_) + _test_dtype(np.complex128, 1, np.complex128) + _test_dtype(np.complex128, 2.0, np.complex128) + _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) + _test_dtype(np.complex128, True, np.object_) + _test_dtype(np.complex128, '', np.object_) + _test_dtype(np.bool_, 1, np.object_) + _test_dtype(np.bool_, 2.0, np.object_) + _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) + _test_dtype(np.bool_, True, np.bool_) + _test_dtype(np.bool_, '', np.object_) + + def test_3d_with_out(self): + def _test_dtype(dtype, can_hold_na): + data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) + + indexer = [2, 1, 0, 1] + out0 = np.empty((4, 4, 3), dtype=dtype) + out1 = np.empty((5, 4, 3), dtype=dtype) + out2 = np.empty((5, 4, 4), dtype=dtype) + com.take_nd(data, indexer, out=out0, axis=0) + com.take_nd(data, indexer, out=out1, axis=1) + com.take_nd(data, indexer, out=out2, axis=2) + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected2 = data.take(indexer, axis=2) + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + tm.assert_almost_equal(out2, expected2) - _test_dtype(np.int64) - _test_dtype(np.int32) - _test_dtype(np.int16) - _test_dtype(np.int8) - _test_dtype(np.bool) + indexer = [2, 1, 0, -1] + out0 = np.empty((4, 4, 3), dtype=dtype) + out1 = np.empty((5, 4, 3), dtype=dtype) + out2 = np.empty((5, 4, 4), dtype=dtype) + if can_hold_na: + com.take_nd(data, indexer, out=out0, axis=0) + com.take_nd(data, indexer, out=out1, axis=1) + com.take_nd(data, indexer, out=out2, axis=2) + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected2 = data.take(indexer, axis=2) + expected0[3, :, :] = np.nan + expected1[:, 3, :] = np.nan + expected2[:, :, 3] = np.nan + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + tm.assert_almost_equal(out2, expected2) + else: + self.assertRaises(Exception, com.take_nd, data, + indexer, out=out0, axis=0) + self.assertRaises(Exception, com.take_nd, data, + indexer, out=out1, axis=1) + self.assertRaises(Exception, com.take_nd, data, + indexer, out=out2, axis=2) + # no exception o/w + data.take(indexer, out=out0, axis=0) + data.take(indexer, out=out1, axis=1) + data.take(indexer, out=out2, axis=2) + + _test_dtype(np.float64, True) + _test_dtype(np.float32, 
True) + _test_dtype(np.uint64, False) + _test_dtype(np.uint32, False) + _test_dtype(np.uint16, False) + _test_dtype(np.uint8, False) + _test_dtype(np.int64, False) + _test_dtype(np.int32, False) + _test_dtype(np.int16, False) + _test_dtype(np.int8, False) + _test_dtype(np.object_, True) + _test_dtype(np.bool, False) + + def test_3d_fill_nonna(self): + def _test_dtype(dtype, fill_value, out_dtype): + data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) + + indexer = [2, 1, 0, -1] + + result = com.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all()) + assert((result[3, :, :] == fill_value).all()) + assert(result.dtype == out_dtype) + + result = com.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all()) + assert((result[:, 3, :] == fill_value).all()) + assert(result.dtype == out_dtype) + + result = com.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all()) + assert((result[:, :, 3] == fill_value).all()) + assert(result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + + result = com.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all()) + assert(result.dtype == dtype) + + result = com.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all()) + assert(result.dtype == dtype) + + result = com.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all()) + assert(result.dtype == dtype) + + _test_dtype(np.int8, np.int16(127), np.int8) + _test_dtype(np.int8, np.int16(128), np.int16) + _test_dtype(np.int32, 1, np.int32) + _test_dtype(np.int32, 2.0, np.float64) + _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) + _test_dtype(np.int32, True, np.object_) + _test_dtype(np.int32, '', np.object_) + _test_dtype(np.float64, 1, np.float64) + _test_dtype(np.float64, 2.0, np.float64) + _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) + _test_dtype(np.float64, True, np.object_) + _test_dtype(np.float64, '', np.object_) + _test_dtype(np.complex128, 1, np.complex128) + _test_dtype(np.complex128, 2.0, np.complex128) + _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) + _test_dtype(np.complex128, True, np.object_) + _test_dtype(np.complex128, '', np.object_) + _test_dtype(np.bool_, 1, np.object_) + _test_dtype(np.bool_, 2.0, np.object_) + _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) + _test_dtype(np.bool_, True, np.bool_) + _test_dtype(np.bool_, '', np.object_) def test_1d_other_dtypes(self): arr = np.random.randn(10).astype(np.float32) @@ -355,13 +591,13 @@ def test_2d_other_dtypes(self): indexer = [1, 2, 3, -1] # axis=0 - result = com.take_2d(arr, indexer, axis=0) + result = com.take_nd(arr, indexer, axis=0) expected = arr.take(indexer, axis=0) expected[-1] = np.nan tm.assert_almost_equal(result, expected) # axis=1 - result = com.take_2d(arr, indexer, axis=1) + result = com.take_nd(arr, indexer, axis=1) expected = arr.take(indexer, axis=1) expected[:, -1] = np.nan tm.assert_almost_equal(result, expected) @@ -381,15 +617,15 @@ def test_2d_bool(self): [1, 0, 1], [0, 1, 1]], dtype=bool) - result = com.take_2d(arr, [0, 2, 2, 1]) + result = com.take_nd(arr, [0, 2, 2, 1]) expected = arr.take([0, 2, 2, 1], axis=0) self.assert_(np.array_equal(result, expected)) - result = com.take_2d(arr, [0, 2, 2, 1], axis=1) + result = 
com.take_nd(arr, [0, 2, 2, 1], axis=1) expected = arr.take([0, 2, 2, 1], axis=1) self.assert_(np.array_equal(result, expected)) - result = com.take_2d(arr, [0, 2, -1]) + result = com.take_nd(arr, [0, 2, -1]) self.assert_(result.dtype == np.object_) def test_2d_float32(self): @@ -397,28 +633,76 @@ def test_2d_float32(self): indexer = [0, 2, -1, 1, -1] # axis=0 - result = com.take_2d(arr, indexer) + result = com.take_nd(arr, indexer, axis=0) result2 = np.empty_like(result) - com.take_2d(arr, indexer, out=result2) - tm.assert_almost_equal(result, result) + com.take_nd(arr, indexer, axis=0, out=result2) + tm.assert_almost_equal(result, result2) expected = arr.take(indexer, axis=0) - expected[[2, 4]] = np.nan + expected[[2, 4], :] = np.nan tm.assert_almost_equal(result, expected) #### this now accepts a float32! # test with float64 out buffer out = np.empty((len(indexer), arr.shape[1]), dtype='float32') - com.take_2d(arr, indexer, out=out) # it works! + com.take_nd(arr, indexer, out=out) # it works! # axis=1 - result = com.take_2d(arr, indexer, axis=1) + result = com.take_nd(arr, indexer, axis=1) result2 = np.empty_like(result) - com.take_2d(arr, indexer, axis=1, out=result2) - tm.assert_almost_equal(result, result) + com.take_nd(arr, indexer, axis=1, out=result2) + tm.assert_almost_equal(result, result2) expected = arr.take(indexer, axis=1) expected[:, [2, 4]] = np.nan tm.assert_almost_equal(result, expected) + + def test_2d_datetime64(self): + # 2005/01/01 - 2006/01/01 + arr = np.random.randint(11045376L, 11360736L, (5,3))*100000000000 + arr = arr.view(dtype='datetime64[ns]') + indexer = [0, 2, -1, 1, -1] + + # axis=0 + result = com.take_nd(arr, indexer, axis=0) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, axis=0, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=0) + expected.view(np.int64)[[2, 4], :] = iNaT + tm.assert_almost_equal(result, expected) + + result = com.take_nd(arr, indexer, axis=0, + fill_value=datetime(2007, 1, 1)) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, out=result2, axis=0, + fill_value=datetime(2007, 1, 1)) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=0) + expected[[2, 4], :] = datetime(2007, 1, 1) + tm.assert_almost_equal(result, expected) + + # axis=1 + result = com.take_nd(arr, indexer, axis=1) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, axis=1, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=1) + expected.view(np.int64)[:, [2, 4]] = iNaT + tm.assert_almost_equal(result, expected) + + result = com.take_nd(arr, indexer, axis=1, + fill_value=datetime(2007, 1, 1)) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, out=result2, axis=1, + fill_value=datetime(2007, 1, 1)) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=1) + expected[:, [2, 4]] = datetime(2007, 1, 1) + tm.assert_almost_equal(result, expected) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 03fdd53ce19af..5cad4a0518ce9 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -8033,10 +8033,6 @@ def test_boolean_set_uncons(self): self.frame[self.frame > 1] = 2 assert_almost_equal(expected, self.frame.values) - def test_boolean_set_mixed_type(self): - bools = self.mixed_frame.applymap(lambda x: x != 2).astype(bool) - self.assertRaises(Exception, 
self.mixed_frame.__setitem__, bools, 2)
-
     def test_xs_view(self):
         dm = DataFrame(np.arange(20.).reshape(4, 5), index=range(4),
                        columns=range(5))
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index 3adfb38e6144b..6d699967915ba 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -715,18 +715,9 @@ def _merge_blocks(self, merge_chunks):
         sofar = 0
         for unit, blk in merge_chunks:
             out_chunk = out[sofar: sofar + len(blk)]
-
-            if unit.indexer is None:
-                # is this really faster than assigning to arr.flat?
-                com.take_fast(blk.values, np.arange(n, dtype=np.int64),
-                              None, False,
-                              axis=self.axis, out=out_chunk)
-            else:
-                # write out the values to the result array
-                com.take_fast(blk.values, unit.indexer,
-                              None, False,
-                              axis=self.axis, out=out_chunk)
-
+            com.take_fast(blk.values, unit.indexer,
+                          None, False, axis=self.axis,
+                          out=out_chunk)
             sofar += len(blk)
 
         # does not sort
@@ -771,10 +762,7 @@ def reindex_block(self, block, axis, ref_items, copy=True):
             mask, need_masking = self.mask_info
 
         if self.indexer is None:
-            if copy:
-                result = block.copy()
-            else:
-                result = block
+            result = block.copy() if copy else block
         else:
             result = block.reindex_axis(self.indexer, mask, need_masking,
                                         axis=axis)
diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py
index ce341b2de8060..2bd69d79ac024 100644
--- a/vb_suite/frame_methods.py
+++ b/vb_suite/frame_methods.py
@@ -54,6 +54,19 @@
 frame_reindex_both_axes_ix = Benchmark('df.ix[idx, idx]', setup,
                                        start_date=datetime(2011, 1, 1))
 
+#----------------------------------------------------------------------
+# reindex with upcasts
+setup = common_setup + """
+df=DataFrame(dict([(c, {
+        0: randint(0, 2, 1000).astype(np.bool_),
+        1: randint(0, 1000, 1000).astype(np.int16),
+        2: randint(0, 1000, 1000).astype(np.int32),
+        3: randint(0, 1000, 1000).astype(np.int64)
+    }[randint(0, 4)]) for c in range(1000)]))
+"""
+
+frame_reindex_upcast = Benchmark('df.reindex(permutation(range(1200)))', setup)
+
 #----------------------------------------------------------------------
 # boolean indexing
 
@@ -71,6 +84,7 @@
 setup = common_setup + """
 df = DataFrame(randn(10000, 100))
 
+
 def f():
     if hasattr(df, '_item_cache'):
         df._item_cache.clear()
diff --git a/vb_suite/pandas_vb_common.py b/vb_suite/pandas_vb_common.py
index 58cd67227cf80..77d0e2e27260e 100644
--- a/vb_suite/pandas_vb_common.py
+++ b/vb_suite/pandas_vb_common.py
@@ -2,6 +2,8 @@
 from pandas.util.testing import rands
 from datetime import timedelta
 from numpy.random import randn
+from numpy.random import randint
+from numpy.random import permutation
 import pandas.util.testing as tm
 import random
 import numpy as np
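
Note on the generated join indexers earlier in this patch: each left_join_indexer_unique_<dtype> walks two monotonic arrays with a pair of cursors and records, for every element of left, the matching position in right (or -1), while the non-unique left_join_indexer_<dtype> and outer_join_indexer_<dtype> variants make two passes, one to count the output size and one to fill result/lindexer/rindexer. The following is only a rough pure-Python sketch of the single-pass unique case; the helper name is made up and this is not the generated Cython.

import numpy as np

def left_join_indexer_unique_sketch(left, right):
    # hypothetical illustration of the cursor walk:
    # left and right are monotonically increasing, right has no duplicates
    indexer = np.empty(len(left), dtype=np.int64)
    j = 0
    nright = len(right)
    for i, lval in enumerate(left):
        # advance the right-hand cursor past values that are too small
        while j < nright and right[j] < lval:
            j += 1
        # record the match, or -1 if left[i] has no counterpart in right
        indexer[i] = j if j < nright and right[j] == lval else -1
    return indexer

# left_join_indexer_unique_sketch(np.array([1, 2, 2, 4]), np.array([2, 3, 4]))
# -> array([-1, 0, 0, 2])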
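
Note on the new TestTake cases: they pin down the fill_value contract of com.take_1d / com.take_nd. A fill value the input dtype can hold keeps that dtype; anything else upcasts the result (int to float/complex/object, bool to object, datetime64 filled through its int64 view). The snippet below is a small usage sketch of that contract, assuming the pandas.core.common helpers from this branch; the values and expected dtypes mirror the assertions in the tests above.

import numpy as np
import pandas.core.common as com  # helpers from this branch

arr = np.array([10, 20, 30], dtype=np.int32)
indexer = [2, 0, -1]          # -1 marks a missing position

# a fill value the input dtype can hold keeps the dtype
res = com.take_1d(arr, indexer, fill_value=1)
# res.dtype == np.int32

# a fill value it cannot hold upcasts the result
res = com.take_1d(arr, indexer, fill_value=2.0)
# res.dtype == np.float64, res[2] == 2.0

# bool data with any non-bool fill goes to object
res = com.take_1d(np.array([True, False, True]), indexer, fill_value='')
# res.dtype == np.object_

# the same applies along any axis of an ndarray via take_nd
mat = np.arange(6, dtype=np.int32).reshape(3, 2)
res = com.take_nd(mat, indexer, axis=0, fill_value=2.0)
# res.dtype == np.float64 and the last row is all 2.0

# (the default fill is NaN, so integer input with a -1 in the indexer
#  is likewise expected to come back as float64)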