From 166a80dc5dfdb6aa41b2c1687069e5fd47e41cc6 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Sat, 19 Jan 2013 16:18:20 -0500
Subject: [PATCH] ENH: allow propgation and coexistance of numeric dtypes
 (closes GH #622)      construction of multi numeric dtypes with other types
 in a dict      validated get_numeric_data returns correct dtypes      added
 blocks attribute (and as_blocks()) method that returns a dict of dtype ->
 homogeneous Frame to DataFrame      added keyword 'raise_on_error' to astype,
 which can be set to false to exluded non-numeric columns      fixed merging
 to correctly merge on multiple dtypes with blocks (e.g. float64 and float32
 in other merger)      changed implementation of get_dtype_counts() to use
 .blocks      revised DataFrame.convert_objects to use blocks to be more
 efficient      added Dtype printing to show on default with a Series     
 added convert_dates='coerce' option to convert_objects, to force conversions
 to datetime64[ns]      where can upcast integer to float as needed (on
 inplace ops #2793)      added fully cythonized support for int8/int16      no
 support for float16 (it can exist, but no cython methods for it)

TST: fixed test in test_from_records_sequencelike (dict orders can be different on different arch!)
       NOTE: using tuples will remove dtype info from the input stream (using a record array is ok though!)
     test updates for merging (multi-dtypes)
     added tests for replace (but skipped for now, algos not set for float32/16)
     tests for astype and convert in internals
     fixes for test_excel on 32-bit
     fixed test_resample_median_bug_1688 I belive
     separated out test_from_records_dictlike
     testing of panel constructors (GH #797)
     where ops now have a full test suite
     allow slightly less sensitive decimal tests for less precise dtypes

BUG: fixed GH #2778, fillna on empty frame causes seg fault
     fixed bug in groupby where types were not being casted to original dtype
     respect the dtype of non-natural numeric (Decimal)
     don't upcast ints/bools to floats (if you say were agging on len, you can get an int)
DOC: added astype conversion examples to whatsnew and docs (dsintro)
     updated RELEASE notes
     whatsnew for 0.10.2
     added upcasting gotchas docs

CLN: updated convert_objects to be more consistent across frame/series
     moved most groupby functions out of algos.pyx to generated.pyx
     fully support cython functions for pad/bfill/take/diff/groupby for float32
     moved more block-like conversion loops from frame.py to internals.py (created apply method)
       (e.g. diff,fillna,where,shift,replace,interpolate,combining), to top-level methods in BlockManager
---
 RELEASE.rst                             |   39 +-
 doc/source/dsintro.rst                  |  114 +-
 doc/source/indexing.rst                 |   28 +
 doc/source/v0.10.2.txt                  |   95 +
 doc/source/whatsnew.rst                 |    2 +
 pandas/algos.pyx                        | 1106 +---
 pandas/core/common.py                   |  122 +-
 pandas/core/format.py                   |    9 +-
 pandas/core/frame.py                    |  175 +-
 pandas/core/generic.py                  |    8 +-
 pandas/core/groupby.py                  |  152 +-
 pandas/core/internals.py                |  435 +-
 pandas/core/series.py                   |   22 +-
 pandas/io/tests/test_excel.py           |    4 +-
 pandas/src/generate_code.py             | 1096 +++-
 pandas/src/generated.pyx                | 7900 ++++++++++++++++++-----
 pandas/src/inference.pyx                |   32 +-
 pandas/src/numpy.pxd                    |    2 +
 pandas/tests/test_common.py             |    4 +-
 pandas/tests/test_format.py             |   23 +-
 pandas/tests/test_frame.py              |  811 ++-
 pandas/tests/test_groupby.py            |   57 +-
 pandas/tests/test_internals.py          |  161 +-
 pandas/tests/test_multilevel.py         |   19 +-
 pandas/tests/test_ndframe.py            |    5 +-
 pandas/tests/test_panel.py              |   38 +-
 pandas/tests/test_panel4d.py            |    9 +-
 pandas/tests/test_series.py             |   60 +-
 pandas/tests/test_tseries.py            |   81 +-
 pandas/tools/merge.py                   |    6 +-
 pandas/tools/tests/test_merge.py        |   81 +-
 pandas/tseries/tests/test_resample.py   |   23 +-
 pandas/tseries/tests/test_timeseries.py |    3 +-
 pandas/tslib.pyx                        |   23 +-
 pandas/util/testing.py                  |   50 +-
 vb_suite/groupby.py                     |    9 +
 vb_suite/reindex.py                     |    8 +
 37 files changed, 9634 insertions(+), 3178 deletions(-)
 create mode 100644 doc/source/v0.10.2.txt

diff --git a/RELEASE.rst b/RELEASE.rst
index 981fa5bed257d..5db564176959e 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -22,6 +22,42 @@ Where to get it
 * Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 * Documentation: http://pandas.pydata.org
 
+pandas 0.10.2
+=============
+
+**Release date:** 2013-??-??
+
+**New features**
+
+  - Allow mixed dtypes (e.g ``float32/float64/int32/int16/int8``) to coexist in DataFrames and propogate in operations
+
+**Improvements to existing features**
+
+  - added ``blocks`` attribute to DataFrames, to return a dict of dtypes to homogeneously dtyped DataFrames
+  - added keyword ``convert_numeric`` to ``convert_objects()`` to try to convert object dtypes to numeric types
+  - ``convert_dates`` in ``convert_objects`` can now be ``coerce`` which will return a datetime64[ns] dtype
+    with non-convertibles set as ``NaT``; will preserve an all-nan object (e.g. strings)
+  - Series print output now includes the dtype by default
+
+**API Changes**
+
+  - Do not automatically upcast numeric specified dtypes to ``int64`` or ``float64`` (GH622_ and GH797_)
+  - Guarantee that ``convert_objects()`` for Series/DataFrame always returns a copy
+  - groupby operations will respect dtypes for numeric float operations (float32/float64); other types will be operated on,
+    and will try to cast back to the input dtype (e.g. if an int is passed, as long as the output doesn't have nans, 
+    then an int will be returned)
+  - backfill/pad/take/diff/ohlc will now support ``float32/int16/int8`` operations
+  - Integer block types will upcast as needed in where operations (GH2793_)
+
+**Bug Fixes**
+
+  - Fix seg fault on empty data frame when fillna with ``pad`` or ``backfill`` (GH2778_)
+
+.. _GH622: https://github.com/pydata/pandas/issues/622
+.. _GH797: https://github.com/pydata/pandas/issues/797
+.. _GH2778: https://github.com/pydata/pandas/issues/2778
+.. _GH2793: https://github.com/pydata/pandas/issues/2793
+
 pandas 0.10.1
 =============
 
@@ -36,6 +72,7 @@ pandas 0.10.1
   - Restored inplace=True behavior returning self (same object) with
     deprecation warning until 0.11 (GH1893_)
   - ``HDFStore``
+
     - refactored HFDStore to deal with non-table stores as objects, will allow future enhancements
     - removed keyword ``compression`` from ``put`` (replaced by keyword
       ``complib`` to be consistent across library)
@@ -49,7 +86,7 @@ pandas 0.10.1
     - support data column indexing and selection, via ``data_columns`` keyword in append
     - support write chunking to reduce memory footprint, via ``chunksize``
       keyword to append
-    - support automagic indexing via ``index`` keywork to append
+    - support automagic indexing via ``index`` keyword to append
     - support ``expectedrows`` keyword in append to inform ``PyTables`` about
       the expected tablesize
     - support ``start`` and ``stop`` keywords in select to limit the row
diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
index 362ef8ef7d7fb..6919f67db5b78 100644
--- a/doc/source/dsintro.rst
+++ b/doc/source/dsintro.rst
@@ -450,15 +450,101 @@ DataFrame:
    df.xs('b')
    df.ix[2]
 
-Note if a DataFrame contains columns of multiple dtypes, the dtype of the row
-will be chosen to accommodate all of the data types (dtype=object is the most
-general).
-
 For a more exhaustive treatment of more sophisticated label-based indexing and
 slicing, see the :ref:`section on indexing <indexing>`. We will address the
 fundamentals of reindexing / conforming to new sets of lables in the
 :ref:`section on reindexing <basics.reindexing>`.
 
+DataTypes
+~~~~~~~~~
+
+.. _dsintro.column_types:
+
+The main types stored in pandas objects are float, int, boolean, datetime64[ns],
+and object. A convenient ``dtypes`` attribute return a Series with the data type of
+each column.
+
+.. ipython:: python
+
+   df['integer'] = 1
+   df['int32']   = df['integer'].astype('int32')
+   df['float32'] = Series([1.0]*len(df),dtype='float32')
+   df['timestamp'] = Timestamp('20010102')
+   df.dtypes
+
+If a DataFrame contains columns of multiple dtypes, the dtype of the column
+will be chosen to accommodate all of the data types (dtype=object is the most
+general).
+
+The related method ``get_dtype_counts`` will return the number of columns of
+each type:
+
+.. ipython:: python
+
+   df.get_dtype_counts()
+
+Numeric dtypes will propgate and can coexist in DataFrames (starting in v0.10.2). 
+If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, 
+or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste.
+
+.. ipython:: python
+
+   df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32')
+   df1
+   df1.dtypes
+   df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), 
+                         B = Series(randn(8)), 
+                         C = Series(np.array(randn(8),dtype='uint8')) ))
+   df2
+   df2.dtypes
+
+   # here you get some upcasting
+   df3 = df1.reindex_like(df2).fillna(value=0.0) + df2
+   df3
+   df3.dtypes
+
+   # this is lower-common-denomicator upcasting (meaning you get the dtype which can accomodate all of the types)
+   df3.values.dtype
+
+Upcasting is always according to the **numpy** rules. If two different dtypes are involved in an operation, then the more *general* one will be used as the result of the operation.
+
+DataType Conversion
+~~~~~~~~~~~~~~~~~~~
+
+You can use the ``astype`` method to convert dtypes from one to another. These *always* return a copy. 
+In addition, ``convert_objects`` will attempt to *soft* conversion of any *object* dtypes, meaning that if all the objects in a Series are of the same type, the Series
+will have that dtype.
+
+.. ipython:: python
+
+   df3
+   df3.dtypes
+
+   # conversion of dtypes
+   df3.astype('float32').dtypes
+
+To force conversion of specific types of number conversion, pass ``convert_numeric = True``. 
+This will force strings and numbers alike to be numbers if possible, otherwise the will be set to ``np.nan``.
+To force conversion to ``datetime64[ns]``, pass ``convert_dates = 'coerce'``. 
+This will convert any datetimelike object to dates, forcing other values to ``NaT``.
+
+.. ipython:: python
+
+   # mixed type conversions
+   df3['D'] = '1.'
+   df3['E'] = '1'
+   df3.convert_objects(convert_numeric=True).dtypes
+
+   # same, but specific dtype conversion
+   df3['D'] = df3['D'].astype('float16')
+   df3['E'] = df3['E'].astype('int32')
+   df3.dtypes
+
+   # forcing date coercion
+   s = Series([datetime(2001,1,1,0,0), 'foo', 1.0, 1, Timestamp('20010104'), '20010105'],dtype='O')
+   s
+   s.convert_objects(convert_dates='coerce')
+
 Data alignment and arithmetic
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -633,26 +719,6 @@ You can also disable this feature via the ``expand_frame_repr`` option:
    reset_option('expand_frame_repr')
 
 
-DataFrame column types
-~~~~~~~~~~~~~~~~~~~~~~
-
-.. _dsintro.column_types:
-
-The four main types stored in pandas objects are float, int, boolean, and
-object. A convenient ``dtypes`` attribute return a Series with the data type of
-each column:
-
-.. ipython:: python
-
-   baseball.dtypes
-
-The related method ``get_dtype_counts`` will return the number of columns of
-each type:
-
-.. ipython:: python
-
-   baseball.get_dtype_counts()
-
 DataFrame column attribute access and IPython completion
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index 33c5db2d24102..969173d0d3569 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -304,6 +304,34 @@ so that the original data can be modified without creating a copy:
 
    df.mask(df >= 0)
 
+Upcasting Gotchas
+~~~~~~~~~~~~~~~~~
+
+Performing indexing operations on ``integer`` type data can easily upcast the data to ``floating``.
+The dtype of the input data will be preserved in cases where ``nans`` are not introduced (coming soon).
+
+.. ipython:: python
+
+   dfi = df.astype('int32')
+   dfi['E'] = 1
+   dfi
+   dfi.dtypes
+
+   casted = dfi[dfi>0]
+   casted
+   casted.dtypes
+
+While float dtypes are unchanged.
+
+.. ipython:: python
+
+   df2 = df.copy()
+   df2['A'] = df2['A'].astype('float32')
+   df2.dtypes
+
+   casted = df2[df2>0]
+   casted
+   casted.dtypes
 
 Take Methods
 ~~~~~~~~~~~~
diff --git a/doc/source/v0.10.2.txt b/doc/source/v0.10.2.txt
new file mode 100644
index 0000000000000..d87cf86d56864
--- /dev/null
+++ b/doc/source/v0.10.2.txt
@@ -0,0 +1,95 @@
+.. _whatsnew_0102:
+
+v0.10.2 (February ??, 2013)
+---------------------------
+
+This is a minor release from 0.10.1 and includes many new features and
+enhancements along with a large number of bug fixes. There are also a number of
+important API changes that long-time pandas users should pay close attention
+to.
+
+API changes
+~~~~~~~~~~~
+
+Numeric dtypes will propgate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste.
+
+**Dtype Specification**
+
+.. ipython:: python
+
+   df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32')
+   df1
+   df1.dtypes
+   df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), B = Series(randn(8)), C = Series(randn(8),dtype='uint8') ))
+   df2
+   df2.dtypes
+
+   # here you get some upcasting
+   df3 = df1.reindex_like(df2).fillna(value=0.0) + df2
+   df3
+   df3.dtypes
+
+**Dtype conversion**
+
+.. ipython:: python
+
+   # this is lower-common-denomicator upcasting (meaning you get the dtype which can accomodate all of the types)
+   df3.values.dtype
+
+   # conversion of dtypes
+   df3.astype('float32').dtypes
+
+   # mixed type conversions
+   df3['D'] = '1.'
+   df3['E'] = '1'
+   df3.convert_objects(convert_numeric=True).dtypes
+
+   # same, but specific dtype conversion
+   df3['D'] = df3['D'].astype('float16')
+   df3['E'] = df3['E'].astype('int32')
+   df3.dtypes
+
+   # forcing date coercion
+   s = Series([datetime(2001,1,1,0,0), 'foo', 1.0, 1, 
+               Timestamp('20010104'), '20010105'],dtype='O')
+   s.convert_objects(convert_dates='coerce')
+
+**Upcasting Gotchas**
+
+Performing indexing operations on integer type data can easily upcast the data.
+The dtype of the input data will be preserved in cases where ``nans`` are not introduced (coming soon).
+
+.. ipython:: python
+
+   dfi = df3.astype('int32')
+   dfi['D'] = dfi['D'].astype('int64')
+   dfi
+   dfi.dtypes
+
+   casted = dfi[dfi>0]
+   casted
+   casted.dtypes
+
+While float dtypes are unchanged.
+
+.. ipython:: python
+
+   df4 = df3.copy()
+   df4['A'] = df4['A'].astype('float32')
+   df4.dtypes
+
+   casted = df4[df4>0]
+   casted
+   casted.dtypes
+
+New features
+~~~~~~~~~~~~
+
+**Enhancements**
+
+**Bug Fixes**
+
+See the `full release notes
+<https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker
+on GitHub for a complete list.
+
diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst
index 6c125c45a2599..646610ecccd88 100644
--- a/doc/source/whatsnew.rst
+++ b/doc/source/whatsnew.rst
@@ -16,6 +16,8 @@ What's New
 
 These are new features and improvements of note in each release.
 
+.. include:: v0.10.2.txt
+
 .. include:: v0.10.1.txt
 
 .. include:: v0.10.0.txt
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index 0d7006f08111b..40c8cabe3cb9a 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -12,23 +12,35 @@ cimport util
 
 from libc.stdlib cimport malloc, free
 
+from numpy cimport NPY_INT8 as NPY_int8
+from numpy cimport NPY_INT16 as NPY_int16
 from numpy cimport NPY_INT32 as NPY_int32
 from numpy cimport NPY_INT64 as NPY_int64
+from numpy cimport NPY_FLOAT16 as NPY_float16
 from numpy cimport NPY_FLOAT32 as NPY_float32
 from numpy cimport NPY_FLOAT64 as NPY_float64
 
+int8 = np.dtype(np.int8)
+int16 = np.dtype(np.int16)
 int32 = np.dtype(np.int32)
 int64 = np.dtype(np.int64)
+float16 = np.dtype(np.float16)
 float32 = np.dtype(np.float32)
 float64 = np.dtype(np.float64)
 
+cdef np.int8_t MINint8 = np.iinfo(np.int8).min
+cdef np.int16_t MINint16 = np.iinfo(np.int16).min
 cdef np.int32_t MINint32 = np.iinfo(np.int32).min
 cdef np.int64_t MINint64 = np.iinfo(np.int64).min
+cdef np.float16_t MINfloat16 = np.NINF
 cdef np.float32_t MINfloat32 = np.NINF
 cdef np.float64_t MINfloat64 = np.NINF
 
+cdef np.int8_t MAXint8 = np.iinfo(np.int8).max
+cdef np.int16_t MAXint16 = np.iinfo(np.int16).max
 cdef np.int32_t MAXint32 = np.iinfo(np.int32).max
 cdef np.int64_t MAXint64 = np.iinfo(np.int64).max
+cdef np.float16_t MAXfloat16 = np.inf
 cdef np.float32_t MAXfloat32 = np.inf
 cdef np.float64_t MAXfloat64 = np.inf
 
@@ -615,141 +627,6 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
 #     return result
 
 
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def diff_2d_float64(ndarray[float64_t, ndim=2] arr,
-                    ndarray[float64_t, ndim=2] out,
-                    Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def diff_2d_int64(ndarray[int64_t, ndim=2] arr,
-                  ndarray[float64_t, ndim=2] out,
-                  Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def diff_2d_int32(ndarray[int64_t, ndim=2] arr,
-                  ndarray[float64_t, ndim=2] out,
-                  Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-
 # Cython implementations of rolling sum, mean, variance, skewness,
 # other statistical moment functions
 #
@@ -1931,161 +1808,9 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
     return result, counts
 
 # TODO: aggregate multiple columns in single pass
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_add(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        ndarray[float64_t, ndim=2] sumx, nobs
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    if K > 1:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    sumx[lab, j] += val
-    else:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[lab, 0] += 1
-                sumx[lab, 0] += val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = sumx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_prod(ndarray[float64_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float64_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        ndarray[float64_t, ndim=2] prodx, nobs
-
-    nobs = np.zeros_like(out)
-    prodx = np.ones_like(out)
-
-    N, K = (<object> values).shape
-
-    if K > 1:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    prodx[lab, j] *= val
-    else:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[lab, 0] += 1
-                prodx[lab, 0] *= val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = prodx[i, j]
-
 #----------------------------------------------------------------------
 # first, nth, last
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels, int64_t rank):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        ndarray[float64_t, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                if nobs[lab, j] == rank:
-                    resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_nth_object(ndarray[object, ndim=2] out,
@@ -2130,52 +1855,6 @@ def group_nth_object(ndarray[object, ndim=2] out,
             else:
                 out[i, j] = resx[i, j]
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth_bin(ndarray[float64_t, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[float64_t, ndim=2] values,
-                  ndarray[int64_t] bins, int64_t rank):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, count
-        ndarray[float64_t, ndim=2] resx, nobs
-
-    nobs = np.zeros_like(out)
-    resx = np.empty_like(out)
-
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[b, j] += 1
-                if nobs[b, j] == rank:
-                    resx[b, j] = val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_nth_bin_object(ndarray[object, ndim=2] out,
@@ -2224,47 +1903,6 @@ def group_nth_bin_object(ndarray[object, ndim=2] out,
             else:
                 out[i, j] = resx[i, j]
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last(ndarray[float64_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float64_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        ndarray[float64_t, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_last_object(ndarray[object, ndim=2] out,
@@ -2307,52 +1945,6 @@ def group_last_object(ndarray[object, ndim=2] out,
             else:
                 out[i, j] = resx[i, j]
 
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last_bin(ndarray[float64_t, ndim=2] out,
-                   ndarray[int64_t] counts,
-                   ndarray[float64_t, ndim=2] values,
-                   ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, count
-        ndarray[float64_t, ndim=2] resx, nobs
-
-    nobs = np.zeros_like(out)
-    resx = np.empty_like(out)
-
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[b, j] += 1
-                resx[b, j] = val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_last_bin_object(ndarray[object, ndim=2] out,
@@ -2400,183 +1992,15 @@ def group_last_bin_object(ndarray[object, ndim=2] out,
             else:
                 out[i, j] = resx[i, j]
 
-#----------------------------------------------------------------------
-# group_min, group_max
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_min(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        ndarray[float64_t, ndim=2] minx, nobs
-
-    nobs = np.zeros_like(out)
-
-    minx = np.empty_like(out)
-    minx.fill(np.inf)
-
-    N, K = (<object> values).shape
-
-    if K > 1:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    if val < minx[lab, j]:
-                        minx[lab, j] = val
-    else:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[lab, 0] += 1
-                if val < minx[lab, 0]:
-                    minx[lab, 0] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = minx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_max(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        ndarray[float64_t, ndim=2] maxx, nobs
-
-    nobs = np.zeros_like(out)
-
-    maxx = np.empty_like(out)
-    maxx.fill(-np.inf)
-
-    N, K = (<object> values).shape
-
-    if K > 1:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    if val > maxx[lab, j]:
-                        maxx[lab, j] = val
-    else:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[lab, 0] += 1
-                if val > maxx[lab, 0]:
-                    maxx[lab, 0] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = maxx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_mean(ndarray[float64_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float64_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        ndarray[float64_t, ndim=2] sumx, nobs
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    if K > 1:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    sumx[lab, j] += val
-    else:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-            # not nan
-            if val == val:
-                nobs[lab, 0] += 1
-                sumx[lab, 0] += val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            count = nobs[i, j]
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = sumx[i, j] / count
-
-
-def group_median(ndarray[float64_t, ndim=2] out,
-                 ndarray[int64_t] counts,
-                 ndarray[float64_t, ndim=2] values,
-                 ndarray[int64_t] labels):
+#----------------------------------------------------------------------
+# median
+
+def group_median(ndarray[float64_t, ndim=2] out,
+                 ndarray[int64_t] counts,
+                 ndarray[float64_t, ndim=2] values,
+                 ndarray[int64_t] labels):
     '''
     Only aggregates on axis=0
     '''
@@ -2642,497 +2066,5 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
 
     return result
 
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_var(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, ct
-        ndarray[float64_t, ndim=2] nobs, sumx, sumxx
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-    sumxx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    if K > 1:
-        for i in range(N):
-
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    sumx[lab, j] += val
-                    sumxx[lab, j] += val * val
-    else:
-        for i in range(N):
-
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-            # not nan
-            if val == val:
-                nobs[lab, 0] += 1
-                sumx[lab, 0] += val
-                sumxx[lab, 0] += val * val
-
-
-    for i in range(len(counts)):
-        for j in range(K):
-            ct = nobs[i, j]
-            if ct < 2:
-                out[i, j] = nan
-            else:
-                out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
-                             (ct * ct - ct))
-# add passing bin edges, instead of labels
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_add_bin(ndarray[float64_t, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[float64_t, ndim=2] values,
-                  ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b, nbins
-        float64_t val, count
-        ndarray[float64_t, ndim=2] sumx, nobs
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-    N, K = (<object> values).shape
-
-    b = 0
-    if K > 1:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[b, j] += 1
-                    sumx[b, j] += val
-    else:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[b, 0] += 1
-                sumx[b, 0] += val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = sumx[i, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_prod_bin(ndarray[float64_t, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[float64_t, ndim=2] values,
-                  ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, count
-        ndarray[float64_t, ndim=2] prodx, nobs
-
-    nobs = np.zeros_like(out)
-    prodx = np.ones_like(out)
-
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-    N, K = (<object> values).shape
-
-    b = 0
-    if K > 1:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[b, j] += 1
-                    prodx[b, j] *= val
-    else:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[b, 0] += 1
-                prodx[b, 0] *= val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = prodx[i, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_min_bin(ndarray[float64_t, ndim=2] out,
-                   ndarray[int64_t] counts,
-                   ndarray[float64_t, ndim=2] values,
-                   ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, count
-        ndarray[float64_t, ndim=2] minx, nobs
-
-    nobs = np.zeros_like(out)
-
-    minx = np.empty_like(out)
-    minx.fill(np.inf)
-
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    if K > 1:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[b, j] += 1
-                    if val < minx[b, j]:
-                        minx[b, j] = val
-    else:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[b, 0] += 1
-                if val < minx[b, 0]:
-                    minx[b, 0] = val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = minx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_max_bin(ndarray[float64_t, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[float64_t, ndim=2] values,
-                  ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, count
-        ndarray[float64_t, ndim=2] maxx, nobs
-
-    nobs = np.zeros_like(out)
-    maxx = np.empty_like(out)
-    maxx.fill(-np.inf)
-
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    if K > 1:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[b, j] += 1
-                    if val > maxx[b, j]:
-                        maxx[b, j] = val
-    else:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[b, 0] += 1
-                if val > maxx[b, 0]:
-                    maxx[b, 0] = val
-
-    for i in range(ngroups):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = maxx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_ohlc(ndarray[float64_t, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[float64_t, ndim=2] values,
-                  ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, count
-        float64_t vopen, vhigh, vlow, vclose, NA
-        bint got_first = 0
-
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    if out.shape[1] != 4:
-        raise ValueError('Output array must have 4 columns')
-
-    NA = np.nan
-
-    b = 0
-    if K > 1:
-        raise NotImplementedError
-    else:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                if not got_first:
-                    out[b, 0] = NA
-                    out[b, 1] = NA
-                    out[b, 2] = NA
-                    out[b, 3] = NA
-                else:
-                    out[b, 0] = vopen
-                    out[b, 1] = vhigh
-                    out[b, 2] = vlow
-                    out[b, 3] = vclose
-                b += 1
-                got_first = 0
-
-            counts[b] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                if not got_first:
-                    got_first = 1
-                    vopen = val
-                    vlow = val
-                    vhigh = val
-                else:
-                    if val < vlow:
-                        vlow = val
-                    if val > vhigh:
-                        vhigh = val
-                vclose = val
-
-        if not got_first:
-            out[b, 0] = NA
-            out[b, 1] = NA
-            out[b, 2] = NA
-            out[b, 3] = NA
-        else:
-            out[b, 0] = vopen
-            out[b, 1] = vhigh
-            out[b, 2] = vlow
-            out[b, 3] = vclose
-
-
-# @cython.boundscheck(False)
-# @cython.wraparound(False)
-def group_mean_bin(ndarray[float64_t, ndim=2] out,
-                   ndarray[int64_t] counts,
-                   ndarray[float64_t, ndim=2] values,
-                   ndarray[int64_t] bins):
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, count
-        ndarray[float64_t, ndim=2] sumx, nobs
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    b = 0
-    if K > 1:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[b, j] += 1
-                    sumx[b, j] += val
-    else:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[b, 0] += 1
-                sumx[b, 0] += val
-
-    for i in range(ngroups):
-        for j in range(K):
-            count = nobs[i, j]
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = sumx[i, j] / count
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_var_bin(ndarray[float64_t, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[float64_t, ndim=2] values,
-                  ndarray[int64_t] bins):
-
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, b
-        float64_t val, ct
-        ndarray[float64_t, ndim=2] nobs, sumx, sumxx
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-    sumxx = np.zeros_like(out)
-
-    if bins[len(bins) - 1] == len(values):
-        ngroups = len(bins)
-    else:
-        ngroups = len(bins) + 1
-
-    N, K = (<object> values).shape
-
-    b = 0
-    if K > 1:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[b, j] += 1
-                    sumx[b, j] += val
-                    sumxx[b, j] += val * val
-    else:
-        for i in range(N):
-            while b < ngroups - 1 and i >= bins[b]:
-                b += 1
-
-            counts[b] += 1
-            val = values[i, 0]
-
-            # not nan
-            if val == val:
-                nobs[b, 0] += 1
-                sumx[b, 0] += val
-                sumxx[b, 0] += val * val
-
-    for i in range(ngroups):
-        for j in range(K):
-            ct = nobs[i, j]
-            if ct < 2:
-                out[i, j] = nan
-            else:
-                out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
-                             (ct * ct - ct))
-
 include "join.pyx"
 include "generated.pyx"
diff --git a/pandas/core/common.py b/pandas/core/common.py
index b3d996ffd0606..c99fd87f7a643 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -256,6 +256,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan):
 
 _take1d_dict = {
     'float64': algos.take_1d_float64,
+    'float32': algos.take_1d_float32,
+    'int8': algos.take_1d_int8,
+    'int16': algos.take_1d_int16,
     'int32': algos.take_1d_int32,
     'int64': algos.take_1d_int64,
     'object': algos.take_1d_object,
@@ -266,6 +269,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan):
 
 _take2d_axis0_dict = {
     'float64': algos.take_2d_axis0_float64,
+    'float32': algos.take_2d_axis0_float32,
+    'int8': algos.take_2d_axis0_int8,
+    'int16': algos.take_2d_axis0_int16,
     'int32': algos.take_2d_axis0_int32,
     'int64': algos.take_2d_axis0_int64,
     'object': algos.take_2d_axis0_object,
@@ -276,6 +282,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan):
 
 _take2d_axis1_dict = {
     'float64': algos.take_2d_axis1_float64,
+    'float32': algos.take_2d_axis1_float32,
+    'int8': algos.take_2d_axis1_int8,
+    'int16': algos.take_2d_axis1_int16,
     'int32': algos.take_2d_axis1_int32,
     'int64': algos.take_2d_axis1_int64,
     'object': algos.take_2d_axis1_object,
@@ -286,6 +295,9 @@ def wrapper(arr, indexer, out, fill_value=np.nan):
 
 _take2d_multi_dict = {
     'float64': algos.take_2d_multi_float64,
+    'float32': algos.take_2d_multi_float32,
+    'int8': algos.take_2d_multi_int8,
+    'int16': algos.take_2d_multi_int16,
     'int32': algos.take_2d_multi_int32,
     'int64': algos.take_2d_multi_int64,
     'object': algos.take_2d_multi_object,
@@ -294,6 +306,8 @@ def wrapper(arr, indexer, out, fill_value=np.nan):
                                     na_override=tslib.iNaT),
 }
 
+_dtypes_no_na = set(['int8','int16','int32', 'int64', 'bool'])
+_dtypes_na    = set(['float32', 'float64', 'object', 'datetime64[ns]'])
 
 def _get_take2d_function(dtype_str, axis=0):
     if axis == 0:
@@ -319,7 +333,7 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan):
     out_passed = out is not None
     take_f = _take1d_dict.get(dtype_str)
 
-    if dtype_str in ('int32', 'int64', 'bool'):
+    if dtype_str in _dtypes_no_na:
         try:
             if out is None:
                 out = np.empty(n, dtype=arr.dtype)
@@ -337,7 +351,7 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan):
                                     out.dtype)
                 out = _maybe_upcast(out)
                 np.putmask(out, mask, fill_value)
-    elif dtype_str in ('float64', 'object', 'datetime64[ns]'):
+    elif dtype_str in _dtypes_na:
         if out is None:
             out = np.empty(n, dtype=arr.dtype)
         take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value)
@@ -360,7 +374,7 @@ def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None):
 
     out_shape = len(row_idx), len(col_idx)
 
-    if dtype_str in ('int32', 'int64', 'bool'):
+    if dtype_str in _dtypes_no_na:
         row_mask = row_idx == -1
         col_mask = col_idx == -1
         needs_masking = row_mask.any() or col_mask.any()
@@ -376,7 +390,7 @@ def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None):
                    _ensure_int64(col_idx), out=out,
                    fill_value=fill_value)
             return out
-    elif dtype_str in ('float64', 'object', 'datetime64[ns]'):
+    elif dtype_str in _dtypes_na:
         if out is None:
             out = np.empty(out_shape, dtype=arr.dtype)
         take_f = _get_take2d_function(dtype_str, axis='multi')
@@ -405,7 +419,7 @@ def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0,
     if not isinstance(indexer, np.ndarray):
         indexer = np.array(indexer, dtype=np.int64)
 
-    if dtype_str in ('int32', 'int64', 'bool'):
+    if dtype_str in _dtypes_no_na:
         if mask is None:
             mask = indexer == -1
             needs_masking = mask.any()
@@ -423,7 +437,7 @@ def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0,
             take_f = _get_take2d_function(dtype_str, axis=axis)
             take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value)
             return out
-    elif dtype_str in ('float64', 'object', 'datetime64[ns]'):
+    elif dtype_str in _dtypes_na:
         if out is None:
             out = np.empty(out_shape, dtype=arr.dtype)
         take_f = _get_take2d_function(dtype_str, axis=axis)
@@ -457,8 +471,11 @@ def mask_out_axis(arr, mask, axis, fill_value=np.nan):
 
 _diff_special = {
     'float64': algos.diff_2d_float64,
+    'float32': algos.diff_2d_float32,
     'int64': algos.diff_2d_int64,
-    'int32': algos.diff_2d_int32
+    'int32': algos.diff_2d_int32,
+    'int16': algos.diff_2d_int16,
+    'int8': algos.diff_2d_int8,
 }
 
 
@@ -548,14 +565,18 @@ def wrapper(arr, mask, limit=None):
 
 
 def pad_1d(values, limit=None, mask=None):
+
+    dtype   = values.dtype.name
+    _method = None
     if is_float_dtype(values):
-        _method = algos.pad_inplace_float64
+        _method = getattr(algos,'pad_inplace_%s' % dtype,None)
     elif is_datetime64_dtype(values):
         _method = _pad_1d_datetime
     elif values.dtype == np.object_:
         _method = algos.pad_inplace_object
-    else:  # pragma: no cover
-        raise ValueError('Invalid dtype for padding')
+
+    if _method is None:
+        raise ValueError('Invalid dtype for pad_1d [%s]' % dtype)
 
     if mask is None:
         mask = isnull(values)
@@ -564,14 +585,18 @@ def pad_1d(values, limit=None, mask=None):
 
 
 def backfill_1d(values, limit=None, mask=None):
+
+    dtype   = values.dtype.name
+    _method = None
     if is_float_dtype(values):
-        _method = algos.backfill_inplace_float64
+        _method = getattr(algos,'backfill_inplace_%s' % dtype,None)
     elif is_datetime64_dtype(values):
         _method = _backfill_1d_datetime
     elif values.dtype == np.object_:
         _method = algos.backfill_inplace_object
-    else:  # pragma: no cover
-        raise ValueError('Invalid dtype for padding')
+
+    if _method is None:
+        raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype)
 
     if mask is None:
         mask = isnull(values)
@@ -581,14 +606,18 @@ def backfill_1d(values, limit=None, mask=None):
 
 
 def pad_2d(values, limit=None, mask=None):
+
+    dtype   = values.dtype.name
+    _method = None
     if is_float_dtype(values):
-        _method = algos.pad_2d_inplace_float64
+        _method = getattr(algos,'pad_2d_inplace_%s' % dtype,None)
     elif is_datetime64_dtype(values):
         _method = _pad_2d_datetime
     elif values.dtype == np.object_:
         _method = algos.pad_2d_inplace_object
-    else:  # pragma: no cover
-        raise ValueError('Invalid dtype for padding')
+
+    if _method is None:
+        raise ValueError('Invalid dtype for pad_2d [%s]' % dtype)
 
     if mask is None:
         mask = isnull(values)
@@ -602,14 +631,18 @@ def pad_2d(values, limit=None, mask=None):
 
 
 def backfill_2d(values, limit=None, mask=None):
+
+    dtype   = values.dtype.name
+    _method = None
     if is_float_dtype(values):
-        _method = algos.backfill_2d_inplace_float64
+        _method = getattr(algos,'backfill_2d_inplace_%s' % dtype,None)
     elif is_datetime64_dtype(values):
         _method = _backfill_2d_datetime
     elif values.dtype == np.object_:
         _method = algos.backfill_2d_inplace_object
-    else:  # pragma: no cover
-        raise ValueError('Invalid dtype for padding')
+
+    if _method is None:
+        raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype)
 
     if mask is None:
         mask = isnull(values)
@@ -633,10 +666,43 @@ def _consensus_name_attr(objs):
 # Lots of little utilities
 
 
-def _possibly_cast_to_datetime(value, dtype):
+def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True):
+    """ if we have an object dtype, try to coerce dates and/or numers """
+
+    if values.dtype == np.object_ and convert_dates:
+
+        # we take an aggressive stance and convert to datetime64[ns]
+        if convert_dates == 'coerce':
+            new_values = _possibly_cast_to_datetime(values, 'M8[ns]', coerce = True)
+
+            # if we are all nans then leave me alone
+            if not isnull(new_values).all():
+                values = new_values
+
+        else:
+            values = lib.maybe_convert_objects(values, convert_datetime=convert_dates)
+
+    if values.dtype == np.object_ and convert_numeric:
+        try:
+            new_values = lib.maybe_convert_numeric(values,set(),coerce_numeric=True)
+            
+            # if we are all nans then leave me alone
+            if not isnull(new_values).all():
+                values = new_values
+
+        except:
+            pass
+
+    return values
+
+
+def _possibly_cast_to_datetime(value, dtype, coerce = False):
     """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """
 
-    if dtype == 'M8[ns]':
+    if isinstance(dtype, basestring):
+        dtype = np.dtype(dtype)
+
+    if dtype is not None and is_datetime64_dtype(dtype):
         if np.isscalar(value):
             if value == tslib.iNaT or isnull(value):
                 value = tslib.iNaT
@@ -650,7 +716,7 @@ def _possibly_cast_to_datetime(value, dtype):
             # we have an array of datetime & nulls
             elif np.prod(value.shape):
                 try:
-                    value = tslib.array_to_datetime(value)
+                    value = tslib.array_to_datetime(value, coerce = coerce)
                 except:
                     pass
 
@@ -1001,6 +1067,8 @@ def _is_int_or_datetime_dtype(arr_or_dtype):
 def is_datetime64_dtype(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         tipo = arr_or_dtype.type
+    elif isinstance(arr_or_dtype, type):
+        tipo = np.dtype(arr_or_dtype).type
     else:
         tipo = arr_or_dtype.dtype.type
     return issubclass(tipo, np.datetime64)
@@ -1026,13 +1094,17 @@ def _is_sequence(x):
         return False
 
 _ensure_float64 = algos.ensure_float64
+_ensure_float32 = algos.ensure_float32
 _ensure_int64 = algos.ensure_int64
 _ensure_int32 = algos.ensure_int32
+_ensure_int16 = algos.ensure_int16
+_ensure_int8 = algos.ensure_int8
 _ensure_platform_int = algos.ensure_platform_int
 _ensure_object = algos.ensure_object
 
 
-def _astype_nansafe(arr, dtype):
+def _astype_nansafe(arr, dtype, copy = True):
+    """ return a view if copy is False """
     if not isinstance(dtype, np.dtype):
         dtype = np.dtype(dtype)
 
@@ -1048,7 +1120,9 @@ def _astype_nansafe(arr, dtype):
         # work around NumPy brokenness, #1987
         return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
 
-    return arr.astype(dtype)
+    if copy:
+        return arr.astype(dtype)
+    return arr.view(dtype)
 
 
 def _clean_fill_method(method):
diff --git a/pandas/core/format.py b/pandas/core/format.py
index 7fc9fbccced04..88b729349ca60 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -66,10 +66,11 @@
 class SeriesFormatter(object):
 
     def __init__(self, series, buf=None, header=True, length=True,
-                 na_rep='NaN', name=False, float_format=None):
+                 na_rep='NaN', name=False, float_format=None, dtype=True):
         self.series = series
         self.buf = buf if buf is not None else StringIO(u"")
         self.name = name
+        self.dtype = dtype
         self.na_rep = na_rep
         self.length = length
         self.header = header
@@ -98,6 +99,12 @@ def _get_footer(self):
                 footer += ', '
             footer += 'Length: %d' % len(self.series)
 
+        if self.dtype:
+            if getattr(self.series.dtype,'name',None):
+                if footer:
+                    footer += ', '
+                footer += 'Dtype: %s' % com.pprint_thing(self.series.dtype.name)
+
         return unicode(footer)
 
     def _get_formatted_index(self):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 301ea9d28d001..efb478df014ae 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -312,7 +312,10 @@ def f(self, other):
         elif isinstance(other, Series):
             return self._combine_series_infer(other, func)
         else:
-            return self._combine_const(other, func)
+
+            # straight boolean comparisions we want to allow all columns 
+            # (regardless of dtype to pass thru)
+            return self._combine_const(other, func, raise_on_error = False).fillna(True).astype(bool)
 
     f.__name__ = name
 
@@ -327,6 +330,7 @@ class DataFrame(NDFrame):
     _auto_consolidate = True
     _verbose_info = True
     _het_axis = 1
+    _info_axis = 'columns'
     _col_klass = Series
 
     _AXIS_NUMBERS = {
@@ -1004,6 +1008,12 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
                         arr_columns.append(k)
                         arrays.append(v)
 
+                # reorder according to the columns
+                if len(columns) and len(arr_columns):
+                    indexer     = _ensure_index(arr_columns).get_indexer(columns)
+                    arr_columns = _ensure_index([ arr_columns[i] for i in indexer ])
+                    arrays      = [ arrays[i] for i in indexer ]
+
         elif isinstance(data, (np.ndarray, DataFrame)):
             arrays, columns = _to_arrays(data, columns)
             if columns is not None:
@@ -1650,38 +1660,25 @@ def info(self, verbose=True, buf=None, max_cols=None):
     def dtypes(self):
         return self.apply(lambda x: x.dtype)
 
-    def convert_objects(self, convert_dates=True):
+    def convert_objects(self, convert_dates=True, convert_numeric=True):
         """
         Attempt to infer better dtype for object columns
+        Always returns a copy (even if no object columns)
+
+        Parameters
+        ----------
+        convert_dates : if True, attempt to soft convert_dates, if 'coerce', force conversion (and non-convertibles get NaT)
+        convert_numeric : if True attempt to coerce to numerbers (including strings), non-convertibles get NaN
 
         Returns
         -------
         converted : DataFrame
         """
-        new_data = {}
-        convert_f = lambda x: lib.maybe_convert_objects(
-            x, convert_datetime=convert_dates)
-
-        # TODO: could be more efficient taking advantage of the block
-        for col, s in self.iteritems():
-            if s.dtype == np.object_:
-                new_data[col] = convert_f(s)
-            else:
-                new_data[col] = s
-
-        return DataFrame(new_data, index=self.index, columns=self.columns)
+        return self._constructor(self._data.convert(convert_dates=convert_dates, convert_numeric=convert_numeric))
 
     def get_dtype_counts(self):
-        counts = {}
-        for i in range(len(self.columns)):
-            series = self.icol(i)
-            # endianness can cause dtypes to look different
-            dtype_str = str(series.dtype)
-            if dtype_str in counts:
-                counts[dtype_str] += 1
-            else:
-                counts[dtype_str] = 1
-        return Series(counts)
+        """ return the counts of dtypes in this frame """
+        return Series(dict([ (dtype, len(df.columns)) for dtype, df in self.blocks.iteritems() ]))
 
     #----------------------------------------------------------------------
     # properties for index and columns
@@ -1695,6 +1692,14 @@ def as_matrix(self, columns=None):
         are presented in sorted order unless a specific list of columns is
         provided.
 
+        NOTE: the dtype will be a lower-common-denominator dtype (implicit upcasting)
+              that is to say if the dtypes (even of numeric types) are mixed, the one that accomodates all will be chosen
+              use this with care if you are not dealing with the blocks
+
+              e.g. if the dtypes are float16,float32         -> float32
+                                     float16,float32,float64 -> float64
+                                     int32,uint8             -> int32
+
         Parameters
         ----------
         columns : array-like
@@ -1711,6 +1716,33 @@ def as_matrix(self, columns=None):
 
     values = property(fget=as_matrix)
 
+    def as_blocks(self, columns=None):
+        """
+        Convert the frame to a dict of dtype -> DataFrames that each has a homogeneous dtype.
+        are presented in sorted order unless a specific list of columns is
+        provided.
+
+        NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in as_matrix)
+
+        Parameters
+        ----------
+        columns : array-like
+            Specific column order
+
+        Returns
+        -------
+        values : a list of DataFrames
+        """
+        self._consolidate_inplace()
+
+        bd = dict()
+        for b in self._data.blocks:
+            b = b.reindex_items_from(columns or b.items)
+            bd[str(b.dtype)] = DataFrame(BlockManager([ b ], [ b.items, self.index ]))
+        return bd
+
+    blocks = property(fget=as_blocks)
+
     def transpose(self):
         """
         Returns a DataFrame with the rows/columns switched. If the DataFrame is
@@ -1964,7 +1996,7 @@ def __getitem__(self, key):
             return self._getitem_multilevel(key)
         elif isinstance(key, DataFrame):
             if key.values.dtype == bool:
-                return self.where(key)
+                return self.where(key, try_cast = False)
             else:
                 raise ValueError('Cannot index using non-boolean DataFrame')
         else:
@@ -3330,17 +3362,12 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
                     raise NotImplementedError()
                 return self.T.fillna(method=method, limit=limit).T
 
-            new_blocks = []
             method = com._clean_fill_method(method)
-            for block in self._data.blocks:
-                if block._can_hold_na:
-                    newb = block.interpolate(method, axis=axis,
-                                             limit=limit, inplace=inplace)
-                else:
-                    newb = block if inplace else block.copy()
-                new_blocks.append(newb)
-
-            new_data = BlockManager(new_blocks, self._data.axes)
+            new_data = self._data.interpolate(method  = method,
+                                              axis    = axis,
+                                              limit   = limit,
+                                              inplace = inplace,
+                                              coerce  = True)
         else:
             if method is not None:
                 raise ValueError('cannot specify both a fill method and value')
@@ -3443,8 +3470,8 @@ def replace(self, to_replace, value=None, method='pad', axis=0,
                                          'in length. Expecting %d got %d ' %
                                          (len(to_replace), len(value)))
 
-                    new_data = self._data if inplace else self.copy()._data
-                    new_data._replace_list(to_replace, value)
+                    new_data = self._data.replace_list(to_replace, value, 
+                                                       inplace=inplace)
 
                 else:  # [NA, ''] -> 0
                     new_data = self._data.replace(to_replace, value,
@@ -3489,13 +3516,13 @@ def _interpolate(self, to_replace, method, axis, inplace, limit):
             return rs if not inplace else None
 
         else:
-            new_blocks = []
-            for block in self._data.blocks:
-                newb = block.interpolate(method, axis=axis,
-                                         limit=limit, inplace=inplace,
-                                         missing=to_replace)
-                new_blocks.append(newb)
-            new_data = BlockManager(new_blocks, self._data.axes)
+
+            new_data = self._data.interpolate(method  = method,
+                                              axis    = axis,
+                                              limit   = limit,
+                                              inplace = inplace,
+                                              missing = to_replace,
+                                              coerce  = False)
 
             if inplace:
                 self._data = new_data
@@ -3668,22 +3695,15 @@ def _combine_match_columns(self, other, func, fill_value=None):
         if fill_value is not None:
             raise NotImplementedError
 
-        return self._constructor(func(left.values, right.values),
-                                 index=self.index,
-                                 columns=left.columns, copy=False)
+        new_data = left._data.where(func, right, axes = [left.columns, self.index])
+        return self._constructor(new_data)
 
-    def _combine_const(self, other, func):
+    def _combine_const(self, other, func, raise_on_error = True):
         if self.empty:
             return self
 
-        result_values = func(self.values, other)
-
-        if not isinstance(result_values, np.ndarray):
-            raise TypeError('Could not compare %s with DataFrame values'
-                            % repr(other))
-
-        return self._constructor(result_values, index=self.index,
-                                 columns=self.columns, copy=False)
+        new_data = self._data.where(func, other, raise_on_error=raise_on_error)
+        return self._constructor(new_data)
 
     def _compare_frame(self, other, func):
         if not self._indexed_same(other):
@@ -4012,8 +4032,7 @@ def diff(self, periods=1):
         -------
         diffed : DataFrame
         """
-        new_blocks = [b.diff(periods) for b in self._data.blocks]
-        new_data = BlockManager(new_blocks, [self.columns, self.index])
+        new_data = self._data.diff(periods)
         return self._constructor(new_data)
 
     def shift(self, periods=1, freq=None, **kwds):
@@ -4047,21 +4066,9 @@ def shift(self, periods=1, freq=None, **kwds):
         if isinstance(offset, basestring):
             offset = datetools.to_offset(offset)
 
-        def _shift_block(blk, indexer):
-            new_values = blk.values.take(indexer, axis=1)
-            # convert integer to float if necessary. need to do a lot more than
-            # that, handle boolean etc also
-            new_values = com._maybe_upcast(new_values)
-            if periods > 0:
-                new_values[:, :periods] = NA
-            else:
-                new_values[:, periods:] = NA
-            return make_block(new_values, blk.items, blk.ref_items)
-
         if offset is None:
             indexer = com._shift_indexer(len(self), periods)
-            new_blocks = [_shift_block(b, indexer) for b in self._data.blocks]
-            new_data = BlockManager(new_blocks, [self.columns, self.index])
+            new_data = self._data.shift(indexer, periods)
         elif isinstance(self.index, PeriodIndex):
             orig_offset = datetools.to_offset(self.index.freq)
             if offset == orig_offset:
@@ -5211,7 +5218,7 @@ def combineMult(self, other):
         """
         return self.mul(other, fill_value=1.)
 
-    def where(self, cond, other=NA, inplace=False):
+    def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=True):
         """
         Return a DataFrame with the same shape as self and whose corresponding
         entries are from self where cond is True and otherwise are from other.
@@ -5220,6 +5227,10 @@ def where(self, cond, other=NA, inplace=False):
         ----------
         cond: boolean DataFrame or array
         other: scalar or DataFrame
+        inplace: perform the operation in place on the data
+        try_cast: try to cast the result back to the input type (if possible), defaults to False
+        raise_on_error: should I raise on invalid data types (e.g. trying to where on strings),
+          defaults to True
 
         Returns
         -------
@@ -5231,7 +5242,7 @@ def where(self, cond, other=NA, inplace=False):
 
         if isinstance(cond, np.ndarray):
             if cond.shape != self.shape:
-                raise ValueError('Array onditional must be same shape as self')
+                raise ValueError('Array conditional must be same shape as self')
             cond = self._constructor(cond, index=self.index,
                                      columns=self.columns)
 
@@ -5247,12 +5258,23 @@ def where(self, cond, other=NA, inplace=False):
 
         if isinstance(other, DataFrame):
             _, other = self.align(other, join='left', fill_value=NA)
+        elif isinstance(other,np.ndarray):
+
+            if other.shape[0] != len(self.index) or other.shape[1] != len(self.columns):
+                raise ValueError('other must be the same shape as self when an ndarray')
+            other = DataFrame(other,self.index,self.columns)
 
         if inplace:
-            np.putmask(self.values, cond, other)
+
+            # we may have different type blocks come out of putmask, so reconstruct the block manager
+            self._data = self._data.putmask(cond,other,inplace=True)
+
         else:
-            rs = np.where(cond, self, other)
-            return self._constructor(rs, self.index, self.columns)
+
+            func = lambda values, others, conds: np.where(conds, values, others)
+            new_data = self._data.where(func, other, cond, raise_on_error=raise_on_error, try_cast=try_cast)
+
+            return self._constructor(new_data)
 
     def mask(self, cond):
         """
@@ -5609,7 +5631,6 @@ def _homogenize(data, index, dtype=None):
 
     return homogenized
 
-
 def _from_nested_dict(data):
     # TODO: this should be seriously cythonized
     new_data = OrderedDict()
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 69bde62fdae20..0f78ddb3ca48f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -486,19 +486,23 @@ def __init__(self, data, axes=None, copy=False, dtype=None):
         object.__setattr__(self, '_data', data)
         object.__setattr__(self, '_item_cache', {})
 
-    def astype(self, dtype):
+    def astype(self, dtype, copy = True, raise_on_error = True):
         """
         Cast object to input numpy.dtype
+        Return a copy when copy = True (be really careful with this!)
 
         Parameters
         ----------
         dtype : numpy.dtype or Python type
+        raise_on_error : raise on invalid input
 
         Returns
         -------
         casted : type of caller
         """
-        return self._constructor(self._data, dtype=dtype)
+
+        mgr = self._data.astype(dtype, copy = copy, raise_on_error = raise_on_error)
+        return self._constructor(mgr)
 
     @property
     def _constructor(self):
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 705e1574efe06..7fedb8011122a 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -425,6 +425,36 @@ def picker(arr):
                 return np.nan
         return self.agg(picker)
 
+    def _try_cast(self, result, obj):
+        """ try to cast the result to our obj original type,
+        we may have roundtripped thru object in the mean-time """
+        try:
+            if obj.ndim > 1:
+                dtype = obj.values.dtype
+            else:
+                dtype = obj.dtype
+
+            if _is_numeric_dtype(dtype):
+                
+                # need to respect a non-number here (e.g. Decimal)
+                if len(result) and issubclass(type(result[0]),(np.number,float,int)):
+                    if issubclass(dtype.type, (np.integer, np.bool_)):
+
+                        # castable back to an int/bool as we don't have nans
+                        if com.notnull(result).all():
+                            result = result.astype(dtype)
+                    else:
+
+                        result = result.astype(dtype)
+
+            elif issubclass(dtype.type, np.datetime64):
+                if is_datetime64_dtype(obj.dtype):
+                    result = result.astype(obj.dtype)
+        except:
+            pass
+
+        return result
+
     def _cython_agg_general(self, how, numeric_only=True):
         output = {}
         for name, obj in self._iterate_slices():
@@ -449,7 +479,7 @@ def _python_agg_general(self, func, *args, **kwargs):
         for name, obj in self._iterate_slices():
             try:
                 result, counts = self.grouper.agg_series(obj, f)
-                output[name] = result
+                output[name] = self._try_cast(result, obj)
             except TypeError:
                 continue
 
@@ -457,9 +487,16 @@ def _python_agg_general(self, func, *args, **kwargs):
             return self._python_apply_general(f)
 
         if self.grouper._filter_empty_groups:
+
             mask = counts.ravel() > 0
             for name, result in output.iteritems():
-                output[name] = result[mask]
+
+                # since we are masking, make sure that we have a float object
+                values = result
+                if _is_numeric_dtype(values.dtype):
+                    values = com.ensure_float(values)
+                
+                output[name] = self._try_cast(values[mask],result)
 
         return self._wrap_aggregated_output(output)
 
@@ -708,21 +745,16 @@ def get_group_levels(self):
     # Aggregation functions
 
     _cython_functions = {
-        'add': _algos.group_add,
-        'prod': _algos.group_prod,
-        'min': _algos.group_min,
-        'max': _algos.group_max,
-        'mean': _algos.group_mean,
-        'median': _algos.group_median,
-        'var': _algos.group_var,
-        'std': _algos.group_var,
-        'first': lambda a, b, c, d: _algos.group_nth(a, b, c, d, 1),
-        'last': _algos.group_last
-    }
-
-    _cython_object_functions = {
-        'first': lambda a, b, c, d: _algos.group_nth_object(a, b, c, d, 1),
-        'last': _algos.group_last_object
+        'add'  : 'group_add',
+        'prod' : 'group_prod',
+        'min'  : 'group_min',
+        'max'  : 'group_max',
+        'mean' : 'group_mean',
+        'median': dict(name = 'group_median'),
+        'var'  : 'group_var',
+        'std'  : 'group_var',
+        'first': dict(name = 'group_nth', f = lambda func, a, b, c, d: func(a, b, c, d, 1)),
+        'last' : 'group_last',
     }
 
     _cython_transforms = {
@@ -737,6 +769,40 @@ def get_group_levels(self):
 
     _filter_empty_groups = True
 
+    def _get_aggregate_function(self, how, values):
+
+        dtype_str = values.dtype.name
+        def get_func(fname):
+
+            # find the function, or use the object function, or return a generic
+            for dt in [dtype_str,'object']:
+                f = getattr(_algos,"%s_%s" % (fname,dtype_str),None)
+                if f is not None:
+                    return f
+            return getattr(_algos,fname,None)
+
+        ftype = self._cython_functions[how]
+
+        if isinstance(ftype,dict):
+            func = afunc = get_func(ftype['name'])
+
+            # a sub-function
+            f = ftype.get('f')
+            if f is not None:
+
+                def wrapper(*args, **kwargs):
+                    return f(afunc, *args, **kwargs)
+
+                # need to curry our sub-function
+                func = wrapper
+                
+        else:
+            func = get_func(ftype)
+
+        if func is None:
+            raise NotImplementedError("function is not implemented for this dtype: [how->%s,dtype->%s]" % (how,dtype_str))
+        return func, dtype_str
+
     def aggregate(self, values, how, axis=0):
 
         arity = self._cython_arity.get(how, 1)
@@ -796,12 +862,8 @@ def aggregate(self, values, how, axis=0):
         return result, names
 
     def _aggregate(self, result, counts, values, how, is_numeric):
-        if not is_numeric:
-            agg_func = self._cython_object_functions[how]
-        else:
-            agg_func = self._cython_functions[how]
-
-        trans_func = self._cython_transforms.get(how, lambda x: x)
+        agg_func,dtype  = self._get_aggregate_function(how, values)
+        trans_func      = self._cython_transforms.get(how, lambda x: x)
 
         comp_ids, _, ngroups = self.group_info
         if values.ndim > 3:
@@ -809,8 +871,9 @@ def _aggregate(self, result, counts, values, how, is_numeric):
             raise NotImplementedError
         elif values.ndim > 2:
             for i, chunk in enumerate(values.transpose(2, 0, 1)):
-                agg_func(result[:, :, i], counts, chunk.squeeze(),
-                         comp_ids)
+
+                chunk = chunk.squeeze()
+                agg_func(result[:, :, i], counts, chunk, comp_ids)
         else:
             agg_func(result, counts, values, comp_ids)
 
@@ -1000,21 +1063,16 @@ def names(self):
     # cython aggregation
 
     _cython_functions = {
-        'add': _algos.group_add_bin,
-        'prod': _algos.group_prod_bin,
-        'mean': _algos.group_mean_bin,
-        'min': _algos.group_min_bin,
-        'max': _algos.group_max_bin,
-        'var': _algos.group_var_bin,
-        'std': _algos.group_var_bin,
-        'ohlc': _algos.group_ohlc,
-        'first': lambda a, b, c, d: _algos.group_nth_bin(a, b, c, d, 1),
-        'last': _algos.group_last_bin
-    }
-
-    _cython_object_functions = {
-        'first': lambda a, b, c, d: _algos.group_nth_bin_object(a, b, c, d, 1),
-        'last': _algos.group_last_bin_object
+        'add'  : 'group_add_bin',
+        'prod' : 'group_prod_bin',
+        'mean' : 'group_mean_bin',
+        'min'  : 'group_min_bin',
+        'max'  : 'group_max_bin',
+        'var'  : 'group_var_bin',
+        'std'  : 'group_var_bin',
+        'ohlc' : 'group_ohlc',
+        'first': dict(name = 'group_nth_bin', f = lambda func, a, b, c, d: func(a, b, c, d, 1)),
+        'last' : 'group_last_bin',
     }
 
     _name_functions = {
@@ -1024,11 +1082,9 @@ def names(self):
     _filter_empty_groups = True
 
     def _aggregate(self, result, counts, values, how, is_numeric=True):
-        fdict = self._cython_functions
-        if not is_numeric:
-            fdict = self._cython_object_functions
-        agg_func = fdict[how]
-        trans_func = self._cython_transforms.get(how, lambda x: x)
+
+        agg_func,dtype = self._get_aggregate_function(how, values)
+        trans_func     = self._cython_transforms.get(how, lambda x: x)
 
         if values.ndim > 3:
             # punting for now
@@ -1439,7 +1495,7 @@ def _aggregate_named(self, func, *args, **kwargs):
             output = func(group, *args, **kwargs)
             if isinstance(output, np.ndarray):
                 raise Exception('Must produce aggregated value')
-            result[name] = output
+            result[name] = self._try_cast(output, group)
 
         return result
 
@@ -1676,14 +1732,14 @@ def _aggregate_generic(self, func, *args, **kwargs):
                 for name, data in self:
                     # for name in self.indices:
                     #     data = self.get_group(name, obj=obj)
-                    result[name] = func(data, *args, **kwargs)
+                    result[name] = self._try_cast(func(data, *args, **kwargs),data)
             except Exception:
                 return self._aggregate_item_by_item(func, *args, **kwargs)
         else:
             for name in self.indices:
                 try:
                     data = self.get_group(name, obj=obj)
-                    result[name] = func(data, *args, **kwargs)
+                    result[name] = self._try_cast(func(data, *args, **kwargs), data)
                 except Exception:
                     wrapper = lambda x: func(x, *args, **kwargs)
                     result[name] = data.apply(wrapper, axis=axis)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index e3031b58ff286..58d193a956491 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -20,6 +20,10 @@ class Block(object):
     Index-ignorant; let the container take care of that
     """
     __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim']
+    is_numeric = False
+    is_bool = False
+    is_object = False
+    _can_hold_na = False
 
     def __init__(self, values, items, ref_items, ndim=2):
         if issubclass(values.dtype.type, basestring):
@@ -93,6 +97,10 @@ def __setstate__(self, state):
     def shape(self):
         return self.values.shape
 
+    @property
+    def itemsize(self):
+        return self.values.itemsize
+
     @property
     def dtype(self):
         return self.values.dtype
@@ -206,8 +214,13 @@ def split_block_at(self, item):
                              self.ref_items)
 
     def fillna(self, value, inplace=False):
-        new_values = self.values if inplace else self.values.copy()
+        if not self._can_hold_na:
+            if inplace:
+                return self
+            else:
+                return self.copy()
 
+        new_values = self.values if inplace else self.values.copy()
         mask = com.isnull(new_values)
         np.putmask(new_values, mask, value)
 
@@ -216,12 +229,43 @@ def fillna(self, value, inplace=False):
         else:
             return make_block(new_values, self.items, self.ref_items)
 
+    def astype(self, dtype, copy = True, raise_on_error = True):
+        """ coerce to the new type (if copy=True, return a new copy) raise on an except if raise == True """
+        try:
+            newb = make_block(com._astype_nansafe(self.values, dtype, copy = copy),
+                              self.items, self.ref_items)
+        except:
+            if raise_on_error is True:
+                raise
+            newb = self.copy() if copy else self
+
+        if newb.is_numeric and self.is_numeric:
+            if newb.shape != self.shape or (not copy and newb.itemsize < self.itemsize):
+                raise TypeError("cannot set astype for copy = [%s] for dtype (%s [%s]) with smaller itemsize that current (%s [%s])" % (copy,
+                                                                                                                                        self.dtype.name,
+                                                                                                                                        self.itemsize,
+                                                                                                                                        newb.dtype.name,
+                                                                                                                                        newb.itemsize))
+        return newb
+
+    def convert(self, copy = True, **kwargs):
+        """ attempt to coerce any object types to better types
+            return a copy of the block (if copy = True)
+            by definition we are not an ObjectBlock here!  """
+
+        return self.copy() if copy else self
+
     def _can_hold_element(self, value):
         raise NotImplementedError()
 
     def _try_cast(self, value):
         raise NotImplementedError()
 
+    def _try_cast_result(self, result):
+        """ try to cast the result to our original type,
+        we may have roundtripped thru object in the mean-time """
+        return result
+
     def replace(self, to_replace, value, inplace=False):
         new_values = self.values if inplace else self.values.copy()
         if self._can_hold_element(value):
@@ -251,17 +295,58 @@ def replace(self, to_replace, value, inplace=False):
             return make_block(new_values, self.items, self.ref_items)
 
     def putmask(self, mask, new, inplace=False):
+        """ putmask the data to the block; it is possible that we may create a new dtype of block
+            return the resulting block(s) """
+
         new_values = self.values if inplace else self.values.copy()
+
+        # may need to align the new
+        if hasattr(new,'reindex_axis'):
+            axis = getattr(new,'_het_axis',0)
+            new = new.reindex_axis(self.items, axis=axis, copy=False).values.T
+
+        # may need to align the mask
+        if hasattr(mask,'reindex_axis'):
+            axis = getattr(mask,'_het_axis',0)
+            mask = mask.reindex_axis(self.items, axis=axis, copy=False).values.T
+
         if self._can_hold_element(new):
             new = self._try_cast(new)
             np.putmask(new_values, mask, new)
-        if inplace:
-            return self
+
+        # upcast me
         else:
-            return make_block(new_values, self.items, self.ref_items)
+            
+            # type of the new block
+            if isinstance(new,np.ndarray) and issubclass(new.dtype,np.number) or issubclass(type(new),float):
+                typ = float
+            else:
+                typ = object
+
+            # we need to exiplicty astype here to make a copy
+            new_values = new_values.astype(typ)
+
+            # we create a new block type
+            np.putmask(new_values, mask, new)
+            return [ make_block(new_values, self.items, self.ref_items) ]
+
+        if inplace:
+            return [ self ]
+
+        return [ make_block(new_values, self.items, self.ref_items) ]
 
     def interpolate(self, method='pad', axis=0, inplace=False,
-                    limit=None, missing=None):
+                    limit=None, missing=None, coerce=False):
+
+        # if we are coercing, then don't force the conversion 
+        # if the block can't hold the type
+        if coerce:
+            if not self._can_hold_na:
+                if inplace:
+                    return self
+                else:
+                    return self.copy()
+        
         values = self.values if inplace else self.values.copy()
 
         if values.ndim != 2:
@@ -293,9 +378,96 @@ def get_values(self, dtype):
         return self.values
 
     def diff(self, n):
+        """ return block for the diff of the values """
         new_values = com.diff(self.values, n, axis=1)
         return make_block(new_values, self.items, self.ref_items)
 
+    def shift(self, indexer, periods):
+        """ shift the block by periods, possibly upcast """
+
+        new_values = self.values.take(indexer, axis=1)
+        # convert integer to float if necessary. need to do a lot more than
+        # that, handle boolean etc also
+        new_values = com._maybe_upcast(new_values)
+        if periods > 0:
+            new_values[:, :periods] = np.nan
+        else:
+            new_values[:, periods:] = np.nan
+        return make_block(new_values, self.items, self.ref_items)
+
+    def where(self, func, other, cond = None, raise_on_error = True, try_cast = False):
+        """ 
+        evaluate the block; return result block(s) from the result 
+
+        Parameters
+        ----------
+        func  : how to combine self,other
+        other : a ndarray/object
+        cond  : the condition to respect, optional
+        raise_on_error : if True, raise when I can't perform the function, False by default (and just return
+             the data that we had coming in)
+
+        Returns
+        -------
+        a new block, the result of the func
+        """
+
+        values = self.values
+
+        # see if we can align other
+        if hasattr(other,'reindex_axis'):
+            axis = getattr(other,'_het_axis',0)
+            other = other.reindex_axis(self.items, axis=axis, copy=True).values
+
+        # make sure that we can broadcast
+        is_transposed = False
+        if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
+            if values.ndim != other.ndim or values.shape == other.shape[::-1]:
+                values = values.T
+                is_transposed = True
+
+        # see if we can align cond
+        if cond is not None:
+            if not hasattr(cond,'shape'):
+                raise ValueError("where must have a condition that is ndarray like")
+            if hasattr(cond,'reindex_axis'):
+                axis = getattr(cond,'_het_axis',0)
+                cond = cond.reindex_axis(self.items, axis=axis, copy=True).values
+            else:
+                cond = cond.values
+
+            # may need to undo transpose of values
+            if hasattr(values, 'ndim'):
+                if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
+                    values = values.T
+                    is_transposed =  not is_transposed
+
+        args = [ values, other ]
+        if cond is not None:
+            args.append(cond)
+        try:
+            result = func(*args)
+        except:
+            if raise_on_error:
+                raise TypeError('Coulnd not operate %s with block values'
+                                % repr(other))
+            else:
+                # return the values
+                result = np.empty(values.shape,dtype='O')
+                result.fill(np.nan)
+
+        if not isinstance(result, np.ndarray):
+            raise TypeError('Could not compare %s with block values'
+                            % repr(other))
+
+        if is_transposed:
+            result = result.T
+
+        # try to cast if requested
+        if try_cast:
+            result = self._try_cast_result(result)
+
+        return [ make_block(result, self.items, self.ref_items) ]
 
 def _mask_missing(array, missing_values):
     if not isinstance(missing_values, (list, np.ndarray)):
@@ -314,11 +486,15 @@ def _mask_missing(array, missing_values):
             mask |= array == missing_values
     return mask
 
-
-class FloatBlock(Block):
+class NumericBlock(Block):
+    is_numeric = True
     _can_hold_na = True
 
+class FloatBlock(NumericBlock):
+
     def _can_hold_element(self, element):
+        if isinstance(element, np.ndarray):
+            return issubclass(element.dtype.type, (np.floating,np.integer))
         return isinstance(element, (float, int))
 
     def _try_cast(self, element):
@@ -330,11 +506,10 @@ def _try_cast(self, element):
     def should_store(self, value):
         # when inserting a column should not coerce integers to floats
         # unnecessarily
-        return issubclass(value.dtype.type, np.floating)
+        return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype
 
 
-class ComplexBlock(Block):
-    _can_hold_na = True
+class ComplexBlock(NumericBlock):
 
     def _can_hold_element(self, element):
         return isinstance(element, complex)
@@ -349,10 +524,12 @@ def should_store(self, value):
         return issubclass(value.dtype.type, np.complexfloating)
 
 
-class IntBlock(Block):
+class IntBlock(NumericBlock):
     _can_hold_na = False
 
     def _can_hold_element(self, element):
+        if isinstance(element, np.ndarray):
+            return issubclass(element.dtype.type, np.integer)
         return com.is_integer(element)
 
     def _try_cast(self, element):
@@ -361,11 +538,25 @@ def _try_cast(self, element):
         except:  # pragma: no cover
             return element
 
+    def _try_cast_result(self, result):
+        # this is quite restrictive to convert
+        try:
+            if isinstance(result, np.ndarray) and issubclass(result.dtype.type, np.floating):
+                if com.notnull(result).all():
+                    new_result = result.astype(self.dtype)
+                    if (new_result == result).all():
+                        return new_result
+        except:
+            pass
+
+        return result
+
     def should_store(self, value):
-        return com.is_integer_dtype(value)
+        return com.is_integer_dtype(value) and value.dtype == self.dtype
 
 
 class BoolBlock(Block):
+    is_bool = True
     _can_hold_na = False
 
     def _can_hold_element(self, element):
@@ -382,8 +573,35 @@ def should_store(self, value):
 
 
 class ObjectBlock(Block):
+    is_object = True
     _can_hold_na = True
 
+    @property
+    def is_bool(self):
+        """ we can be a bool if we have only bool values but are of type object """
+        return lib.is_bool_array(self.values.flatten())
+
+    def convert(self, convert_dates = True, convert_numeric = True, copy = True):
+        """ attempt to coerce any object types to better types
+            return a copy of the block (if copy = True)
+            by definition we ARE an ObjectBlock!!!!!
+
+            can return multiple blocks!
+            """
+
+        # attempt to create new type blocks
+        blocks = []
+        for i, c in enumerate(self.items):
+            values = self.get(c)
+
+            values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
+            values = values.reshape(((1,) + values.shape))
+            items = self.items.take([i])
+            newb = make_block(values, items, self.ref_items)
+            blocks.append(newb)
+
+        return blocks
+
     def _can_hold_element(self, element):
         return True
 
@@ -457,8 +675,6 @@ def make_block(values, items, ref_items):
     elif issubclass(vtype, np.datetime64):
         klass = DatetimeBlock
     elif issubclass(vtype, np.integer):
-        if vtype != np.int64:
-            values = values.astype('i8')
         klass = IntBlock
     elif dtype == np.bool_:
         klass = BoolBlock
@@ -611,15 +827,70 @@ def _verify_integrity(self):
             raise AssertionError('Number of manager items must equal union of '
                                  'block items')
 
-    def astype(self, dtype):
-        new_blocks = []
-        for block in self.blocks:
-            newb = make_block(com._astype_nansafe(block.values, dtype),
-                              block.items, block.ref_items)
-            new_blocks.append(newb)
+    def apply(self, f, *args, **kwargs):
+        """ iterate over the blocks, collect and create a new block manager """
+        axes = kwargs.pop('axes',None)
+        result_blocks = []
+        for blk in self.blocks:
+            if callable(f):
+                applied = f(blk, *args, **kwargs)
+            else:
+                applied = getattr(blk,f)(*args, **kwargs)
+
+            if isinstance(applied,list):
+                result_blocks.extend(applied)
+            else:
+                result_blocks.append(applied)
+        bm = self.__class__(result_blocks, axes or self.axes)
+        bm._consolidate_inplace()
+        return bm
+
+    def where(self, *args, **kwargs):
+        return self.apply('where', *args, **kwargs)
+
+    def putmask(self, *args, **kwargs):
+        return self.apply('putmask', *args, **kwargs)
+
+    def diff(self, *args, **kwargs):
+        return self.apply('diff', *args, **kwargs)
+
+    def interpolate(self, *args, **kwargs):
+        return self.apply('interpolate', *args, **kwargs)
+
+    def shift(self, *args, **kwargs):
+        return self.apply('shift', *args, **kwargs)
+
+    def fillna(self, *args, **kwargs):
+        return self.apply('fillna', *args, **kwargs)
+
+    def astype(self, *args, **kwargs):
+        return self.apply('astype', *args, **kwargs)
+
+    def convert(self, *args, **kwargs):
+        return self.apply('convert', *args, **kwargs)
+
+    def replace(self, *args, **kwargs):
+        return self.apply('replace', *args, **kwargs)
 
-        new_mgr = BlockManager(new_blocks, self.axes)
-        return new_mgr.consolidate()
+    def replace_list(self, src_lst, dest_lst, inplace=False):
+        """ do a list replace """
+        if not inplace:
+            self = self.copy()
+
+        sset = set(src_lst)
+        if any([k in sset for k in dest_lst]):
+            masks = {}
+            for s in src_lst:
+                masks[s] = [b.values == s for b in self.blocks]
+
+            for s, d in zip(src_lst, dest_lst):
+                [b.putmask(masks[s][i], d, inplace=True) for i, b in
+                 enumerate(self.blocks)]
+        else:
+            for s, d in zip(src_lst, dest_lst):
+                self.replace(s, d, inplace=True)
+
+        return self
 
     def is_consolidated(self):
         """
@@ -634,7 +905,7 @@ def _consolidate_check(self):
         self._is_consolidated = len(dtypes) == len(set(dtypes))
         self._known_consolidated = True
 
-    def get_numeric_data(self, copy=False, type_list=None):
+    def get_numeric_data(self, copy=False, type_list=None, as_blocks = False):
         """
         Parameters
         ----------
@@ -644,15 +915,15 @@ def get_numeric_data(self, copy=False, type_list=None):
             Numeric types by default (Float/Complex/Int but not Datetime)
         """
         if type_list is None:
-            def filter_blocks(block):
-                return (isinstance(block, (IntBlock, FloatBlock, ComplexBlock))
-                        and not isinstance(block, DatetimeBlock))
+            filter_blocks = lambda block: block.is_numeric
         else:
             type_list = self._get_clean_block_types(type_list)
             filter_blocks = lambda block: isinstance(block, type_list)
 
         maybe_copy = lambda b: b.copy() if copy else b
         num_blocks = [maybe_copy(b) for b in self.blocks if filter_blocks(b)]
+        if as_blocks:
+            return num_blocks
 
         if len(num_blocks) == 0:
             return BlockManager.make_empty()
@@ -686,8 +957,8 @@ def _get_clean_block_types(self, type_list):
         type_list = tuple([type_map.get(t, t) for t in type_list])
         return type_list
 
-    def get_bool_data(self, copy=False):
-        return self.get_numeric_data(copy=copy, type_list=(BoolBlock,))
+    def get_bool_data(self, copy=False, as_blocks=False):
+        return self.get_numeric_data(copy=copy, type_list=(BoolBlock,), as_blocks=as_blocks)
 
     def get_slice(self, slobj, axis=0):
         new_axes = list(self.axes)
@@ -1255,37 +1526,6 @@ def add_suffix(self, suffix):
         f = ('%s' + ('%s' % suffix)).__mod__
         return self.rename_items(f)
 
-    def fillna(self, value, inplace=False):
-        new_blocks = [b.fillna(value, inplace=inplace)
-                      if b._can_hold_na else b
-                      for b in self.blocks]
-        if inplace:
-            return self
-        return BlockManager(new_blocks, self.axes)
-
-    def replace(self, to_replace, value, inplace=False):
-        new_blocks = [b.replace(to_replace, value, inplace=inplace)
-                      for b in self.blocks]
-        if inplace:
-            return self
-        return BlockManager(new_blocks, self.axes)
-
-    def _replace_list(self, src_lst, dest_lst):
-        sset = set(src_lst)
-        if any([k in sset for k in dest_lst]):
-            masks = {}
-            for s in src_lst:
-                masks[s] = [b.values == s for b in self.blocks]
-
-            for s, d in zip(src_lst, dest_lst):
-                [b.putmask(masks[s][i], d, inplace=True) for i, b in
-                 enumerate(self.blocks)]
-        else:
-            for s, d in zip(src_lst, dest_lst):
-                self.replace(s, d, inplace=True)
-
-        return self
-
     @property
     def block_id_vector(self):
         # TODO
@@ -1359,28 +1599,28 @@ def form_blocks(arrays, names, axes):
 
     blocks = []
     if len(float_items):
-        float_block = _simple_blockify(float_items, items, np.float64)
-        blocks.append(float_block)
+        float_blocks = _multi_blockify(float_items, items)
+        blocks.extend(float_blocks)
 
     if len(complex_items):
-        complex_block = _simple_blockify(complex_items, items, np.complex128)
-        blocks.append(complex_block)
+        complex_blocks = _simple_blockify(complex_items, items, np.complex128)
+        blocks.extend(complex_blocks)
 
     if len(int_items):
-        int_block = _simple_blockify(int_items, items, np.int64)
-        blocks.append(int_block)
+        int_blocks = _multi_blockify(int_items, items)
+        blocks.extend(int_blocks)
 
     if len(datetime_items):
-        datetime_block = _simple_blockify(datetime_items, items, _NS_DTYPE)
-        blocks.append(datetime_block)
+        datetime_blocks = _simple_blockify(datetime_items, items, _NS_DTYPE)
+        blocks.extend(datetime_blocks)
 
     if len(bool_items):
-        bool_block = _simple_blockify(bool_items, items, np.bool_)
-        blocks.append(bool_block)
+        bool_blocks = _simple_blockify(bool_items, items, np.bool_)
+        blocks.extend(bool_blocks)
 
     if len(object_items) > 0:
-        object_block = _simple_blockify(object_items, items, np.object_)
-        blocks.append(object_block)
+        object_blocks = _simple_blockify(object_items, items, np.object_)
+        blocks.extend(object_blocks)
 
     if len(extra_items):
         shape = (len(extra_items),) + tuple(len(x) for x in axes[1:])
@@ -1398,14 +1638,31 @@ def form_blocks(arrays, names, axes):
 
 
 def _simple_blockify(tuples, ref_items, dtype):
+    """ return a single array of a block that has a single dtype; if dtype is not None, coerce to this dtype """
     block_items, values = _stack_arrays(tuples, ref_items, dtype)
+
     # CHECK DTYPE?
-    if values.dtype != dtype:  # pragma: no cover
+    if dtype is not None and values.dtype != dtype:  # pragma: no cover
         values = values.astype(dtype)
 
-    return make_block(values, block_items, ref_items)
+    return [ make_block(values, block_items, ref_items) ]
 
 
+def _multi_blockify(tuples, ref_items, dtype = None):
+    """ return an array of blocks that potentially have different dtypes """
+
+    # group by dtype
+    grouper = itertools.groupby(tuples, lambda x: x[1].dtype)
+
+    new_blocks = []
+    for dtype, tup_block in grouper:
+
+        block_items, values = _stack_arrays(list(tup_block), ref_items, dtype)
+        block = make_block(values, block_items, ref_items)
+        new_blocks.append(block)
+
+    return new_blocks
+
 def _stack_arrays(tuples, ref_items, dtype):
     from pandas.core.series import Series
 
@@ -1451,17 +1708,27 @@ def _blocks_to_series_dict(blocks, index=None):
 
 
 def _interleaved_dtype(blocks):
+    if not len(blocks): return None
+
     from collections import defaultdict
-    counts = defaultdict(lambda: 0)
+    counts = defaultdict(lambda: [])
     for x in blocks:
-        counts[type(x)] += 1
-
-    have_int = counts[IntBlock] > 0
-    have_bool = counts[BoolBlock] > 0
-    have_object = counts[ObjectBlock] > 0
-    have_float = counts[FloatBlock] > 0
-    have_complex = counts[ComplexBlock] > 0
-    have_dt64 = counts[DatetimeBlock] > 0
+        counts[type(x)].append(x)
+
+    def _lcd_dtype(l):
+        """ find the lowest dtype that can accomodate the given types """
+        m = l[0].dtype
+        for x in l[1:]:
+            if x.dtype.itemsize > m.itemsize:
+                m = x.dtype
+        return m
+
+    have_int = len(counts[IntBlock]) > 0
+    have_bool = len(counts[BoolBlock]) > 0
+    have_object = len(counts[ObjectBlock]) > 0
+    have_float = len(counts[FloatBlock]) > 0
+    have_complex = len(counts[ComplexBlock]) > 0
+    have_dt64 = len(counts[DatetimeBlock]) > 0
     have_numeric = have_float or have_complex or have_int
 
     if (have_object or
@@ -1471,13 +1738,13 @@ def _interleaved_dtype(blocks):
     elif have_bool:
         return np.dtype(bool)
     elif have_int and not have_float and not have_complex:
-        return np.dtype('i8')
+        return _lcd_dtype(counts[IntBlock])
     elif have_dt64 and not have_float and not have_complex:
         return np.dtype('M8[ns]')
     elif have_complex:
         return np.dtype('c16')
     else:
-        return np.dtype('f8')
+        return _lcd_dtype(counts[FloatBlock])
 
 
 def _consolidate(blocks, items):
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 06281e288021a..76c91ad726868 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -779,18 +779,23 @@ def astype(self, dtype):
         casted = com._astype_nansafe(self.values, dtype)
         return self._constructor(casted, index=self.index, name=self.name)
 
-    def convert_objects(self, convert_dates=True):
+    def convert_objects(self, convert_dates=True, convert_numeric=True):
         """
         Attempt to infer better dtype
+        Always return a copy
+
+        Parameters
+        ----------
+        convert_dates : if True, attempt to soft convert_dates, if 'coerce', force conversion (and non-convertibles get NaT)
+        convert_numeric : if True attempt to coerce to numerbers (including strings), non-convertibles get NaN
 
         Returns
         -------
         converted : Series
         """
         if self.dtype == np.object_:
-            return Series(lib.maybe_convert_objects(
-                self, convert_datetime=convert_dates), self.index)
-        return self
+            return Series(com._possibly_convert_objects(self.values,convert_dates=convert_dates,convert_numeric=convert_numeric), index=self.index, name=self.name)
+        return self.copy()
 
     def repeat(self, reps):
         """
@@ -1027,7 +1032,8 @@ def _tidy_repr(self, max_vals=20):
     def _repr_footer(self):
         namestr = u"Name: %s, " % com.pprint_thing(
             self.name) if self.name is not None else ""
-        return u'%sLength: %d' % (namestr, len(self))
+        return u'%sLength: %d, Dtype: %s' % (namestr, len(self), 
+                                             com.pprint_thing(self.dtype.name))
 
     def to_string(self, buf=None, na_rep='NaN', float_format=None,
                   nanRep=None, length=False, name=False):
@@ -2402,6 +2408,12 @@ def reindex(self, index=None, method=None, level=None, fill_value=np.nan,
         new_values = com.take_1d(self.values, indexer, fill_value=fill_value)
         return Series(new_values, index=new_index, name=self.name)
 
+    def reindex_axis(self, labels, axis=0, **kwargs):
+        """ for compatibility with higher dims """
+        if axis != 0:
+            raise ValueError("cannot reindex series on non-zero axis!")
+        return self.reindex(index=labels,**kwargs)
+
     def reindex_like(self, other, method=None, limit=None):
         """
         Reindex Series to match index of another Series, optionally with
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
index 6801b197fa849..556bcdb93477f 100644
--- a/pandas/io/tests/test_excel.py
+++ b/pandas/io/tests/test_excel.py
@@ -362,11 +362,11 @@ def _check_extension_int64(self, ext):
         self.frame.to_excel(path, 'test1', index=False)
 
         # Test np.int64, values read come back as float
-        frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)))
+        frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np.int64)
         frame.to_excel(path, 'test1')
         reader = ExcelFile(path)
         recons = reader.parse('test1').astype(np.int64)
-        tm.assert_frame_equal(frame, recons)
+        tm.assert_frame_equal(frame, recons, check_dtype=False)
 
         os.remove(path)
 
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
index c7897e7def4d3..9cc749d23a3a9 100644
--- a/pandas/src/generate_code.py
+++ b/pandas/src/generate_code.py
@@ -388,6 +388,10 @@ def pad_inplace_%(name)s(ndarray[%(c_type)s] values,
 
     N = len(values)
 
+    # GH 2778
+    if N == 0:
+        return
+
     if limit is None:
         lim = N
     else:
@@ -419,6 +423,10 @@ def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values,
 
     K, N = (<object> values).shape
 
+    # GH 2778
+    if N == 0:
+        return
+
     if limit is None:
         lim = N
     else:
@@ -451,6 +459,10 @@ def backfill_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values,
 
     K, N = (<object> values).shape
 
+    # GH 2778
+    if N == 0:
+        return
+
     if limit is None:
         lim = N
     else:
@@ -483,6 +495,10 @@ def backfill_inplace_%(name)s(ndarray[%(c_type)s] values,
 
     N = len(values)
 
+    # GH 2778
+    if N == 0:
+        return
+
     if limit is None:
         lim = N
     else:
@@ -502,6 +518,52 @@ def backfill_inplace_%(name)s(ndarray[%(c_type)s] values,
             val = values[i]
 """
 
+
+diff_2d_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_%(name)s(ndarray[%(c_type)s, ndim=2] arr,
+                     ndarray[%(dest_type2)s, ndim=2] out,
+                    Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+"""
+
 is_monotonic_template = """@cython.boundscheck(False)
 @cython.wraparound(False)
 def is_monotonic_%(name)s(ndarray[%(c_type)s] arr):
@@ -582,6 +644,965 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
 
 """
 
+group_last_template = """@cython.wraparound(False)
+@cython.wraparound(False)
+def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[%(c_type)s, ndim=2] values,
+               ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+"""
+
+group_last_bin_template = """@cython.wraparound(False)
+@cython.wraparound(False)
+def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[%(c_type)s, ndim=2] values,
+                   ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] resx, nobs
+
+    nobs = np.zeros_like(out)
+    resx = np.empty_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+"""
+
+group_nth_bin_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[%(c_type)s, ndim=2] values,
+                  ndarray[int64_t] bins, int64_t rank):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] resx, nobs
+
+    nobs = np.zeros_like(out)
+    resx = np.empty_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                if nobs[b, j] == rank:
+                    resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+"""
+
+group_nth_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[%(c_type)s, ndim=2] values,
+              ndarray[int64_t] labels, int64_t rank):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+"""
+
+group_add_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[%(c_type)s, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j]
+"""
+
+group_add_bin_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_add_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[%(dest_type2)s, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b, nbins
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j]
+"""
+
+group_prod_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[%(c_type)s, ndim=2] values,
+               ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] prodx, nobs
+
+    nobs = np.zeros_like(out)
+    prodx = np.ones_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    prodx[lab, j] *= val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                prodx[lab, 0] *= val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = prodx[i, j]
+"""
+
+group_prod_bin_template = """@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_prod_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[%(dest_type2)s, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] prodx, nobs
+
+    nobs = np.zeros_like(out)
+    prodx = np.ones_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    prodx[b, j] *= val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                prodx[b, 0] *= val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = prodx[i, j]
+"""
+
+group_var_template = """@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[%(dest_type2)s, ndim=2] values,
+              ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        %(dest_type2)s val, ct
+        ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+    sumxx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+                    sumxx[lab, j] += val * val
+    else:
+        for i in range(N):
+
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+                sumxx[lab, 0] += val * val
+
+
+    for i in range(len(counts)):
+        for j in range(K):
+            ct = nobs[i, j]
+            if ct < 2:
+                out[i, j] = nan
+            else:
+                out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
+                             (ct * ct - ct))
+"""
+
+group_var_bin_template = """@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_var_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[%(dest_type2)s, ndim=2] values,
+                  ndarray[int64_t] bins):
+
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        %(dest_type2)s val, ct
+        ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+    sumxx = np.zeros_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+                    sumxx[b, j] += val * val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+                sumxx[b, 0] += val * val
+
+    for i in range(ngroups):
+        for j in range(K):
+            ct = nobs[i, j]
+            if ct < 2:
+                out[i, j] = nan
+            else:
+                out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
+                             (ct * ct - ct))
+"""
+
+# add passing bin edges, instead of labels
+
+
+#----------------------------------------------------------------------
+# group_min, group_max
+
+group_min_bin_template = """@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[%(dest_type2)s, ndim=2] values,
+                   ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] minx, nobs
+
+    nobs = np.zeros_like(out)
+
+    minx = np.empty_like(out)
+    minx.fill(np.inf)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    if val < minx[b, j]:
+                        minx[b, j] = val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                if val < minx[b, 0]:
+                    minx[b, 0] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = minx[i, j]
+"""
+
+group_max_template = """@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[%(dest_type2)s, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] maxx, nobs
+
+    nobs = np.zeros_like(out)
+
+    maxx = np.empty_like(out)
+    maxx.fill(-np.inf)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    if val > maxx[lab, j]:
+                        maxx[lab, j] = val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                if val > maxx[lab, 0]:
+                    maxx[lab, 0] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = maxx[i, j]
+"""
+
+group_max_bin_template = """@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[%(dest_type2)s, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] maxx, nobs
+
+    nobs = np.zeros_like(out)
+    maxx = np.empty_like(out)
+    maxx.fill(-np.inf)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    if val > maxx[b, j]:
+                        maxx[b, j] = val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                if val > maxx[b, 0]:
+                    maxx[b, 0] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = maxx[i, j]
+"""
+
+
+group_min_template = """@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[%(dest_type2)s, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] minx, nobs
+
+    nobs = np.zeros_like(out)
+
+    minx = np.empty_like(out)
+    minx.fill(np.inf)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    if val < minx[lab, j]:
+                        minx[lab, j] = val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                if val < minx[lab, 0]:
+                    minx[lab, 0] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = minx[i, j]
+"""
+
+
+group_mean_template = """@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[%(dest_type2)s, ndim=2] values,
+               ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            count = nobs[i, j]
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j] / count
+"""
+
+group_mean_bin_template = """
+def group_mean_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[%(dest_type2)s, ndim=2] values,
+                   ndarray[int64_t] bins):
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        %(dest_type2)s val, count
+        ndarray[%(dest_type2)s, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+
+    for i in range(ngroups):
+        for j in range(K):
+            count = nobs[i, j]
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j] / count
+"""
+
+group_ohlc_template = """@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[%(dest_type2)s, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        %(dest_type2)s val, count
+        %(dest_type2)s vopen, vhigh, vlow, vclose, NA
+        bint got_first = 0
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    if out.shape[1] != 4:
+        raise ValueError('Output array must have 4 columns')
+
+    NA = np.nan
+
+    b = 0
+    if K > 1:
+        raise NotImplementedError
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                if not got_first:
+                    out[b, 0] = NA
+                    out[b, 1] = NA
+                    out[b, 2] = NA
+                    out[b, 3] = NA
+                else:
+                    out[b, 0] = vopen
+                    out[b, 1] = vhigh
+                    out[b, 2] = vlow
+                    out[b, 3] = vclose
+                b += 1
+                got_first = 0
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                if not got_first:
+                    got_first = 1
+                    vopen = val
+                    vlow = val
+                    vhigh = val
+                else:
+                    if val < vlow:
+                        vlow = val
+                    if val > vhigh:
+                        vhigh = val
+                vclose = val
+
+        if not got_first:
+            out[b, 0] = NA
+            out[b, 1] = NA
+            out[b, 2] = NA
+            out[b, 3] = NA
+        else:
+            out[b, 0] = vopen
+            out[b, 1] = vhigh
+            out[b, 2] = vlow
+            out[b, 3] = vclose
+"""
+
 arrmap_template = """@cython.wraparound(False)
 @cython.boundscheck(False)
 def arrmap_%(name)s(ndarray[%(c_type)s] index, object func):
@@ -1100,6 +2121,9 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left,
 
 ensure_functions = [
     ('float64', 'FLOAT64', 'float64'),
+    ('float32', 'FLOAT32', 'float32'),
+    ('int8', 'INT8', 'int8'),
+    ('int16', 'INT16', 'int16'),
     ('int32', 'INT32', 'int32'),
     ('int64', 'INT64', 'int64'),
     # ('platform_int', 'INT', 'int_'),
@@ -1133,26 +2157,30 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values,
 #-------------------------------------------------------------------------
 # Generators
 
-def generate_put_functions():
-    function_list = [
-        ('float64', 'float64_t', 'object'),
-        ('float64', 'float64_t', 'float64_t'),
-        ('object', 'object', 'object'),
-        ('int32', 'int32_t', 'int64_t'),
-        ('int32', 'int32_t', 'float64_t'),
-        ('int32', 'int32_t', 'object'),
-        ('int64', 'int64_t', 'int64_t'),
-        ('int64', 'int64_t', 'float64_t'),
-        ('int64', 'int64_t', 'object'),
-        ('bool', 'uint8_t', 'uint8_t'),
-        ('bool', 'uint8_t', 'object')
-    ]
+def generate_put_template(template, use_ints = True, use_floats = True):
+    floats_list = [
+        ('float64', 'float64_t', 'float64_t', 'np.float64'),
+        ('float32', 'float32_t', 'float32_t', 'np.float32'),
+        ]
+    ints_list = [
+        ('int8',  'int8_t',  'float32_t', 'np.float32'),
+        ('int16', 'int16_t', 'float32_t', 'np.float32'),
+        ('int32', 'int32_t', 'float64_t', 'np.float64'),
+        ('int64', 'int64_t', 'float64_t', 'np.float64'),
+        ]
+    function_list = []
+    if use_floats:
+        function_list.extend(floats_list)
+    if use_ints:
+        function_list.extend(ints_list)
 
     output = StringIO()
-    for name, c_type, dest_type in function_list:
-        func = put2d_template % {'name' : name, 'c_type' : c_type,
-                                 'dest_type' : dest_type.replace('_t', ''),
-                                 'dest_type2' : dest_type}
+    for name, c_type, dest_type, dest_dtype in function_list:
+        func = template % {'name' : name, 
+                           'c_type' : c_type,
+                           'dest_type' : dest_type.replace('_t', ''),
+                           'dest_type2' : dest_type,
+                           'dest_dtype' : dest_dtype}
         output.write(func)
     return output.getvalue()
 
@@ -1160,10 +2188,13 @@ def generate_put_functions():
 # name, ctype, capable of holding NA
 function_list = [
     ('float64', 'float64_t', 'np.float64', True),
-    ('object', 'object', 'object', True),
+    ('float32', 'float32_t', 'np.float32', True),
+    ('object','object',  'object',   True),
+    ('int8',  'int8_t',  'np.int8',  False),
+    ('int16', 'int16_t', 'np.int16', False),
     ('int32', 'int32_t', 'np.int32', False),
     ('int64', 'int64_t', 'np.int64', False),
-    ('bool', 'uint8_t', 'np.bool', False)
+    ('bool',  'uint8_t', 'np.bool',  False)
 ]
 
 def generate_from_template(template, ndim=1, exclude=None):
@@ -1178,6 +2209,25 @@ def generate_from_template(template, ndim=1, exclude=None):
         output.write(func)
     return output.getvalue()
 
+put_2d = [diff_2d_template]
+groupbys = [group_last_template,
+            group_last_bin_template,
+            group_nth_template,
+            group_nth_bin_template,
+            group_add_template,
+            group_add_bin_template,
+            group_prod_template,
+            group_prod_bin_template,
+            group_var_template,
+            group_var_bin_template,
+            group_mean_template,
+            group_mean_bin_template,
+            group_min_template,
+            group_min_bin_template,
+            group_max_template,
+            group_max_bin_template,
+            group_ohlc_template]
+
 templates_1d = [map_indices_template,
                 pad_template,
                 backfill_template,
@@ -1211,6 +2261,12 @@ def generate_take_cython_file(path='generated.pyx'):
         for template in templates_2d:
             print >> f, generate_from_template(template, ndim=2)
 
+        for template in put_2d:
+            print >> f, generate_put_template(template)
+
+        for template in groupbys:
+            print >> f, generate_put_template(template, use_ints = False)
+
         # for template in templates_1d_datetime:
         #     print >> f, generate_from_template_datetime(template)
 
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
index 5ecd8439a13ec..a20fb5668aec9 100644
--- a/pandas/src/generated.pyx
+++ b/pandas/src/generated.pyx
@@ -57,6 +57,36 @@ cpdef ensure_float64(object arr):
         return np.array(arr, dtype=np.float64)
 
 
+cpdef ensure_float32(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_FLOAT32:
+            return arr
+        else:
+            return arr.astype(np.float32)
+    else:
+        return np.array(arr, dtype=np.float32)
+
+
+cpdef ensure_int8(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT8:
+            return arr
+        else:
+            return arr.astype(np.int8)
+    else:
+        return np.array(arr, dtype=np.int8)
+
+
+cpdef ensure_int16(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT16:
+            return arr
+        else:
+            return arr.astype(np.int16)
+    else:
+        return np.array(arr, dtype=np.int16)
+
+
 cpdef ensure_int32(object arr):
     if util.is_array(arr):
         if (<ndarray> arr).descr.type_num == NPY_INT32:
@@ -109,6 +139,28 @@ cpdef map_indices_float64(ndarray[float64_t] index):
 
     return result
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_float32(ndarray[float32_t] index):
+    '''
+    Produce a dict mapping the values of the input array to their respective
+    locations.
+
+    Example:
+        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
+
+    Better to do this with Cython because of the enormous speed boost.
+    '''
+    cdef Py_ssize_t i, length
+    cdef dict result = {}
+
+    length = len(index)
+
+    for i in range(length):
+        result[index[i]] = i
+
+    return result
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef map_indices_object(ndarray[object] index):
@@ -131,6 +183,50 @@ cpdef map_indices_object(ndarray[object] index):
 
     return result
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_int8(ndarray[int8_t] index):
+    '''
+    Produce a dict mapping the values of the input array to their respective
+    locations.
+
+    Example:
+        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
+
+    Better to do this with Cython because of the enormous speed boost.
+    '''
+    cdef Py_ssize_t i, length
+    cdef dict result = {}
+
+    length = len(index)
+
+    for i in range(length):
+        result[index[i]] = i
+
+    return result
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_int16(ndarray[int16_t] index):
+    '''
+    Produce a dict mapping the values of the input array to their respective
+    locations.
+
+    Example:
+        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
+
+    Better to do this with Cython because of the enormous speed boost.
+    '''
+    cdef Py_ssize_t i, length
+    cdef dict result = {}
+
+    length = len(index)
+
+    for i in range(length):
+        result[index[i]] = i
+
+    return result
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef map_indices_int32(ndarray[int32_t] index):
@@ -259,6 +355,67 @@ def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new,
 
     return indexer
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new,
+                   limit=None):
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef float32_t cur, next
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+
+    if limit is None:
+        lim = nright
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+        return indexer
+
+    i = j = 0
+
+    cur = old[0]
+
+    while j <= nright - 1 and new[j] < cur:
+        j += 1
+
+    while True:
+        if j == nright:
+            break
+
+        if i == nleft - 1:
+            while j < nright:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] > cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j += 1
+            break
+
+        next = old[i + 1]
+
+        while j < nright and cur <= new[j] < next:
+            if new[j] == cur:
+                indexer[j] = i
+            elif fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j += 1
+
+        fill_count = 0
+        i += 1
+        cur = next
+
+    return indexer
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def pad_object(ndarray[object] old, ndarray[object] new,
@@ -322,11 +479,11 @@ def pad_object(ndarray[object] old, ndarray[object] new,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new,
+def pad_int8(ndarray[int8_t] old, ndarray[int8_t] new,
                    limit=None):
     cdef Py_ssize_t i, j, nleft, nright
     cdef ndarray[int64_t, ndim=1] indexer
-    cdef int32_t cur, next
+    cdef int8_t cur, next
     cdef int lim, fill_count = 0
 
     nleft = len(old)
@@ -383,11 +540,11 @@ def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new,
+def pad_int16(ndarray[int16_t] old, ndarray[int16_t] new,
                    limit=None):
     cdef Py_ssize_t i, j, nleft, nright
     cdef ndarray[int64_t, ndim=1] indexer
-    cdef int64_t cur, next
+    cdef int16_t cur, next
     cdef int lim, fill_count = 0
 
     nleft = len(old)
@@ -444,11 +601,11 @@ def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
+def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new,
                    limit=None):
     cdef Py_ssize_t i, j, nleft, nright
     cdef ndarray[int64_t, ndim=1] indexer
-    cdef uint8_t cur, next
+    cdef int32_t cur, next
     cdef int lim, fill_count = 0
 
     nleft = len(old)
@@ -503,14 +660,13 @@ def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
 
     return indexer
 
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new,
-                      limit=None):
+def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new,
+                   limit=None):
     cdef Py_ssize_t i, j, nleft, nright
     cdef ndarray[int64_t, ndim=1] indexer
-    cdef float64_t cur, prev
+    cdef int64_t cur, next
     cdef int lim, fill_count = 0
 
     nleft = len(old)
@@ -525,54 +681,53 @@ def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
         return indexer
 
-    i = nleft - 1
-    j = nright - 1
+    i = j = 0
 
-    cur = old[nleft - 1]
+    cur = old[0]
 
-    while j >= 0 and new[j] > cur:
-        j -= 1
+    while j <= nright - 1 and new[j] < cur:
+        j += 1
 
     while True:
-        if j < 0:
+        if j == nright:
             break
 
-        if i == 0:
-            while j >= 0:
+        if i == nleft - 1:
+            while j < nright:
                 if new[j] == cur:
                     indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
+                elif new[j] > cur and fill_count < lim:
                     indexer[j] = i
                     fill_count += 1
-                j -= 1
+                j += 1
             break
 
-        prev = old[i - 1]
+        next = old[i + 1]
 
-        while j >= 0 and prev < new[j] <= cur:
+        while j < nright and cur <= new[j] < next:
             if new[j] == cur:
                 indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
+            elif fill_count < lim:
                 indexer[j] = i
                 fill_count += 1
-            j -= 1
+            j += 1
 
         fill_count = 0
-        i -= 1
-        cur = prev
+        i += 1
+        cur = next
 
     return indexer
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_object(ndarray[object] old, ndarray[object] new,
-                      limit=None):
+def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
+                   limit=None):
     cdef Py_ssize_t i, j, nleft, nright
     cdef ndarray[int64_t, ndim=1] indexer
-    cdef object cur, prev
+    cdef uint8_t cur, next
     cdef int lim, fill_count = 0
 
     nleft = len(old)
@@ -587,54 +742,54 @@ def backfill_object(ndarray[object] old, ndarray[object] new,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
         return indexer
 
-    i = nleft - 1
-    j = nright - 1
+    i = j = 0
 
-    cur = old[nleft - 1]
+    cur = old[0]
 
-    while j >= 0 and new[j] > cur:
-        j -= 1
+    while j <= nright - 1 and new[j] < cur:
+        j += 1
 
     while True:
-        if j < 0:
+        if j == nright:
             break
 
-        if i == 0:
-            while j >= 0:
+        if i == nleft - 1:
+            while j < nright:
                 if new[j] == cur:
                     indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
+                elif new[j] > cur and fill_count < lim:
                     indexer[j] = i
                     fill_count += 1
-                j -= 1
+                j += 1
             break
 
-        prev = old[i - 1]
+        next = old[i + 1]
 
-        while j >= 0 and prev < new[j] <= cur:
+        while j < nright and cur <= new[j] < next:
             if new[j] == cur:
                 indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
+            elif fill_count < lim:
                 indexer[j] = i
                 fill_count += 1
-            j -= 1
+            j += 1
 
         fill_count = 0
-        i -= 1
-        cur = prev
+        i += 1
+        cur = next
 
     return indexer
 
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new,
+def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new,
                       limit=None):
     cdef Py_ssize_t i, j, nleft, nright
     cdef ndarray[int64_t, ndim=1] indexer
-    cdef int32_t cur, prev
+    cdef float64_t cur, prev
     cdef int lim, fill_count = 0
 
     nleft = len(old)
@@ -692,11 +847,11 @@ def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new,
+def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new,
                       limit=None):
     cdef Py_ssize_t i, j, nleft, nright
     cdef ndarray[int64_t, ndim=1] indexer
-    cdef int64_t cur, prev
+    cdef float32_t cur, prev
     cdef int lim, fill_count = 0
 
     nleft = len(old)
@@ -754,11 +909,11 @@ def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
+def backfill_object(ndarray[object] old, ndarray[object] new,
                       limit=None):
     cdef Py_ssize_t i, j, nleft, nright
     cdef ndarray[int64_t, ndim=1] indexer
-    cdef uint8_t cur, prev
+    cdef object cur, prev
     cdef int lim, fill_count = 0
 
     nleft = len(old)
@@ -814,192 +969,332 @@ def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
 
     return indexer
 
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_inplace_float64(ndarray[float64_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef float64_t val
+def backfill_int8(ndarray[int8_t] old, ndarray[int8_t] new,
+                      limit=None):
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef int8_t cur, prev
     cdef int lim, fill_count = 0
 
-    N = len(values)
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
 
     if limit is None:
-        lim = N
+        lim = nright
     else:
         if limit < 0:
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_object(ndarray[object] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef object val
-    cdef int lim, fill_count = 0
+    i = nleft - 1
+    j = nright - 1
 
-    N = len(values)
+    cur = old[nleft - 1]
 
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
+    while j >= 0 and new[j] > cur:
+        j -= 1
 
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
+    while True:
+        if j < 0:
+            break
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_int32(ndarray[int32_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef int32_t val
-    cdef int lim, fill_count = 0
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
 
-    N = len(values)
+        prev = old[i - 1]
 
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
 
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_inplace_int64(ndarray[int64_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef int64_t val
+def backfill_int16(ndarray[int16_t] old, ndarray[int16_t] new,
+                      limit=None):
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef int16_t cur, prev
     cdef int lim, fill_count = 0
 
-    N = len(values)
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
 
     if limit is None:
-        lim = N
+        lim = nright
     else:
         if limit < 0:
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
+
+    i = nleft - 1
+    j = nright - 1
+
+    cur = old[nleft - 1]
+
+    while j >= 0 and new[j] > cur:
+        j -= 1
+
+    while True:
+        if j < 0:
+            break
+
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]
+
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_inplace_bool(ndarray[uint8_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef uint8_t val
+def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new,
+                      limit=None):
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef int32_t cur, prev
     cdef int lim, fill_count = 0
 
-    N = len(values)
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
 
     if limit is None:
-        lim = N
+        lim = nright
     else:
         if limit < 0:
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
+
+    i = nleft - 1
+    j = nright - 1
+
+    cur = old[nleft - 1]
+
+    while j >= 0 and new[j] > cur:
+        j -= 1
 
+    while True:
+        if j < 0:
+            break
+
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]
+
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_inplace_float64(ndarray[float64_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef float64_t val
+def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new,
+                      limit=None):
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef int64_t cur, prev
     cdef int lim, fill_count = 0
 
-    N = len(values)
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
 
     if limit is None:
-        lim = N
+        lim = nright
     else:
         if limit < 0:
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_inplace_object(ndarray[object] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef object val
-    cdef int lim, fill_count = 0
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
+
+    i = nleft - 1
+    j = nright - 1
+
+    cur = old[nleft - 1]
+
+    while j >= 0 and new[j] > cur:
+        j -= 1
+
+    while True:
+        if j < 0:
+            break
+
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]
+
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
+                      limit=None):
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef uint8_t cur, prev
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+
+    if limit is None:
+        lim = nright
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
+
+    i = nleft - 1
+    j = nright - 1
+
+    cur = old[nleft - 1]
+
+    while j >= 0 and new[j] > cur:
+        j -= 1
+
+    while True:
+        if j < 0:
+            break
+
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]
+
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_float64(ndarray[float64_t] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
+    cdef Py_ssize_t i, N
+    cdef float64_t val
+    cdef int lim, fill_count = 0
 
     N = len(values)
 
+    # GH 2778
+    if N == 0:
+        return
+
     if limit is None:
         lim = N
     else:
@@ -1007,8 +1302,8 @@ def backfill_inplace_object(ndarray[object] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
+    val = values[0]
+    for i in range(N):
         if mask[i]:
             if fill_count >= lim:
                 continue
@@ -1017,17 +1312,22 @@ def backfill_inplace_object(ndarray[object] values,
         else:
             fill_count = 0
             val = values[i]
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_inplace_int32(ndarray[int32_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
+def pad_inplace_float32(ndarray[float32_t] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
     cdef Py_ssize_t i, N
-    cdef int32_t val
+    cdef float32_t val
     cdef int lim, fill_count = 0
 
     N = len(values)
 
+    # GH 2778
+    if N == 0:
+        return
+
     if limit is None:
         lim = N
     else:
@@ -1035,8 +1335,8 @@ def backfill_inplace_int32(ndarray[int32_t] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
+    val = values[0]
+    for i in range(N):
         if mask[i]:
             if fill_count >= lim:
                 continue
@@ -1045,17 +1345,22 @@ def backfill_inplace_int32(ndarray[int32_t] values,
         else:
             fill_count = 0
             val = values[i]
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_inplace_int64(ndarray[int64_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
+def pad_inplace_object(ndarray[object] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
     cdef Py_ssize_t i, N
-    cdef int64_t val
+    cdef object val
     cdef int lim, fill_count = 0
 
     N = len(values)
 
+    # GH 2778
+    if N == 0:
+        return
+
     if limit is None:
         lim = N
     else:
@@ -1063,8 +1368,8 @@ def backfill_inplace_int64(ndarray[int64_t] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
+    val = values[0]
+    for i in range(N):
         if mask[i]:
             if fill_count >= lim:
                 continue
@@ -1073,17 +1378,22 @@ def backfill_inplace_int64(ndarray[int64_t] values,
         else:
             fill_count = 0
             val = values[i]
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_inplace_bool(ndarray[uint8_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
+def pad_inplace_int8(ndarray[int8_t] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
     cdef Py_ssize_t i, N
-    cdef uint8_t val
+    cdef int8_t val
     cdef int lim, fill_count = 0
 
     N = len(values)
 
+    # GH 2778
+    if N == 0:
+        return
+
     if limit is None:
         lim = N
     else:
@@ -1091,8 +1401,8 @@ def backfill_inplace_bool(ndarray[uint8_t] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
+    val = values[0]
+    for i in range(N):
         if mask[i]:
             if fill_count >= lim:
                 continue
@@ -1104,14 +1414,18 @@ def backfill_inplace_bool(ndarray[uint8_t] values,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef float64_t val
+def pad_inplace_int16(ndarray[int16_t] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
+    cdef Py_ssize_t i, N
+    cdef int16_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1120,28 +1434,31 @@ def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
+    val = values[0]
+    for i in range(N):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_2d_inplace_object(ndarray[object, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef object val
+def pad_inplace_int32(ndarray[int32_t] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
+    cdef Py_ssize_t i, N
+    cdef int32_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1150,28 +1467,31 @@ def pad_2d_inplace_object(ndarray[object, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
+    val = values[0]
+    for i in range(N):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef int32_t val
+def pad_inplace_int64(ndarray[int64_t] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
+    cdef Py_ssize_t i, N
+    cdef int64_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1180,28 +1500,31 @@ def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef int64_t val
+    val = values[0]
+    for i in range(N):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_bool(ndarray[uint8_t] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
+    cdef Py_ssize_t i, N
+    cdef uint8_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1210,28 +1533,32 @@ def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
+    val = values[0]
+    for i in range(N):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef uint8_t val
+def backfill_inplace_float64(ndarray[float64_t] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    cdef Py_ssize_t i, N
+    cdef float64_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1240,29 +1567,30 @@ def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
+    val = values[N - 1]
+    for i in range(N - 1, -1 , -1):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef float64_t val
+def backfill_inplace_float32(ndarray[float32_t] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    cdef Py_ssize_t i, N
+    cdef float32_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1271,28 +1599,30 @@ def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
+    val = values[N - 1]
+    for i in range(N - 1, -1 , -1):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_2d_inplace_object(ndarray[object, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
+def backfill_inplace_object(ndarray[object] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    cdef Py_ssize_t i, N
     cdef object val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1301,28 +1631,30 @@ def backfill_2d_inplace_object(ndarray[object, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
+    val = values[N - 1]
+    for i in range(N - 1, -1 , -1):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef int32_t val
+def backfill_inplace_int8(ndarray[int8_t] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    cdef Py_ssize_t i, N
+    cdef int8_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1331,28 +1663,30 @@ def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
+    val = values[N - 1]
+    for i in range(N - 1, -1 , -1):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef int64_t val
+def backfill_inplace_int16(ndarray[int16_t] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    cdef Py_ssize_t i, N
+    cdef int16_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1361,28 +1695,30 @@ def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
+    val = values[N - 1]
+    for i in range(N - 1, -1 , -1):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef uint8_t val
+def backfill_inplace_int32(ndarray[int32_t] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    cdef Py_ssize_t i, N
+    cdef int32_t val
     cdef int lim, fill_count = 0
 
-    K, N = (<object> values).shape
+    N = len(values)
+
+    # GH 2778
+    if N == 0:
+        return
 
     if limit is None:
         lim = N
@@ -1391,533 +1727,640 @@ def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,
             raise ValueError('Limit must be non-negative')
         lim = limit
 
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
+    val = values[N - 1]
+    for i in range(N - 1, -1 , -1):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
+@cython.boundscheck(False)
 @cython.wraparound(False)
-def take_1d_float64(ndarray[float64_t] values,
-                     ndarray[int64_t] indexer,
-                     out=None, fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        ndarray[float64_t] outbuf
-        float64_t fv
+def backfill_inplace_int64(ndarray[int64_t] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    cdef Py_ssize_t i, N
+    cdef int64_t val
+    cdef int lim, fill_count = 0
 
-    n = len(indexer)
+    N = len(values)
 
-    if out is None:
-        outbuf = np.empty(n, dtype=values.dtype)
-    else:
-        outbuf = out
+    # GH 2778
+    if N == 0:
+        return
 
-    if False and _checknan(fill_value):
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                raise ValueError('No NA values allowed')
-            else:
-                outbuf[i] = values[idx]
+    if limit is None:
+        lim = N
     else:
-        fv = fill_value
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                outbuf[i] = fv
-            else:
-                outbuf[i] = values[idx]
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
+    val = values[N - 1]
+    for i in range(N - 1, -1 , -1):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
+@cython.boundscheck(False)
 @cython.wraparound(False)
-def take_1d_object(ndarray[object] values,
-                     ndarray[int64_t] indexer,
-                     out=None, fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        ndarray[object] outbuf
-        object fv
+def backfill_inplace_bool(ndarray[uint8_t] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    cdef Py_ssize_t i, N
+    cdef uint8_t val
+    cdef int lim, fill_count = 0
 
-    n = len(indexer)
+    N = len(values)
 
-    if out is None:
-        outbuf = np.empty(n, dtype=values.dtype)
-    else:
-        outbuf = out
+    # GH 2778
+    if N == 0:
+        return
 
-    if False and _checknan(fill_value):
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                raise ValueError('No NA values allowed')
-            else:
-                outbuf[i] = values[idx]
+    if limit is None:
+        lim = N
     else:
-        fv = fill_value
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                outbuf[i] = fv
-            else:
-                outbuf[i] = values[idx]
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    val = values[N - 1]
+    for i in range(N - 1, -1 , -1):
+        if mask[i]:
+            if fill_count >= lim:
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0
+            val = values[i]
 
+@cython.boundscheck(False)
 @cython.wraparound(False)
-def take_1d_int32(ndarray[int32_t] values,
-                     ndarray[int64_t] indexer,
-                     out=None, fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        ndarray[int32_t] outbuf
-        int32_t fv
+def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef float64_t val
+    cdef int lim, fill_count = 0
 
-    n = len(indexer)
+    K, N = (<object> values).shape
 
-    if out is None:
-        outbuf = np.empty(n, dtype=values.dtype)
-    else:
-        outbuf = out
+    # GH 2778
+    if N == 0:
+        return
 
-    if True and _checknan(fill_value):
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                raise ValueError('No NA values allowed')
-            else:
-                outbuf[i] = values[idx]
+    if limit is None:
+        lim = N
     else:
-        fv = fill_value
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                outbuf[i] = fv
-            else:
-                outbuf[i] = values[idx]
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
+@cython.boundscheck(False)
 @cython.wraparound(False)
-def take_1d_int64(ndarray[int64_t] values,
-                     ndarray[int64_t] indexer,
-                     out=None, fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        ndarray[int64_t] outbuf
-        int64_t fv
+def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef float32_t val
+    cdef int lim, fill_count = 0
 
-    n = len(indexer)
+    K, N = (<object> values).shape
 
-    if out is None:
-        outbuf = np.empty(n, dtype=values.dtype)
-    else:
-        outbuf = out
+    # GH 2778
+    if N == 0:
+        return
 
-    if True and _checknan(fill_value):
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                raise ValueError('No NA values allowed')
-            else:
-                outbuf[i] = values[idx]
+    if limit is None:
+        lim = N
     else:
-        fv = fill_value
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                outbuf[i] = fv
-            else:
-                outbuf[i] = values[idx]
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
+@cython.boundscheck(False)
 @cython.wraparound(False)
-def take_1d_bool(ndarray[uint8_t] values,
-                     ndarray[int64_t] indexer,
-                     out=None, fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        ndarray[uint8_t] outbuf
-        uint8_t fv
+def pad_2d_inplace_object(ndarray[object, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef object val
+    cdef int lim, fill_count = 0
 
-    n = len(indexer)
+    K, N = (<object> values).shape
 
-    if out is None:
-        outbuf = np.empty(n, dtype=values.dtype)
-    else:
-        outbuf = out
+    # GH 2778
+    if N == 0:
+        return
 
-    if True and _checknan(fill_value):
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                raise ValueError('No NA values allowed')
-            else:
-                outbuf[i] = values[idx]
+    if limit is None:
+        lim = N
     else:
-        fv = fill_value
-        for i in range(n):
-            idx = indexer[i]
-            if idx == -1:
-                outbuf[i] = fv
-            else:
-                outbuf[i] = values[idx]
-
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def is_monotonic_float64(ndarray[float64_t] arr):
-    '''
-    Returns
-    -------
-    is_monotonic, is_unique
-    '''
-    cdef:
-        Py_ssize_t i, n
-        float64_t prev, cur
-        bint is_unique = 1
-
-    n = len(arr)
+def pad_2d_inplace_int8(ndarray[int8_t, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef int8_t val
+    cdef int lim, fill_count = 0
 
-    if n < 2:
-        return True, True
+    K, N = (<object> values).shape
 
-    prev = arr[0]
-    for i in range(1, n):
-        cur = arr[i]
-        if cur < prev:
-            return False, None
-        elif cur == prev:
-            is_unique = 0
-        prev = cur
-    return True, is_unique
+    # GH 2778
+    if N == 0:
+        return
+
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def is_monotonic_object(ndarray[object] arr):
-    '''
-    Returns
-    -------
-    is_monotonic, is_unique
-    '''
-    cdef:
-        Py_ssize_t i, n
-        object prev, cur
-        bint is_unique = 1
+def pad_2d_inplace_int16(ndarray[int16_t, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef int16_t val
+    cdef int lim, fill_count = 0
 
-    n = len(arr)
+    K, N = (<object> values).shape
 
-    if n < 2:
-        return True, True
+    # GH 2778
+    if N == 0:
+        return
 
-    prev = arr[0]
-    for i in range(1, n):
-        cur = arr[i]
-        if cur < prev:
-            return False, None
-        elif cur == prev:
-            is_unique = 0
-        prev = cur
-    return True, is_unique
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def is_monotonic_int32(ndarray[int32_t] arr):
-    '''
-    Returns
-    -------
-    is_monotonic, is_unique
-    '''
-    cdef:
-        Py_ssize_t i, n
-        int32_t prev, cur
-        bint is_unique = 1
+def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef int32_t val
+    cdef int lim, fill_count = 0
 
-    n = len(arr)
+    K, N = (<object> values).shape
 
-    if n < 2:
-        return True, True
+    # GH 2778
+    if N == 0:
+        return
 
-    prev = arr[0]
-    for i in range(1, n):
-        cur = arr[i]
-        if cur < prev:
-            return False, None
-        elif cur == prev:
-            is_unique = 0
-        prev = cur
-    return True, is_unique
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def is_monotonic_int64(ndarray[int64_t] arr):
-    '''
-    Returns
-    -------
-    is_monotonic, is_unique
-    '''
-    cdef:
-        Py_ssize_t i, n
-        int64_t prev, cur
-        bint is_unique = 1
+def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef int64_t val
+    cdef int lim, fill_count = 0
 
-    n = len(arr)
+    K, N = (<object> values).shape
 
-    if n < 2:
-        return True, True
+    # GH 2778
+    if N == 0:
+        return
 
-    prev = arr[0]
-    for i in range(1, n):
-        cur = arr[i]
-        if cur < prev:
-            return False, None
-        elif cur == prev:
-            is_unique = 0
-        prev = cur
-    return True, is_unique
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def is_monotonic_bool(ndarray[uint8_t] arr):
-    '''
-    Returns
-    -------
-    is_monotonic, is_unique
-    '''
-    cdef:
-        Py_ssize_t i, n
-        uint8_t prev, cur
-        bint is_unique = 1
+def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef uint8_t val
+    cdef int lim, fill_count = 0
 
-    n = len(arr)
+    K, N = (<object> values).shape
 
-    if n < 2:
-        return True, True
+    # GH 2778
+    if N == 0:
+        return
 
-    prev = arr[0]
-    for i in range(1, n):
-        cur = arr[i]
-        if cur < prev:
-            return False, None
-        elif cur == prev:
-            is_unique = 0
-        prev = cur
-    return True, is_unique
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 
-@cython.wraparound(False)
 @cython.boundscheck(False)
-def groupby_float64(ndarray[float64_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
+@cython.wraparound(False)
+def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef float64_t val
+    cdef int lim, fill_count = 0
 
-    length = len(index)
+    K, N = (<object> values).shape
 
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
+    # GH 2778
+    if N == 0:
+        return
 
-        if _checknull(key):
-            continue
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1 , -1):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
-def groupby_object(ndarray[object] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
+@cython.wraparound(False)
+def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef float32_t val
+    cdef int lim, fill_count = 0
 
-        if _checknull(key):
-            continue
+    K, N = (<object> values).shape
 
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
+    # GH 2778
+    if N == 0:
+        return
 
-    return result
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
-@cython.wraparound(False)
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1 , -1):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
-def groupby_int32(ndarray[int32_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
+@cython.wraparound(False)
+def backfill_2d_inplace_object(ndarray[object, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef object val
+    cdef int lim, fill_count = 0
 
-        if _checknull(key):
-            continue
+    K, N = (<object> values).shape
 
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
+    # GH 2778
+    if N == 0:
+        return
 
-    return result
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
-@cython.wraparound(False)
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1 , -1):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
-def groupby_int64(ndarray[int64_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
+@cython.wraparound(False)
+def backfill_2d_inplace_int8(ndarray[int8_t, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef int8_t val
+    cdef int lim, fill_count = 0
 
-        if _checknull(key):
-            continue
+    K, N = (<object> values).shape
 
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
+    # GH 2778
+    if N == 0:
+        return
 
-    return result
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
-@cython.wraparound(False)
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1 , -1):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
-def groupby_bool(ndarray[uint8_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if _checknull(key):
-            continue
+@cython.wraparound(False)
+def backfill_2d_inplace_int16(ndarray[int16_t, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef int16_t val
+    cdef int lim, fill_count = 0
 
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
+    K, N = (<object> values).shape
 
-    return result
+    # GH 2778
+    if N == 0:
+        return
 
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
-@cython.wraparound(False)
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1 , -1):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
-def arrmap_float64(ndarray[float64_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+@cython.wraparound(False)
+def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef int32_t val
+    cdef int lim, fill_count = 0
 
-    from pandas.lib import maybe_convert_objects
+    K, N = (<object> values).shape
 
-    for i in range(length):
-        result[i] = func(index[i])
+    # GH 2778
+    if N == 0:
+        return
 
-    return maybe_convert_objects(result)
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
-@cython.wraparound(False)
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1 , -1):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
-def arrmap_object(ndarray[object] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+@cython.wraparound(False)
+def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef int64_t val
+    cdef int lim, fill_count = 0
 
-    from pandas.lib import maybe_convert_objects
+    K, N = (<object> values).shape
 
-    for i in range(length):
-        result[i] = func(index[i])
+    # GH 2778
+    if N == 0:
+        return
 
-    return maybe_convert_objects(result)
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_int32(ndarray[int32_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
-@cython.wraparound(False)
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1 , -1):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 @cython.boundscheck(False)
-def arrmap_int64(ndarray[int64_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_bool(ndarray[uint8_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    cdef Py_ssize_t i, j, N, K
+    cdef uint8_t val
+    cdef int lim, fill_count = 0
 
-    from pandas.lib import maybe_convert_objects
+    K, N = (<object> values).shape
 
-    for i in range(length):
-        result[i] = func(index[i])
+    # GH 2778
+    if N == 0:
+        return
 
-    return maybe_convert_objects(result)
+    if limit is None:
+        lim = N
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
 
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1 , -1):
+            if mask[j, i]:
+                if fill_count >= lim:
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0
+                val = values[j, i]
 
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+def take_1d_float64(ndarray[float64_t] values,
+                     ndarray[int64_t] indexer,
+                     out=None, fill_value=np.nan):
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[float64_t, ndim=2] outbuf
+        Py_ssize_t i, n, idx
+        ndarray[float64_t] outbuf
         float64_t fv
 
     n = len(indexer)
-    k = values.shape[1]
 
     if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
+        outbuf = np.empty(n, dtype=values.dtype)
     else:
         outbuf = out
 
@@ -1925,37 +2368,31 @@ def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values,
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j from 0 <= j < k:
-                    raise ValueError('No NA values allowed')
+                raise ValueError('No NA values allowed')
             else:
-                for j from 0 <= j < k:
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
     else:
         fv = fill_value
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                outbuf[i] = fv
             else:
-                for j in range(k):
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
 
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_object(ndarray[object, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+def take_1d_float32(ndarray[float32_t] values,
+                     ndarray[int64_t] indexer,
+                     out=None, fill_value=np.nan):
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[object, ndim=2] outbuf
-        object fv
+        Py_ssize_t i, n, idx
+        ndarray[float32_t] outbuf
+        float32_t fv
 
     n = len(indexer)
-    k = values.shape[1]
 
     if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
+        outbuf = np.empty(n, dtype=values.dtype)
     else:
         outbuf = out
 
@@ -1963,75 +2400,63 @@ def take_2d_axis0_object(ndarray[object, ndim=2] values,
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j from 0 <= j < k:
-                    raise ValueError('No NA values allowed')
+                raise ValueError('No NA values allowed')
             else:
-                for j from 0 <= j < k:
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
     else:
         fv = fill_value
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                outbuf[i] = fv
             else:
-                for j in range(k):
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
 
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+def take_1d_object(ndarray[object] values,
+                     ndarray[int64_t] indexer,
+                     out=None, fill_value=np.nan):
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int32_t, ndim=2] outbuf
-        int32_t fv
+        Py_ssize_t i, n, idx
+        ndarray[object] outbuf
+        object fv
 
     n = len(indexer)
-    k = values.shape[1]
 
     if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
+        outbuf = np.empty(n, dtype=values.dtype)
     else:
         outbuf = out
 
-    if True and _checknan(fill_value):
+    if False and _checknan(fill_value):
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j from 0 <= j < k:
-                    raise ValueError('No NA values allowed')
+                raise ValueError('No NA values allowed')
             else:
-                for j from 0 <= j < k:
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
     else:
         fv = fill_value
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                outbuf[i] = fv
             else:
-                for j in range(k):
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
 
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+def take_1d_int8(ndarray[int8_t] values,
+                     ndarray[int64_t] indexer,
+                     out=None, fill_value=np.nan):
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t, ndim=2] outbuf
-        int64_t fv
+        Py_ssize_t i, n, idx
+        ndarray[int8_t] outbuf
+        int8_t fv
 
     n = len(indexer)
-    k = values.shape[1]
 
     if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
+        outbuf = np.empty(n, dtype=values.dtype)
     else:
         outbuf = out
 
@@ -2039,37 +2464,31 @@ def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values,
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j from 0 <= j < k:
-                    raise ValueError('No NA values allowed')
+                raise ValueError('No NA values allowed')
             else:
-                for j from 0 <= j < k:
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
     else:
         fv = fill_value
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                outbuf[i] = fv
             else:
-                for j in range(k):
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
 
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+def take_1d_int16(ndarray[int16_t] values,
+                     ndarray[int64_t] indexer,
+                     out=None, fill_value=np.nan):
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[uint8_t, ndim=2] outbuf
-        uint8_t fv
+        Py_ssize_t i, n, idx
+        ndarray[int16_t] outbuf
+        int16_t fv
 
     n = len(indexer)
-    k = values.shape[1]
 
     if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
+        outbuf = np.empty(n, dtype=values.dtype)
     else:
         outbuf = out
 
@@ -2077,647 +2496,4928 @@ def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values,
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j from 0 <= j < k:
-                    raise ValueError('No NA values allowed')
+                raise ValueError('No NA values allowed')
             else:
-                for j from 0 <= j < k:
-                    outbuf[i, j] = values[idx, j]
+                outbuf[i] = values[idx]
     else:
         fv = fill_value
         for i in range(n):
             idx = indexer[i]
             if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                outbuf[i] = fv
             else:
-                for j in range(k):
-                    outbuf[i, j] = values[idx, j]
-
+                outbuf[i] = values[idx]
 
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+def take_1d_int32(ndarray[int32_t] values,
+                     ndarray[int64_t] indexer,
+                     out=None, fill_value=np.nan):
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[float64_t, ndim=2] outbuf
-        float64_t fv
+        Py_ssize_t i, n, idx
+        ndarray[int32_t] outbuf
+        int32_t fv
 
-    n = len(values)
-    k = len(indexer)
+    n = len(indexer)
 
     if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
+        outbuf = np.empty(n, dtype=values.dtype)
     else:
         outbuf = out
 
-    if False and _checknan(fill_value):
-        for j in range(k):
-            idx = indexer[j]
-
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
             if idx == -1:
-                for i in range(n):
-                    raise ValueError('No NA values allowed')
+                raise ValueError('No NA values allowed')
             else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
+                outbuf[i] = values[idx]
     else:
         fv = fill_value
-        for j in range(k):
-            idx = indexer[j]
-
+        for i in range(n):
+            idx = indexer[i]
             if idx == -1:
-                for i in range(n):
-                    outbuf[i, j] = fv
+                outbuf[i] = fv
             else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
+                outbuf[i] = values[idx]
 
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_object(ndarray[object, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[object, ndim=2] outbuf
-        object fv
+def take_1d_int64(ndarray[int64_t] values,
+                     ndarray[int64_t] indexer,
+                     out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, n, idx
+        ndarray[int64_t] outbuf
+        int64_t fv
 
-    n = len(values)
-    k = len(indexer)
+    n = len(indexer)
 
     if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
+        outbuf = np.empty(n, dtype=values.dtype)
     else:
         outbuf = out
 
-    if False and _checknan(fill_value):
-        for j in range(k):
-            idx = indexer[j]
-
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
             if idx == -1:
-                for i in range(n):
-                    raise ValueError('No NA values allowed')
+                raise ValueError('No NA values allowed')
             else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
+                outbuf[i] = values[idx]
     else:
         fv = fill_value
-        for j in range(k):
-            idx = indexer[j]
-
+        for i in range(n):
+            idx = indexer[i]
             if idx == -1:
-                for i in range(n):
-                    outbuf[i, j] = fv
+                outbuf[i] = fv
             else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
+                outbuf[i] = values[idx]
 
 @cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+def take_1d_bool(ndarray[uint8_t] values,
+                     ndarray[int64_t] indexer,
+                     out=None, fill_value=np.nan):
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int32_t, ndim=2] outbuf
-        int32_t fv
+        Py_ssize_t i, n, idx
+        ndarray[uint8_t] outbuf
+        uint8_t fv
 
-    n = len(values)
-    k = len(indexer)
+    n = len(indexer)
 
     if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
+        outbuf = np.empty(n, dtype=values.dtype)
     else:
         outbuf = out
 
     if True and _checknan(fill_value):
-        for j in range(k):
-            idx = indexer[j]
-
+        for i in range(n):
+            idx = indexer[i]
             if idx == -1:
-                for i in range(n):
-                    raise ValueError('No NA values allowed')
+                raise ValueError('No NA values allowed')
             else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
+                outbuf[i] = values[idx]
     else:
         fv = fill_value
-        for j in range(k):
-            idx = indexer[j]
-
+        for i in range(n):
+            idx = indexer[i]
             if idx == -1:
-                for i in range(n):
-                    outbuf[i, j] = fv
+                outbuf[i] = fv
             else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
+                outbuf[i] = values[idx]
+
 
-@cython.wraparound(False)
 @cython.boundscheck(False)
-def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+@cython.wraparound(False)
+def is_monotonic_float64(ndarray[float64_t] arr):
+    '''
+    Returns
+    -------
+    is_monotonic, is_unique
+    '''
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t, ndim=2] outbuf
-        int64_t fv
+        Py_ssize_t i, n
+        float64_t prev, cur
+        bint is_unique = 1
 
-    n = len(values)
-    k = len(indexer)
+    n = len(arr)
 
-    if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
-    else:
-        outbuf = out
+    if n < 2:
+        return True, True
 
-    if True and _checknan(fill_value):
-        for j in range(k):
-            idx = indexer[j]
+    prev = arr[0]
+    for i in range(1, n):
+        cur = arr[i]
+        if cur < prev:
+            return False, None
+        elif cur == prev:
+            is_unique = 0
+        prev = cur
+    return True, is_unique
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_float32(ndarray[float32_t] arr):
+    '''
+    Returns
+    -------
+    is_monotonic, is_unique
+    '''
+    cdef:
+        Py_ssize_t i, n
+        float32_t prev, cur
+        bint is_unique = 1
 
-            if idx == -1:
-                for i in range(n):
-                    raise ValueError('No NA values allowed')
-            else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
-    else:
-        fv = fill_value
-        for j in range(k):
-            idx = indexer[j]
+    n = len(arr)
 
-            if idx == -1:
-                for i in range(n):
-                    outbuf[i, j] = fv
-            else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
+    if n < 2:
+        return True, True
 
-@cython.wraparound(False)
+    prev = arr[0]
+    for i in range(1, n):
+        cur = arr[i]
+        if cur < prev:
+            return False, None
+        elif cur == prev:
+            is_unique = 0
+        prev = cur
+    return True, is_unique
 @cython.boundscheck(False)
-def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values,
-                           ndarray[int64_t] indexer,
-                           out=None, fill_value=np.nan):
+@cython.wraparound(False)
+def is_monotonic_object(ndarray[object] arr):
+    '''
+    Returns
+    -------
+    is_monotonic, is_unique
+    '''
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[uint8_t, ndim=2] outbuf
-        uint8_t fv
-
-    n = len(values)
-    k = len(indexer)
+        Py_ssize_t i, n
+        object prev, cur
+        bint is_unique = 1
 
-    if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
-    else:
-        outbuf = out
+    n = len(arr)
 
-    if True and _checknan(fill_value):
-        for j in range(k):
-            idx = indexer[j]
+    if n < 2:
+        return True, True
 
-            if idx == -1:
-                for i in range(n):
-                    raise ValueError('No NA values allowed')
-            else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
-    else:
-        fv = fill_value
-        for j in range(k):
-            idx = indexer[j]
+    prev = arr[0]
+    for i in range(1, n):
+        cur = arr[i]
+        if cur < prev:
+            return False, None
+        elif cur == prev:
+            is_unique = 0
+        prev = cur
+    return True, is_unique
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_int8(ndarray[int8_t] arr):
+    '''
+    Returns
+    -------
+    is_monotonic, is_unique
+    '''
+    cdef:
+        Py_ssize_t i, n
+        int8_t prev, cur
+        bint is_unique = 1
 
-            if idx == -1:
-                for i in range(n):
-                    outbuf[i, j] = fv
-            else:
-                for i in range(n):
-                    outbuf[i, j] = values[i, idx]
+    n = len(arr)
 
+    if n < 2:
+        return True, True
 
-@cython.wraparound(False)
+    prev = arr[0]
+    for i in range(1, n):
+        cur = arr[i]
+        if cur < prev:
+            return False, None
+        elif cur == prev:
+            is_unique = 0
+        prev = cur
+    return True, is_unique
 @cython.boundscheck(False)
-def take_2d_multi_float64(ndarray[float64_t, ndim=2] values,
-                           ndarray[int64_t] idx0,
-                           ndarray[int64_t] idx1,
-                           out=None, fill_value=np.nan):
+@cython.wraparound(False)
+def is_monotonic_int16(ndarray[int16_t] arr):
+    '''
+    Returns
+    -------
+    is_monotonic, is_unique
+    '''
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[float64_t, ndim=2] outbuf
-        float64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
+        Py_ssize_t i, n
+        int16_t prev, cur
+        bint is_unique = 1
 
-    if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
-    else:
-        outbuf = out
+    n = len(arr)
 
+    if n < 2:
+        return True, True
+
+    prev = arr[0]
+    for i in range(1, n):
+        cur = arr[i]
+        if cur < prev:
+            return False, None
+        elif cur == prev:
+            is_unique = 0
+        prev = cur
+    return True, is_unique
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_int32(ndarray[int32_t] arr):
+    '''
+    Returns
+    -------
+    is_monotonic, is_unique
+    '''
+    cdef:
+        Py_ssize_t i, n
+        int32_t prev, cur
+        bint is_unique = 1
+
+    n = len(arr)
+
+    if n < 2:
+        return True, True
+
+    prev = arr[0]
+    for i in range(1, n):
+        cur = arr[i]
+        if cur < prev:
+            return False, None
+        elif cur == prev:
+            is_unique = 0
+        prev = cur
+    return True, is_unique
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_int64(ndarray[int64_t] arr):
+    '''
+    Returns
+    -------
+    is_monotonic, is_unique
+    '''
+    cdef:
+        Py_ssize_t i, n
+        int64_t prev, cur
+        bint is_unique = 1
+
+    n = len(arr)
+
+    if n < 2:
+        return True, True
+
+    prev = arr[0]
+    for i in range(1, n):
+        cur = arr[i]
+        if cur < prev:
+            return False, None
+        elif cur == prev:
+            is_unique = 0
+        prev = cur
+    return True, is_unique
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_bool(ndarray[uint8_t] arr):
+    '''
+    Returns
+    -------
+    is_monotonic, is_unique
+    '''
+    cdef:
+        Py_ssize_t i, n
+        uint8_t prev, cur
+        bint is_unique = 1
+
+    n = len(arr)
+
+    if n < 2:
+        return True, True
+
+    prev = arr[0]
+    for i in range(1, n):
+        cur = arr[i]
+        if cur < prev:
+            return False, None
+        elif cur == prev:
+            is_unique = 0
+        prev = cur
+    return True, is_unique
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_float64(ndarray[float64_t] index, ndarray labels):
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)
+
+        if _checknull(key):
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]
+
+    return result
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_float32(ndarray[float32_t] index, ndarray labels):
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)
+
+        if _checknull(key):
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]
+
+    return result
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_object(ndarray[object] index, ndarray labels):
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)
+
+        if _checknull(key):
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]
+
+    return result
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_int8(ndarray[int8_t] index, ndarray labels):
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)
+
+        if _checknull(key):
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]
+
+    return result
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_int16(ndarray[int16_t] index, ndarray labels):
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)
+
+        if _checknull(key):
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]
+
+    return result
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_int32(ndarray[int32_t] index, ndarray labels):
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)
+
+        if _checknull(key):
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]
+
+    return result
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_int64(ndarray[int64_t] index, ndarray labels):
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)
+
+        if _checknull(key):
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]
+
+    return result
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_bool(ndarray[uint8_t] index, ndarray labels):
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)
+
+        if _checknull(key):
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]
+
+    return result
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_float64(ndarray[float64_t] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_float32(ndarray[float32_t] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_object(ndarray[object] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_int8(ndarray[int8_t] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_int16(ndarray[int16_t] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_int32(ndarray[int32_t] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_int64(ndarray[int64_t] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_bool(ndarray[uint8_t] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[float64_t, ndim=2] outbuf
+        float64_t fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if False and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j from 0 <= j < k:
+                    raise ValueError('No NA values allowed')
+            else:
+                for j from 0 <= j < k:
+                    outbuf[i, j] = values[idx, j]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    outbuf[i, j] = values[idx, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_float32(ndarray[float32_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[float32_t, ndim=2] outbuf
+        float32_t fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if False and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j from 0 <= j < k:
+                    raise ValueError('No NA values allowed')
+            else:
+                for j from 0 <= j < k:
+                    outbuf[i, j] = values[idx, j]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    outbuf[i, j] = values[idx, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_object(ndarray[object, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[object, ndim=2] outbuf
+        object fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if False and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j from 0 <= j < k:
+                    raise ValueError('No NA values allowed')
+            else:
+                for j from 0 <= j < k:
+                    outbuf[i, j] = values[idx, j]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    outbuf[i, j] = values[idx, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int8(ndarray[int8_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int8_t, ndim=2] outbuf
+        int8_t fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j from 0 <= j < k:
+                    raise ValueError('No NA values allowed')
+            else:
+                for j from 0 <= j < k:
+                    outbuf[i, j] = values[idx, j]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    outbuf[i, j] = values[idx, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int16(ndarray[int16_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int16_t, ndim=2] outbuf
+        int16_t fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j from 0 <= j < k:
+                    raise ValueError('No NA values allowed')
+            else:
+                for j from 0 <= j < k:
+                    outbuf[i, j] = values[idx, j]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    outbuf[i, j] = values[idx, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int32_t, ndim=2] outbuf
+        int32_t fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j from 0 <= j < k:
+                    raise ValueError('No NA values allowed')
+            else:
+                for j from 0 <= j < k:
+                    outbuf[i, j] = values[idx, j]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    outbuf[i, j] = values[idx, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t, ndim=2] outbuf
+        int64_t fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j from 0 <= j < k:
+                    raise ValueError('No NA values allowed')
+            else:
+                for j from 0 <= j < k:
+                    outbuf[i, j] = values[idx, j]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    outbuf[i, j] = values[idx, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[uint8_t, ndim=2] outbuf
+        uint8_t fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j from 0 <= j < k:
+                    raise ValueError('No NA values allowed')
+            else:
+                for j from 0 <= j < k:
+                    outbuf[i, j] = values[idx, j]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = indexer[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    outbuf[i, j] = values[idx, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[float64_t, ndim=2] outbuf
+        float64_t fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if False and _checknan(fill_value):
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    raise ValueError('No NA values allowed')
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+    else:
+        fv = fill_value
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    outbuf[i, j] = fv
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_float32(ndarray[float32_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[float32_t, ndim=2] outbuf
+        float32_t fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if False and _checknan(fill_value):
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    raise ValueError('No NA values allowed')
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+    else:
+        fv = fill_value
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    outbuf[i, j] = fv
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_object(ndarray[object, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[object, ndim=2] outbuf
+        object fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if False and _checknan(fill_value):
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    raise ValueError('No NA values allowed')
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+    else:
+        fv = fill_value
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    outbuf[i, j] = fv
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int8(ndarray[int8_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int8_t, ndim=2] outbuf
+        int8_t fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    raise ValueError('No NA values allowed')
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+    else:
+        fv = fill_value
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    outbuf[i, j] = fv
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int16(ndarray[int16_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int16_t, ndim=2] outbuf
+        int16_t fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    raise ValueError('No NA values allowed')
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+    else:
+        fv = fill_value
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    outbuf[i, j] = fv
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int32_t, ndim=2] outbuf
+        int32_t fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    raise ValueError('No NA values allowed')
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+    else:
+        fv = fill_value
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    outbuf[i, j] = fv
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t, ndim=2] outbuf
+        int64_t fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    raise ValueError('No NA values allowed')
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+    else:
+        fv = fill_value
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    outbuf[i, j] = fv
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values,
+                           ndarray[int64_t] indexer,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[uint8_t, ndim=2] outbuf
+        uint8_t fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    if True and _checknan(fill_value):
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    raise ValueError('No NA values allowed')
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+    else:
+        fv = fill_value
+        for j in range(k):
+            idx = indexer[j]
+
+            if idx == -1:
+                for i in range(n):
+                    outbuf[i, j] = fv
+            else:
+                for i in range(n):
+                    outbuf[i, j] = values[i, idx]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_float64(ndarray[float64_t, ndim=2] values,
+                           ndarray[int64_t] idx0,
+                           ndarray[int64_t] idx1,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[float64_t, ndim=2] outbuf
+        float64_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+
+    if False and _checknan(fill_value):
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    raise ValueError('No NA values allowed')
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        raise ValueError('No NA values allowed')
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        outbuf[i, j] = fv
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_float32(ndarray[float32_t, ndim=2] values,
+                           ndarray[int64_t] idx0,
+                           ndarray[int64_t] idx1,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[float32_t, ndim=2] outbuf
+        float32_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+
+    if False and _checknan(fill_value):
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    raise ValueError('No NA values allowed')
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        raise ValueError('No NA values allowed')
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        outbuf[i, j] = fv
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_object(ndarray[object, ndim=2] values,
+                           ndarray[int64_t] idx0,
+                           ndarray[int64_t] idx1,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[object, ndim=2] outbuf
+        object fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+
+    if False and _checknan(fill_value):
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    raise ValueError('No NA values allowed')
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        raise ValueError('No NA values allowed')
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        outbuf[i, j] = fv
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int8(ndarray[int8_t, ndim=2] values,
+                           ndarray[int64_t] idx0,
+                           ndarray[int64_t] idx1,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int8_t, ndim=2] outbuf
+        int8_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    raise ValueError('No NA values allowed')
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        raise ValueError('No NA values allowed')
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        outbuf[i, j] = fv
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int16(ndarray[int16_t, ndim=2] values,
+                           ndarray[int64_t] idx0,
+                           ndarray[int64_t] idx1,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int16_t, ndim=2] outbuf
+        int16_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    raise ValueError('No NA values allowed')
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        raise ValueError('No NA values allowed')
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        outbuf[i, j] = fv
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int32(ndarray[int32_t, ndim=2] values,
+                           ndarray[int64_t] idx0,
+                           ndarray[int64_t] idx1,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int32_t, ndim=2] outbuf
+        int32_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    raise ValueError('No NA values allowed')
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        raise ValueError('No NA values allowed')
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        outbuf[i, j] = fv
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int64(ndarray[int64_t, ndim=2] values,
+                           ndarray[int64_t] idx0,
+                           ndarray[int64_t] idx1,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t, ndim=2] outbuf
+        int64_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    raise ValueError('No NA values allowed')
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        raise ValueError('No NA values allowed')
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        outbuf[i, j] = fv
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_bool(ndarray[uint8_t, ndim=2] values,
+                           ndarray[int64_t] idx0,
+                           ndarray[int64_t] idx1,
+                           out=None, fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[uint8_t, ndim=2] outbuf
+        uint8_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+
+    if True and _checknan(fill_value):
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    raise ValueError('No NA values allowed')
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        raise ValueError('No NA values allowed')
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+    else:
+        fv = fill_value
+        for i in range(n):
+            idx = idx0[i]
+            if idx == -1:
+                for j in range(k):
+                    outbuf[i, j] = fv
+            else:
+                for j in range(k):
+                    if idx1[j] == -1:
+                        outbuf[i, j] = fv
+                    else:
+                        outbuf[i, j] = values[idx, idx1[j]]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_float64(ndarray[float64_t, ndim=2] arr,
+                     ndarray[float64_t, ndim=2] out,
+                    Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_float32(ndarray[float32_t, ndim=2] arr,
+                     ndarray[float32_t, ndim=2] out,
+                    Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int8(ndarray[int8_t, ndim=2] arr,
+                     ndarray[float32_t, ndim=2] out,
+                    Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int16(ndarray[int16_t, ndim=2] arr,
+                     ndarray[float32_t, ndim=2] out,
+                    Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int32(ndarray[int32_t, ndim=2] arr,
+                     ndarray[float64_t, ndim=2] out,
+                    Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int64(ndarray[int64_t, ndim=2] arr,
+                     ndarray[float64_t, ndim=2] out,
+                    Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+@cython.wraparound(False)
+@cython.wraparound(False)
+def group_last_float64(ndarray[float64_t, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[float64_t, ndim=2] values,
+               ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+@cython.wraparound(False)
+@cython.wraparound(False)
+def group_last_float32(ndarray[float32_t, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[float32_t, ndim=2] values,
+               ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+@cython.wraparound(False)
+@cython.wraparound(False)
+def group_last_bin_float64(ndarray[float64_t, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[float64_t, ndim=2] values,
+                   ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx, nobs
+
+    nobs = np.zeros_like(out)
+    resx = np.empty_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+@cython.wraparound(False)
+@cython.wraparound(False)
+def group_last_bin_float32(ndarray[float32_t, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[float32_t, ndim=2] values,
+                   ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx, nobs
+
+    nobs = np.zeros_like(out)
+    resx = np.empty_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_float64(ndarray[float64_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float64_t, ndim=2] values,
+              ndarray[int64_t] labels, int64_t rank):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_float32(ndarray[float32_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float32_t, ndim=2] values,
+              ndarray[int64_t] labels, int64_t rank):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:
+            continue
+
+        counts[lab] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_bin_float64(ndarray[float64_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float64_t, ndim=2] values,
+                  ndarray[int64_t] bins, int64_t rank):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx, nobs
+
+    nobs = np.zeros_like(out)
+    resx = np.empty_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                if nobs[b, j] == rank:
+                    resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_bin_float32(ndarray[float32_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float32_t, ndim=2] values,
+                  ndarray[int64_t] bins, int64_t rank):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx, nobs
+
+    nobs = np.zeros_like(out)
+    resx = np.empty_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    for i in range(N):
+        while b < ngroups - 1 and i >= bins[b]:
+            b += 1
+
+        counts[b] += 1
+        for j in range(K):
+            val = values[i, j]
+
+            # not nan
+            if val == val:
+                nobs[b, j] += 1
+                if nobs[b, j] == rank:
+                    resx[b, j] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_add_float64(ndarray[float64_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float64_t, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_add_float32(ndarray[float32_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float32_t, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_add_bin_float64(ndarray[float64_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float64_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b, nbins
+        float64_t val, count
+        ndarray[float64_t, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_add_bin_float32(ndarray[float32_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float32_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b, nbins
+        float32_t val, count
+        ndarray[float32_t, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_prod_float64(ndarray[float64_t, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[float64_t, ndim=2] values,
+               ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] prodx, nobs
+
+    nobs = np.zeros_like(out)
+    prodx = np.ones_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    prodx[lab, j] *= val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                prodx[lab, 0] *= val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = prodx[i, j]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_prod_float32(ndarray[float32_t, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[float32_t, ndim=2] values,
+               ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] prodx, nobs
+
+    nobs = np.zeros_like(out)
+    prodx = np.ones_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    prodx[lab, j] *= val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                prodx[lab, 0] *= val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = prodx[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_prod_bin_float64(ndarray[float64_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float64_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, count
+        ndarray[float64_t, ndim=2] prodx, nobs
+
+    nobs = np.zeros_like(out)
+    prodx = np.ones_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    prodx[b, j] *= val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                prodx[b, 0] *= val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = prodx[i, j]
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_prod_bin_float32(ndarray[float32_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float32_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, count
+        ndarray[float32_t, ndim=2] prodx, nobs
+
+    nobs = np.zeros_like(out)
+    prodx = np.ones_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    prodx[b, j] *= val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                prodx[b, 0] *= val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = prodx[i, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_var_float64(ndarray[float64_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float64_t, ndim=2] values,
+              ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, ct
+        ndarray[float64_t, ndim=2] nobs, sumx, sumxx
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+    sumxx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+                    sumxx[lab, j] += val * val
+    else:
+        for i in range(N):
+
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+                sumxx[lab, 0] += val * val
+
+
+    for i in range(len(counts)):
+        for j in range(K):
+            ct = nobs[i, j]
+            if ct < 2:
+                out[i, j] = nan
+            else:
+                out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
+                             (ct * ct - ct))
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_var_float32(ndarray[float32_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float32_t, ndim=2] values,
+              ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, ct
+        ndarray[float32_t, ndim=2] nobs, sumx, sumxx
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+    sumxx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+                    sumxx[lab, j] += val * val
+    else:
+        for i in range(N):
+
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+                sumxx[lab, 0] += val * val
+
+
+    for i in range(len(counts)):
+        for j in range(K):
+            ct = nobs[i, j]
+            if ct < 2:
+                out[i, j] = nan
+            else:
+                out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
+                             (ct * ct - ct))
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_var_bin_float64(ndarray[float64_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float64_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, ct
+        ndarray[float64_t, ndim=2] nobs, sumx, sumxx
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+    sumxx = np.zeros_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+                    sumxx[b, j] += val * val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+                sumxx[b, 0] += val * val
+
+    for i in range(ngroups):
+        for j in range(K):
+            ct = nobs[i, j]
+            if ct < 2:
+                out[i, j] = nan
+            else:
+                out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
+                             (ct * ct - ct))
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_var_bin_float32(ndarray[float32_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float32_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, ct
+        ndarray[float32_t, ndim=2] nobs, sumx, sumxx
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+    sumxx = np.zeros_like(out)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+                    sumxx[b, j] += val * val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+                sumxx[b, 0] += val * val
+
+    for i in range(ngroups):
+        for j in range(K):
+            ct = nobs[i, j]
+            if ct < 2:
+                out[i, j] = nan
+            else:
+                out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
+                             (ct * ct - ct))
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_mean_float64(ndarray[float64_t, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[float64_t, ndim=2] values,
+               ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            count = nobs[i, j]
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j] / count
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_mean_float32(ndarray[float32_t, ndim=2] out,
+               ndarray[int64_t] counts,
+               ndarray[float32_t, ndim=2] values,
+               ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                sumx[lab, 0] += val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            count = nobs[i, j]
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j] / count
+
+
+def group_mean_bin_float64(ndarray[float64_t, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[float64_t, ndim=2] values,
+                   ndarray[int64_t] bins):
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, count
+        ndarray[float64_t, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+
+    for i in range(ngroups):
+        for j in range(K):
+            count = nobs[i, j]
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j] / count
+
+def group_mean_bin_float32(ndarray[float32_t, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[float32_t, ndim=2] values,
+                   ndarray[int64_t] bins):
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, count
+        ndarray[float32_t, ndim=2] sumx, nobs
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    sumx[b, j] += val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                sumx[b, 0] += val
+
+    for i in range(ngroups):
+        for j in range(K):
+            count = nobs[i, j]
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = sumx[i, j] / count
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_float64(ndarray[float64_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float64_t, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] minx, nobs
+
+    nobs = np.zeros_like(out)
+
+    minx = np.empty_like(out)
+    minx.fill(np.inf)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    if val < minx[lab, j]:
+                        minx[lab, j] = val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                if val < minx[lab, 0]:
+                    minx[lab, 0] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = minx[i, j]
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_float32(ndarray[float32_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float32_t, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] minx, nobs
+
+    nobs = np.zeros_like(out)
+
+    minx = np.empty_like(out)
+    minx.fill(np.inf)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    if val < minx[lab, j]:
+                        minx[lab, j] = val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                if val < minx[lab, 0]:
+                    minx[lab, 0] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = minx[i, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_bin_float64(ndarray[float64_t, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[float64_t, ndim=2] values,
+                   ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, count
+        ndarray[float64_t, ndim=2] minx, nobs
+
+    nobs = np.zeros_like(out)
+
+    minx = np.empty_like(out)
+    minx.fill(np.inf)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    if val < minx[b, j]:
+                        minx[b, j] = val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                if val < minx[b, 0]:
+                    minx[b, 0] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = minx[i, j]
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_bin_float32(ndarray[float32_t, ndim=2] out,
+                   ndarray[int64_t] counts,
+                   ndarray[float32_t, ndim=2] values,
+                   ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, count
+        ndarray[float32_t, ndim=2] minx, nobs
+
+    nobs = np.zeros_like(out)
+
+    minx = np.empty_like(out)
+    minx.fill(np.inf)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    if val < minx[b, j]:
+                        minx[b, j] = val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                if val < minx[b, 0]:
+                    minx[b, 0] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = minx[i, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_float64(ndarray[float64_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float64_t, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        ndarray[float64_t, ndim=2] maxx, nobs
+
+    nobs = np.zeros_like(out)
+
+    maxx = np.empty_like(out)
+    maxx.fill(-np.inf)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    if val > maxx[lab, j]:
+                        maxx[lab, j] = val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                if val > maxx[lab, 0]:
+                    maxx[lab, 0] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = maxx[i, j]
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_float32(ndarray[float32_t, ndim=2] out,
+              ndarray[int64_t] counts,
+              ndarray[float32_t, ndim=2] values,
+              ndarray[int64_t] labels):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        ndarray[float32_t, ndim=2] maxx, nobs
+
+    nobs = np.zeros_like(out)
+
+    maxx = np.empty_like(out)
+    maxx.fill(-np.inf)
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    if val > maxx[lab, j]:
+                        maxx[lab, j] = val
+    else:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[lab, 0] += 1
+                if val > maxx[lab, 0]:
+                    maxx[lab, 0] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = maxx[i, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_bin_float64(ndarray[float64_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float64_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, count
+        ndarray[float64_t, ndim=2] maxx, nobs
+
+    nobs = np.zeros_like(out)
+    maxx = np.empty_like(out)
+    maxx.fill(-np.inf)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    if val > maxx[b, j]:
+                        maxx[b, j] = val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                if val > maxx[b, 0]:
+                    maxx[b, 0] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = maxx[i, j]
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_bin_float32(ndarray[float32_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float32_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, count
+        ndarray[float32_t, ndim=2] maxx, nobs
+
+    nobs = np.zeros_like(out)
+    maxx = np.empty_like(out)
+    maxx.fill(-np.inf)
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[b, j] += 1
+                    if val > maxx[b, j]:
+                        maxx[b, j] = val
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                nobs[b, 0] += 1
+                if val > maxx[b, 0]:
+                    maxx[b, 0] = val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:
+                out[i, j] = nan
+            else:
+                out[i, j] = maxx[i, j]
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_ohlc_float64(ndarray[float64_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float64_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val, count
+        float64_t vopen, vhigh, vlow, vclose, NA
+        bint got_first = 0
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    if out.shape[1] != 4:
+        raise ValueError('Output array must have 4 columns')
+
+    NA = np.nan
+
+    b = 0
+    if K > 1:
+        raise NotImplementedError
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                if not got_first:
+                    out[b, 0] = NA
+                    out[b, 1] = NA
+                    out[b, 2] = NA
+                    out[b, 3] = NA
+                else:
+                    out[b, 0] = vopen
+                    out[b, 1] = vhigh
+                    out[b, 2] = vlow
+                    out[b, 3] = vclose
+                b += 1
+                got_first = 0
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                if not got_first:
+                    got_first = 1
+                    vopen = val
+                    vlow = val
+                    vhigh = val
+                else:
+                    if val < vlow:
+                        vlow = val
+                    if val > vhigh:
+                        vhigh = val
+                vclose = val
+
+        if not got_first:
+            out[b, 0] = NA
+            out[b, 1] = NA
+            out[b, 2] = NA
+            out[b, 3] = NA
+        else:
+            out[b, 0] = vopen
+            out[b, 1] = vhigh
+            out[b, 2] = vlow
+            out[b, 3] = vclose
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_ohlc_float32(ndarray[float32_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float32_t, ndim=2] values,
+                  ndarray[int64_t] bins):
+    '''
+    Only aggregates on axis=0
+    '''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float32_t val, count
+        float32_t vopen, vhigh, vlow, vclose, NA
+        bint got_first = 0
+
+    if bins[len(bins) - 1] == len(values):
+        ngroups = len(bins)
+    else:
+        ngroups = len(bins) + 1
+
+    N, K = (<object> values).shape
+
+    if out.shape[1] != 4:
+        raise ValueError('Output array must have 4 columns')
+
+    NA = np.nan
+
+    b = 0
+    if K > 1:
+        raise NotImplementedError
+    else:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:
+                if not got_first:
+                    out[b, 0] = NA
+                    out[b, 1] = NA
+                    out[b, 2] = NA
+                    out[b, 3] = NA
+                else:
+                    out[b, 0] = vopen
+                    out[b, 1] = vhigh
+                    out[b, 2] = vlow
+                    out[b, 3] = vclose
+                b += 1
+                got_first = 0
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # not nan
+            if val == val:
+                if not got_first:
+                    got_first = 1
+                    vopen = val
+                    vlow = val
+                    vhigh = val
+                else:
+                    if val < vlow:
+                        vlow = val
+                    if val > vhigh:
+                        vhigh = val
+                vclose = val
+
+        if not got_first:
+            out[b, 0] = NA
+            out[b, 1] = NA
+            out[b, 2] = NA
+            out[b, 3] = NA
+        else:
+            out[b, 0] = vopen
+            out[b, 1] = vhigh
+            out[b, 2] = vlow
+            out[b, 3] = vclose
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_float64(ndarray[float64_t] left,
+                                      ndarray[float64_t] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        float64_t lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_float32(ndarray[float32_t] left,
+                                      ndarray[float32_t] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        float32_t lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_object(ndarray[object] left,
+                                      ndarray[object] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        object lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_int8(ndarray[int8_t] left,
+                                      ndarray[int8_t] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        int8_t lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_int16(ndarray[int16_t] left,
+                                      ndarray[int16_t] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        int16_t lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_int32(ndarray[int32_t] left,
+                                      ndarray[int32_t] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        int32_t lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_int64(ndarray[int64_t] left,
+                                      ndarray[int64_t] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        int64_t lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+
+
+def left_join_indexer_float64(ndarray[float64_t] left,
+                              ndarray[float64_t] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count
+        float64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float64_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i
+                break
 
-    if False and _checknan(fill_value):
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    raise ValueError('No NA values allowed')
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1
+                i += 1
             else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        raise ValueError('No NA values allowed')
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float64)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+
+def left_join_indexer_float32(ndarray[float32_t] left,
+                              ndarray[float32_t] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count
+        float32_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float32_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1
+                i += 1
+            else:
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float32)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+
+def left_join_indexer_object(ndarray[object] left,
+                              ndarray[object] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count
+        object lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[object] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1
+                i += 1
+            else:
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=object)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
                     else:
-                        outbuf[i, j] = values[idx, idx1[j]]
-    else:
-        fv = fill_value
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
             else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        outbuf[i, j] = fv
-                    else:
-                        outbuf[i, j] = values[idx, idx1[j]]
+                j += 1
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_object(ndarray[object, ndim=2] values,
-                           ndarray[int64_t] idx0,
-                           ndarray[int64_t] idx1,
-                           out=None, fill_value=np.nan):
+    return result, lindexer, rindexer
+
+
+def left_join_indexer_int8(ndarray[int8_t] left,
+                              ndarray[int8_t] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[object, ndim=2] outbuf
-        object fv
+        Py_ssize_t i, j, k, nright, nleft, count
+        int8_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int8_t] result
 
-    n = len(idx0)
-    k = len(idx1)
+    nleft = len(left)
+    nright = len(right)
 
-    if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
-    else:
-        outbuf = out
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i
+                break
 
+            lval = left[i]
+            rval = right[j]
 
-    if False and _checknan(fill_value):
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    raise ValueError('No NA values allowed')
-            else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        raise ValueError('No NA values allowed')
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
                     else:
-                        outbuf[i, j] = values[idx, idx1[j]]
-    else:
-        fv = fill_value
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1
+                i += 1
             else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        outbuf[i, j] = fv
-                    else:
-                        outbuf[i, j] = values[idx, idx1[j]]
+                j += 1
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int32(ndarray[int32_t, ndim=2] values,
-                           ndarray[int64_t] idx0,
-                           ndarray[int64_t] idx1,
-                           out=None, fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int32_t, ndim=2] outbuf
-        int32_t fv
+    # do it again now that result size is known
 
-    n = len(idx0)
-    k = len(idx1)
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int8)
 
-    if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
-    else:
-        outbuf = out
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
 
+            lval = left[i]
+            rval = right[j]
 
-    if True and _checknan(fill_value):
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    raise ValueError('No NA values allowed')
-            else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        raise ValueError('No NA values allowed')
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
                     else:
-                        outbuf[i, j] = values[idx, idx1[j]]
-    else:
-        fv = fill_value
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
             else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        outbuf[i, j] = fv
-                    else:
-                        outbuf[i, j] = values[idx, idx1[j]]
+                j += 1
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int64(ndarray[int64_t, ndim=2] values,
-                           ndarray[int64_t] idx0,
-                           ndarray[int64_t] idx1,
-                           out=None, fill_value=np.nan):
+    return result, lindexer, rindexer
+
+
+def left_join_indexer_int16(ndarray[int16_t] left,
+                              ndarray[int16_t] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t, ndim=2] outbuf
-        int64_t fv
+        Py_ssize_t i, j, k, nright, nleft, count
+        int16_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int16_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1
+                i += 1
+            else:
+                j += 1
 
-    n = len(idx0)
-    k = len(idx1)
+    # do it again now that result size is known
 
-    if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
-    else:
-        outbuf = out
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int16)
 
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
 
-    if True and _checknan(fill_value):
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    raise ValueError('No NA values allowed')
-            else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        raise ValueError('No NA values allowed')
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
                     else:
-                        outbuf[i, j] = values[idx, idx1[j]]
-    else:
-        fv = fill_value
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
             else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        outbuf[i, j] = fv
-                    else:
-                        outbuf[i, j] = values[idx, idx1[j]]
+                j += 1
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_bool(ndarray[uint8_t, ndim=2] values,
-                           ndarray[int64_t] idx0,
-                           ndarray[int64_t] idx1,
-                           out=None, fill_value=np.nan):
+    return result, lindexer, rindexer
+
+
+def left_join_indexer_int32(ndarray[int32_t] left,
+                              ndarray[int32_t] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
     cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[uint8_t, ndim=2] outbuf
-        uint8_t fv
+        Py_ssize_t i, j, k, nright, nleft, count
+        int32_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int32_t] result
 
-    n = len(idx0)
-    k = len(idx1)
+    nleft = len(left)
+    nright = len(right)
 
-    if out is None:
-        outbuf = np.empty((n, k), dtype=values.dtype)
-    else:
-        outbuf = out
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i
+                break
 
+            lval = left[i]
+            rval = right[j]
 
-    if True and _checknan(fill_value):
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    raise ValueError('No NA values allowed')
-            else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        raise ValueError('No NA values allowed')
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
                     else:
-                        outbuf[i, j] = values[idx, idx1[j]]
-    else:
-        fv = fill_value
-        for i in range(n):
-            idx = idx0[i]
-            if idx == -1:
-                for j in range(k):
-                    outbuf[i, j] = fv
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1
+                i += 1
             else:
-                for j in range(k):
-                    if idx1[j] == -1:
-                        outbuf[i, j] = fv
-                    else:
-                        outbuf[i, j] = values[idx, idx1[j]]
+                j += 1
 
+    # do it again now that result size is known
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_float64(ndarray[float64_t] left,
-                                      ndarray[float64_t] right):
-    cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        float64_t lval, rval
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int32)
 
     i = 0
     j = 0
-    nleft = len(left)
-    nright = len(right)
-
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
 
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
+            lval = left[i]
+            rval = right[j]
 
-        rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
 
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
+    return result, lindexer, rindexer
 
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_object(ndarray[object] left,
-                                      ndarray[object] right):
+def left_join_indexer_int64(ndarray[int64_t] left,
+                              ndarray[int64_t] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
     cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        object lval, rval
-
-    i = 0
-    j = 0
+        Py_ssize_t i, j, k, nright, nleft, count
+        int64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int64_t] result
+
     nleft = len(left)
     nright = len(right)
 
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
-
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
-
-        rval = right[j]
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i
+                break
 
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
+            lval = left[i]
+            rval = right[j]
 
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1
                 i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
+            else:
+                j += 1
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_int32(ndarray[int32_t] left,
-                                      ndarray[int32_t] right):
-    cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        int32_t lval, rval
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int64)
 
     i = 0
     j = 0
-    nleft = len(left)
-    nright = len(right)
-
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
 
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
+            lval = left[i]
+            rval = right[j]
 
-        rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
 
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
+    return result, lindexer, rindexer
 
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def left_join_indexer_unique_int64(ndarray[int64_t] left,
-                                      ndarray[int64_t] right):
+def outer_join_indexer_float64(ndarray[float64_t] left,
+                                ndarray[float64_t] right):
     cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        int64_t lval, rval
+        Py_ssize_t i, j, nright, nleft, count
+        float64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float64_t] result
 
-    i = 0
-    j = 0
     nleft = len(left)
     nright = len(right)
 
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        count = nright
+    elif nright == 0:
+        count = nleft
+    else:
+        while True:
+            if i == nleft:
+                count += nright - j
+                break
+            if j == nright:
+                count += nleft - i
+                break
 
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1
+                i += 1
+            else:
+                count += 1
+                j += 1
 
-        rval = right[j]
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float64)
 
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
+    # do it again, but populate the indexers / result
 
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        for j in range(nright):
+            lindexer[j] = -1
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        for i in range(nright):
+            lindexer[i] = i
+            rindexer[i] = -1
+            result[i] = left[i]
+    else:
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    count += 1
+                    i += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
 
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = lval
+                count += 1
+                i += 1
+            else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                j += 1
 
+    return result, lindexer, rindexer
 
-def left_join_indexer_float64(ndarray[float64_t] left,
-                              ndarray[float64_t] right):
-    '''
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    '''
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_float32(ndarray[float32_t] left,
+                                ndarray[float32_t] right):
     cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        float64_t lval, rval
+        Py_ssize_t i, j, nright, nleft, count
+        float32_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[float64_t] result
+        ndarray[float32_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -2725,15 +7425,21 @@ def left_join_indexer_float64(ndarray[float64_t] left,
     i = 0
     j = 0
     count = 0
-    if nleft > 0:
-        while i < nleft:
+    if nleft == 0:
+        count = nright
+    elif nright == 0:
+        count = nleft
+    else:
+        while True:
+            if i == nleft:
+                count += nright - j
+                break
             if j == nright:
                 count += nleft - i
                 break
 
             lval = left[i]
             rval = right[j]
-
             if lval == rval:
                 count += 1
                 if i < nleft - 1:
@@ -2754,26 +7460,45 @@ def left_join_indexer_float64(ndarray[float64_t] left,
                 count += 1
                 i += 1
             else:
+                count += 1
                 j += 1
 
-    # do it again now that result size is known
-
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float64)
+    result = np.empty(count, dtype=np.float32)
+
+    # do it again, but populate the indexers / result
 
     i = 0
     j = 0
     count = 0
-    if nleft > 0:
-        while i < nleft:
+    if nleft == 0:
+        for j in range(nright):
+            lindexer[j] = -1
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        for i in range(nright):
+            lindexer[i] = i
+            rindexer[i] = -1
+            result[i] = left[i]
+    else:
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
             if j == nright:
                 while i < nleft:
                     lindexer[count] = i
                     rindexer[count] = -1
                     result[count] = left[i]
-                    i += 1
                     count += 1
+                    i += 1
                 break
 
             lval = left[i]
@@ -2801,22 +7526,24 @@ def left_join_indexer_float64(ndarray[float64_t] left,
             elif lval < rval:
                 lindexer[count] = i
                 rindexer[count] = -1
-                result[count] = left[i]
+                result[count] = lval
                 count += 1
                 i += 1
             else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
                 j += 1
 
     return result, lindexer, rindexer
 
-
-def left_join_indexer_object(ndarray[object] left,
-                              ndarray[object] right):
-    '''
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    '''
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_object(ndarray[object] left,
+                                ndarray[object] right):
     cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
+        Py_ssize_t i, j, nright, nleft, count
         object lval, rval
         ndarray[int64_t] lindexer, rindexer
         ndarray[object] result
@@ -2827,15 +7554,21 @@ def left_join_indexer_object(ndarray[object] left,
     i = 0
     j = 0
     count = 0
-    if nleft > 0:
-        while i < nleft:
+    if nleft == 0:
+        count = nright
+    elif nright == 0:
+        count = nleft
+    else:
+        while True:
+            if i == nleft:
+                count += nright - j
+                break
             if j == nright:
                 count += nleft - i
                 break
 
             lval = left[i]
             rval = right[j]
-
             if lval == rval:
                 count += 1
                 if i < nleft - 1:
@@ -2856,26 +7589,45 @@ def left_join_indexer_object(ndarray[object] left,
                 count += 1
                 i += 1
             else:
+                count += 1
                 j += 1
 
-    # do it again now that result size is known
-
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
     result = np.empty(count, dtype=object)
 
+    # do it again, but populate the indexers / result
+
     i = 0
     j = 0
     count = 0
-    if nleft > 0:
-        while i < nleft:
+    if nleft == 0:
+        for j in range(nright):
+            lindexer[j] = -1
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        for i in range(nright):
+            lindexer[i] = i
+            rindexer[i] = -1
+            result[i] = left[i]
+    else:
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
             if j == nright:
                 while i < nleft:
                     lindexer[count] = i
                     rindexer[count] = -1
                     result[count] = left[i]
-                    i += 1
                     count += 1
+                    i += 1
                 break
 
             lval = left[i]
@@ -2903,25 +7655,27 @@ def left_join_indexer_object(ndarray[object] left,
             elif lval < rval:
                 lindexer[count] = i
                 rindexer[count] = -1
-                result[count] = left[i]
+                result[count] = lval
                 count += 1
                 i += 1
             else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
                 j += 1
 
     return result, lindexer, rindexer
 
-
-def left_join_indexer_int32(ndarray[int32_t] left,
-                              ndarray[int32_t] right):
-    '''
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    '''
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_int8(ndarray[int8_t] left,
+                                ndarray[int8_t] right):
     cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        int32_t lval, rval
+        Py_ssize_t i, j, nright, nleft, count
+        int8_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[int32_t] result
+        ndarray[int8_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -2929,15 +7683,21 @@ def left_join_indexer_int32(ndarray[int32_t] left,
     i = 0
     j = 0
     count = 0
-    if nleft > 0:
-        while i < nleft:
+    if nleft == 0:
+        count = nright
+    elif nright == 0:
+        count = nleft
+    else:
+        while True:
+            if i == nleft:
+                count += nright - j
+                break
             if j == nright:
                 count += nleft - i
                 break
 
             lval = left[i]
             rval = right[j]
-
             if lval == rval:
                 count += 1
                 if i < nleft - 1:
@@ -2958,26 +7718,45 @@ def left_join_indexer_int32(ndarray[int32_t] left,
                 count += 1
                 i += 1
             else:
+                count += 1
                 j += 1
 
-    # do it again now that result size is known
-
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int32)
+    result = np.empty(count, dtype=np.int8)
+
+    # do it again, but populate the indexers / result
 
     i = 0
     j = 0
     count = 0
-    if nleft > 0:
-        while i < nleft:
+    if nleft == 0:
+        for j in range(nright):
+            lindexer[j] = -1
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        for i in range(nright):
+            lindexer[i] = i
+            rindexer[i] = -1
+            result[i] = left[i]
+    else:
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
             if j == nright:
                 while i < nleft:
                     lindexer[count] = i
                     rindexer[count] = -1
                     result[count] = left[i]
-                    i += 1
                     count += 1
+                    i += 1
                 break
 
             lval = left[i]
@@ -3005,25 +7784,27 @@ def left_join_indexer_int32(ndarray[int32_t] left,
             elif lval < rval:
                 lindexer[count] = i
                 rindexer[count] = -1
-                result[count] = left[i]
+                result[count] = lval
                 count += 1
                 i += 1
             else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
                 j += 1
 
     return result, lindexer, rindexer
 
-
-def left_join_indexer_int64(ndarray[int64_t] left,
-                              ndarray[int64_t] right):
-    '''
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    '''
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_int16(ndarray[int16_t] left,
+                                ndarray[int16_t] right):
     cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        int64_t lval, rval
+        Py_ssize_t i, j, nright, nleft, count
+        int16_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[int64_t] result
+        ndarray[int16_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -3031,15 +7812,21 @@ def left_join_indexer_int64(ndarray[int64_t] left,
     i = 0
     j = 0
     count = 0
-    if nleft > 0:
-        while i < nleft:
+    if nleft == 0:
+        count = nright
+    elif nright == 0:
+        count = nleft
+    else:
+        while True:
+            if i == nleft:
+                count += nright - j
+                break
             if j == nright:
                 count += nleft - i
                 break
 
             lval = left[i]
             rval = right[j]
-
             if lval == rval:
                 count += 1
                 if i < nleft - 1:
@@ -3060,26 +7847,45 @@ def left_join_indexer_int64(ndarray[int64_t] left,
                 count += 1
                 i += 1
             else:
+                count += 1
                 j += 1
 
-    # do it again now that result size is known
-
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int16)
+
+    # do it again, but populate the indexers / result
 
     i = 0
     j = 0
     count = 0
-    if nleft > 0:
-        while i < nleft:
+    if nleft == 0:
+        for j in range(nright):
+            lindexer[j] = -1
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        for i in range(nright):
+            lindexer[i] = i
+            rindexer[i] = -1
+            result[i] = left[i]
+    else:
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
             if j == nright:
                 while i < nleft:
                     lindexer[count] = i
                     rindexer[count] = -1
                     result[count] = left[i]
-                    i += 1
                     count += 1
+                    i += 1
                 break
 
             lval = left[i]
@@ -3107,24 +7913,27 @@ def left_join_indexer_int64(ndarray[int64_t] left,
             elif lval < rval:
                 lindexer[count] = i
                 rindexer[count] = -1
-                result[count] = left[i]
+                result[count] = lval
                 count += 1
                 i += 1
             else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
                 j += 1
 
     return result, lindexer, rindexer
 
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def outer_join_indexer_float64(ndarray[float64_t] left,
-                                ndarray[float64_t] right):
+def outer_join_indexer_int32(ndarray[int32_t] left,
+                                ndarray[int32_t] right):
     cdef:
         Py_ssize_t i, j, nright, nleft, count
-        float64_t lval, rval
+        int32_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[float64_t] result
+        ndarray[int32_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -3172,7 +7981,7 @@ def outer_join_indexer_float64(ndarray[float64_t] left,
 
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float64)
+    result = np.empty(count, dtype=np.int32)
 
     # do it again, but populate the indexers / result
 
@@ -3247,13 +8056,13 @@ def outer_join_indexer_float64(ndarray[float64_t] left,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def outer_join_indexer_object(ndarray[object] left,
-                                ndarray[object] right):
+def outer_join_indexer_int64(ndarray[int64_t] left,
+                                ndarray[int64_t] right):
     cdef:
         Py_ssize_t i, j, nright, nleft, count
-        object lval, rval
+        int64_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[object] result
+        ndarray[int64_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -3301,7 +8110,7 @@ def outer_join_indexer_object(ndarray[object] left,
 
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=object)
+    result = np.empty(count, dtype=np.int64)
 
     # do it again, but populate the indexers / result
 
@@ -3374,15 +8183,19 @@ def outer_join_indexer_object(ndarray[object] left,
 
     return result, lindexer, rindexer
 
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def outer_join_indexer_int32(ndarray[int32_t] left,
-                                ndarray[int32_t] right):
+def inner_join_indexer_float64(ndarray[float64_t] left,
+                              ndarray[float64_t] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
     cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        int32_t lval, rval
+        Py_ssize_t i, j, k, nright, nleft, count
+        float64_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[int32_t] result
+        ndarray[float64_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -3390,17 +8203,11 @@ def outer_join_indexer_int32(ndarray[int32_t] left,
     i = 0
     j = 0
     count = 0
-    if nleft == 0:
-        count = nright
-    elif nright == 0:
-        count = nleft
-    else:
+    if nleft > 0 and nright > 0:
         while True:
             if i == nleft:
-                count += nright - j
                 break
             if j == nright:
-                count += nleft - i
                 break
 
             lval = left[i]
@@ -3422,57 +8229,32 @@ def outer_join_indexer_int32(ndarray[int32_t] left,
                     # end of the road
                     break
             elif lval < rval:
-                count += 1
                 i += 1
             else:
-                count += 1
                 j += 1
 
+    # do it again now that result size is known
+
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int32)
-
-    # do it again, but populate the indexers / result
+    result = np.empty(count, dtype=np.float64)
 
     i = 0
     j = 0
     count = 0
-    if nleft == 0:
-        for j in range(nright):
-            lindexer[j] = -1
-            rindexer[j] = j
-            result[j] = right[j]
-    elif nright == 0:
-        for i in range(nright):
-            lindexer[i] = i
-            rindexer[i] = -1
-            result[i] = left[i]
-    else:
+    if nleft > 0 and nright > 0:
         while True:
             if i == nleft:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    count += 1
-                    j += 1
                 break
             if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    count += 1
-                    i += 1
                 break
 
             lval = left[i]
             rval = right[j]
-
             if lval == rval:
                 lindexer[count] = i
                 rindexer[count] = j
-                result[count] = lval
+                result[count] = rval
                 count += 1
                 if i < nleft - 1:
                     if j < nright - 1 and right[j + 1] == rval:
@@ -3489,29 +8271,24 @@ def outer_join_indexer_int32(ndarray[int32_t] left,
                     # end of the road
                     break
             elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
-                count += 1
                 i += 1
             else:
-                lindexer[count] = -1
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
                 j += 1
 
     return result, lindexer, rindexer
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def outer_join_indexer_int64(ndarray[int64_t] left,
-                                ndarray[int64_t] right):
+def inner_join_indexer_float32(ndarray[float32_t] left,
+                              ndarray[float32_t] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
     cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        int64_t lval, rval
+        Py_ssize_t i, j, k, nright, nleft, count
+        float32_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[int64_t] result
+        ndarray[float32_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -3519,17 +8296,11 @@ def outer_join_indexer_int64(ndarray[int64_t] left,
     i = 0
     j = 0
     count = 0
-    if nleft == 0:
-        count = nright
-    elif nright == 0:
-        count = nleft
-    else:
+    if nleft > 0 and nright > 0:
         while True:
             if i == nleft:
-                count += nright - j
                 break
             if j == nright:
-                count += nleft - i
                 break
 
             lval = left[i]
@@ -3551,57 +8322,32 @@ def outer_join_indexer_int64(ndarray[int64_t] left,
                     # end of the road
                     break
             elif lval < rval:
-                count += 1
                 i += 1
             else:
-                count += 1
                 j += 1
 
+    # do it again now that result size is known
+
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int64)
-
-    # do it again, but populate the indexers / result
+    result = np.empty(count, dtype=np.float32)
 
     i = 0
     j = 0
     count = 0
-    if nleft == 0:
-        for j in range(nright):
-            lindexer[j] = -1
-            rindexer[j] = j
-            result[j] = right[j]
-    elif nright == 0:
-        for i in range(nright):
-            lindexer[i] = i
-            rindexer[i] = -1
-            result[i] = left[i]
-    else:
+    if nleft > 0 and nright > 0:
         while True:
             if i == nleft:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    count += 1
-                    j += 1
                 break
             if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    count += 1
-                    i += 1
                 break
 
             lval = left[i]
             rval = right[j]
-
             if lval == rval:
                 lindexer[count] = i
                 rindexer[count] = j
-                result[count] = lval
+                result[count] = rval
                 count += 1
                 if i < nleft - 1:
                     if j < nright - 1 and right[j + 1] == rval:
@@ -3618,33 +8364,117 @@ def outer_join_indexer_int64(ndarray[int64_t] left,
                     # end of the road
                     break
             elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def inner_join_indexer_object(ndarray[object] left,
+                              ndarray[object] right):
+    '''
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    '''
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count
+        object lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[object] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
                 count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
                 i += 1
             else:
-                lindexer[count] = -1
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=object)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
                 rindexer[count] = j
                 result[count] = rval
                 count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1
+            else:
                 j += 1
 
     return result, lindexer, rindexer
 
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def inner_join_indexer_float64(ndarray[float64_t] left,
-                              ndarray[float64_t] right):
+def inner_join_indexer_int8(ndarray[int8_t] left,
+                              ndarray[int8_t] right):
     '''
     Two-pass algorithm for monotonic indexes. Handles many-to-one merges
     '''
     cdef:
         Py_ssize_t i, j, k, nright, nleft, count
-        float64_t lval, rval
+        int8_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[float64_t] result
+        ndarray[int8_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -3686,7 +8516,7 @@ def inner_join_indexer_float64(ndarray[float64_t] left,
 
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float64)
+    result = np.empty(count, dtype=np.int8)
 
     i = 0
     j = 0
@@ -3728,16 +8558,16 @@ def inner_join_indexer_float64(ndarray[float64_t] left,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def inner_join_indexer_object(ndarray[object] left,
-                              ndarray[object] right):
+def inner_join_indexer_int16(ndarray[int16_t] left,
+                              ndarray[int16_t] right):
     '''
     Two-pass algorithm for monotonic indexes. Handles many-to-one merges
     '''
     cdef:
         Py_ssize_t i, j, k, nright, nleft, count
-        object lval, rval
+        int16_t lval, rval
         ndarray[int64_t] lindexer, rindexer
-        ndarray[object] result
+        ndarray[int16_t] result
 
     nleft = len(left)
     nright = len(right)
@@ -3779,7 +8609,7 @@ def inner_join_indexer_object(ndarray[object] left,
 
     lindexer = np.empty(count, dtype=np.int64)
     rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=object)
+    result = np.empty(count, dtype=np.int16)
 
     i = 0
     j = 0
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
index 41ac1b3f3480f..ea14245e10731 100644
--- a/pandas/src/inference.pyx
+++ b/pandas/src/inference.pyx
@@ -302,7 +302,7 @@ cdef double fINT64_MAX = <double> INT64_MAX
 cdef double fINT64_MIN = <double> INT64_MIN
 
 def maybe_convert_numeric(ndarray[object] values, set na_values,
-                          convert_empty=True):
+                          convert_empty=True, coerce_numeric=False):
     '''
     Type inference function-- convert strings to numeric (potentially) and
     convert to proper dtype array
@@ -346,17 +346,25 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
             complexes[i] = val
             seen_complex = 1
         else:
-            status = floatify(val, &fval)
-            floats[i] = fval
-            if not seen_float:
-                if '.' in val or fval == INF or fval == NEGINF:
-                    seen_float = 1
-                elif 'inf' in val:  # special case to handle +/-inf
-                    seen_float = 1
-                elif fval < fINT64_MAX and fval > fINT64_MIN:
-                    ints[i] = <int64_t> fval
-                else:
-                    seen_float = 1
+            try:
+                status = floatify(val, &fval)
+                floats[i] = fval
+                if not seen_float:
+                    if '.' in val or fval == INF or fval == NEGINF:
+                        seen_float = 1
+                    elif 'inf' in val:  # special case to handle +/-inf
+                        seen_float = 1
+                    elif fval < fINT64_MAX and fval > fINT64_MIN:
+                        ints[i] = <int64_t> fval
+                    else:
+                        seen_float = 1
+            except:
+                if not coerce_numeric:
+                    raise
+
+                floats[i] = nan
+                seen_float = 1
+               
 
     if seen_complex:
         return complexes
diff --git a/pandas/src/numpy.pxd b/pandas/src/numpy.pxd
index 45c2fc184a911..b005a716e7d5f 100644
--- a/pandas/src/numpy.pxd
+++ b/pandas/src/numpy.pxd
@@ -326,6 +326,7 @@ cdef extern from "numpy/arrayobject.h":
     ctypedef unsigned long long npy_uint96
     ctypedef unsigned long long npy_uint128
 
+    ctypedef float        npy_float16
     ctypedef float        npy_float32
     ctypedef double       npy_float64
     ctypedef long double  npy_float80
@@ -735,6 +736,7 @@ ctypedef npy_uint64     uint64_t
 #ctypedef npy_uint96     uint96_t
 #ctypedef npy_uint128    uint128_t
 
+ctypedef npy_float16    float16_t
 ctypedef npy_float32    float32_t
 ctypedef npy_float64    float64_t
 #ctypedef npy_float80    float80_t
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 5b1d6c31403cb..1017f9cd7c503 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -406,8 +406,8 @@ def test_2d_float32(self):
         expected[[2, 4]] = np.nan
         tm.assert_almost_equal(result, expected)
 
-        # test with float64 out buffer
-        out = np.empty((len(indexer), arr.shape[1]), dtype='f8')
+        #### this now accepts a float32! # test with float64 out buffer
+        out = np.empty((len(indexer), arr.shape[1]), dtype='float32')
         com.take_2d(arr, indexer, out=out)  # it works!
 
         # axis=1
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
index a14d6027361cc..0e3134d940c99 100644
--- a/pandas/tests/test_format.py
+++ b/pandas/tests/test_format.py
@@ -140,7 +140,8 @@ def test_to_string_repr_unicode(self):
                 line = line.decode(get_option("display.encoding"))
             except:
                 pass
-            self.assert_(len(line) == line_len)
+            if not line.startswith('Dtype:'):
+                self.assert_(len(line) == line_len)
 
         # it works even if sys.stdin in None
         _stdin= sys.stdin
@@ -1056,6 +1057,8 @@ def test_float_trim_zeros(self):
                 2.03954217305e+10, 5.59897817305e+10]
         skip = True
         for line in repr(DataFrame({'A': vals})).split('\n'):
+            if line.startswith('Dtype:'):
+                continue
             if _three_digit_exp():
                 self.assert_(('+010' in line) or skip)
             else:
@@ -1101,7 +1104,7 @@ def test_to_string(self):
         format = '%.4f'.__mod__
         result = self.ts.to_string(float_format=format)
         result = [x.split()[1] for x in result.split('\n')]
-        expected = [format(x) for x in self.ts]
+        expected = [format(x) for x in self.ts] + [u'float64']
         self.assertEqual(result, expected)
 
         # empty string
@@ -1116,7 +1119,7 @@ def test_to_string(self):
         cp.name = 'foo'
         result = cp.to_string(length=True, name=True)
         last_line = result.split('\n')[-1].strip()
-        self.assertEqual(last_line, "Freq: B, Name: foo, Length: %d" % len(cp))
+        self.assertEqual(last_line, "Freq: B, Name: foo, Length: %d, Dtype: float64" % len(cp))
 
     def test_freq_name_separation(self):
         s = Series(np.random.randn(10),
@@ -1131,7 +1134,8 @@ def test_to_string_mixed(self):
         expected = (u'0     foo\n'
                     u'1     NaN\n'
                     u'2   -1.23\n'
-                    u'3    4.56')
+                    u'3    4.56\n'
+                    u'Dtype: object')
         self.assertEqual(result, expected)
 
         # but don't count NAs as floats
@@ -1140,7 +1144,8 @@ def test_to_string_mixed(self):
         expected = (u'0    foo\n'
                     '1    NaN\n'
                     '2    bar\n'
-                    '3    baz')
+                    '3    baz\n'
+                    u'Dtype: object')
         self.assertEqual(result, expected)
 
         s = Series(['foo', 5, 'bar', 'baz'])
@@ -1148,7 +1153,8 @@ def test_to_string_mixed(self):
         expected = (u'0    foo\n'
                     '1      5\n'
                     '2    bar\n'
-                    '3    baz')
+                    '3    baz\n'
+                    u'Dtype: object')
         self.assertEqual(result, expected)
 
     def test_to_string_float_na_spacing(self):
@@ -1160,7 +1166,8 @@ def test_to_string_float_na_spacing(self):
                     '1    1.5678\n'
                     '2       NaN\n'
                     '3   -3.0000\n'
-                    '4       NaN')
+                    '4       NaN\n'
+                    u'Dtype: float64')
         self.assertEqual(result, expected)
 
     def test_unicode_name_in_footer(self):
@@ -1172,6 +1179,8 @@ def test_float_trim_zeros(self):
         vals = [2.08430917305e+10, 3.52205017305e+10, 2.30674817305e+10,
                 2.03954217305e+10, 5.59897817305e+10]
         for line in repr(Series(vals)).split('\n'):
+            if line.startswith('Dtype:'):
+                continue
             if _three_digit_exp():
                 self.assert_('+010' in line)
             else:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 09747ba3f09f0..03fdd53ce19af 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -46,7 +46,38 @@ def _skip_if_no_scipy():
 # DataFrame test cases
 
 JOIN_TYPES = ['inner', 'outer', 'left', 'right']
-
+MIXED_FLOAT_DTYPES = ['float16','float32','float64']
+MIXED_INT_DTYPES   = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64']
+
+def _check_mixed_float(df, dtype = None):
+    dtypes = dict(A = 'float32', B = 'float32', C = 'float16', D = 'float64')
+    if isinstance(dtype, basestring):
+        dtypes = dict([ (k,dtype) for k, v in dtypes.items() ])
+    elif isinstance(dtype, dict):
+        dtypes.update(dtype)
+    if dtypes.get('A'):
+        assert(df.dtypes['A'] == dtypes['A'])
+    if dtypes.get('B'):
+        assert(df.dtypes['B'] == dtypes['B'])
+    if dtypes.get('C'):
+        assert(df.dtypes['C'] == dtypes['C'])
+    if dtypes.get('D'):
+        assert(df.dtypes['D'] == dtypes['D'])
+
+def _check_mixed_int(df, dtype = None):
+    dtypes = dict(A = 'int32', B = 'uint64', C = 'uint8', D = 'int64')
+    if isinstance(dtype, basestring):
+        dtypes = dict([ (k,dtype) for k, v in dtypes.items() ])
+    elif isinstance(dtype, dict):
+        dtypes.update(dtype)
+    if dtypes.get('A'):
+        assert(df.dtypes['A'] == dtypes['A'])
+    if dtypes.get('B'):
+        assert(df.dtypes['B'] == dtypes['B'])
+    if dtypes.get('C'):
+        assert(df.dtypes['C'] == dtypes['C'])
+    if dtypes.get('D'):
+        assert(df.dtypes['D'] == dtypes['D'])
 
 class CheckIndexing(object):
 
@@ -121,6 +152,7 @@ def test_getitem_list(self):
         self.assertEqual(result.columns.name, 'sth')
 
     def test_setitem_list(self):
+
         self.frame['E'] = 'foo'
         data = self.frame[['A', 'B']]
         self.frame[['B', 'A']] = data
@@ -128,11 +160,11 @@ def test_setitem_list(self):
         assert_series_equal(self.frame['B'], data['A'])
         assert_series_equal(self.frame['A'], data['B'])
 
-        df = DataFrame(0, range(3), ['tt1', 'tt2'])
+        df = DataFrame(0, range(3), ['tt1', 'tt2'], dtype=np.int_)
         df.ix[1, ['tt1', 'tt2']] = [1, 2]
 
         result = df.ix[1, ['tt1', 'tt2']]
-        expected = Series([1, 2], df.columns)
+        expected = Series([1, 2], df.columns, dtype=np.int_)
         assert_series_equal(result, expected)
 
         df['tt1'] = df['tt2'] = '0'
@@ -171,14 +203,43 @@ def test_getitem_boolean(self):
 
         self.assertRaises(ValueError, self.tsframe.__getitem__, self.tsframe)
 
-        # test df[df >0] works
-        bif = self.tsframe[self.tsframe > 0]
-        bifw = DataFrame(np.where(self.tsframe > 0, self.tsframe, np.nan),
-                         index=self.tsframe.index, columns=self.tsframe.columns)
-        self.assert_(isinstance(bif, DataFrame))
-        self.assert_(bif.shape == self.tsframe.shape)
-        assert_frame_equal(bif, bifw)
+        # test df[df > 0]
+        for df in [ self.tsframe, self.mixed_frame, self.mixed_float, self.mixed_int ]:
+
+            data = df._get_numeric_data()
+            bif = df[df > 0]
+            bifw = DataFrame(dict([ (c,np.where(data[c] > 0, data[c], np.nan)) for c in data.columns ]),
+                             index=data.index, columns=data.columns)
+
+            # add back other columns to compare
+            for c in df.columns:
+                if c not in bifw:
+                    bifw[c] = df[c]
+            bifw = bifw.reindex(columns = df.columns)
+
+            assert_frame_equal(bif, bifw, check_dtype=False)
+            for c in df.columns:
+                if bif[c].dtype != bifw[c].dtype:
+                    self.assert_(bif[c].dtype == df[c].dtype)
+
 
+    def test_getitem_boolean_casting(self):
+
+        #### this currently disabled ###
+
+        # don't upcast if we don't need to
+        df = self.tsframe.copy()
+        df['E'] = 1
+        df['E'] = df['E'].astype('int32')
+        df['F'] = 1
+        df['F'] = df['F'].astype('int64')
+        casted = df[df>0]
+        result = casted.get_dtype_counts()
+        #expected = Series({'float64': 4, 'int32' : 1, 'int64' : 1})
+        expected = Series({'float64': 6 })
+        assert_series_equal(result, expected)
+
+                    
     def test_getitem_boolean_list(self):
         df = DataFrame(np.arange(12).reshape(3, 4))
 
@@ -194,9 +255,9 @@ def _checkit(lst):
     def test_getitem_boolean_iadd(self):
         arr = randn(5, 5)
 
-        df = DataFrame(arr.copy())
-        df[df < 0] += 1
+        df = DataFrame(arr.copy(), columns = ['A','B','C','D','E'])
 
+        df[df < 0] += 1
         arr[arr < 0] += 1
 
         assert_almost_equal(df.values, arr)
@@ -341,7 +402,7 @@ def test_setitem_cast(self):
 
         # #669, should not cast?
         self.frame['B'] = 0
-        self.assert_(self.frame['B'].dtype == np.float64)
+        self.assert_(self.frame['B'].dtype == np.float_)
 
         # cast if pass array of course
         self.frame['B'] = np.arange(len(self.frame))
@@ -349,18 +410,18 @@ def test_setitem_cast(self):
 
         self.frame['foo'] = 'bar'
         self.frame['foo'] = 0
-        self.assert_(self.frame['foo'].dtype == np.int64)
+        self.assert_(self.frame['foo'].dtype == np.int_)
 
         self.frame['foo'] = 'bar'
         self.frame['foo'] = 2.5
-        self.assert_(self.frame['foo'].dtype == np.float64)
+        self.assert_(self.frame['foo'].dtype == np.float_)
 
         self.frame['something'] = 0
-        self.assert_(self.frame['something'].dtype == np.int64)
+        self.assert_(self.frame['something'].dtype == np.int_)
         self.frame['something'] = 2
-        self.assert_(self.frame['something'].dtype == np.int64)
+        self.assert_(self.frame['something'].dtype == np.int_)
         self.frame['something'] = 2.5
-        self.assert_(self.frame['something'].dtype == np.float64)
+        self.assert_(self.frame['something'].dtype == np.float_)
 
     def test_setitem_boolean_column(self):
         expected = self.frame.copy()
@@ -395,7 +456,7 @@ def test_setitem_corner(self):
         self.assertEqual(dm.values.dtype, np.object_)
 
         dm['C'] = 1
-        self.assertEqual(dm['C'].dtype, np.int64)
+        self.assertEqual(dm['C'].dtype, np.int_)
 
         # set existing column
         dm['A'] = 'bar'
@@ -1114,10 +1175,6 @@ def test_setitem_single_column_mixed_datetime(self):
         self.assertRaises(
             Exception, df.ix.__setitem__, ('d', 'timestamp'), [nan])
 
-        # prior to 0.10.1 this failed
-        # self.assertRaises(TypeError, df.ix.__setitem__, ('c','timestamp'),
-        # nan)
-
     def test_setitem_frame(self):
         piece = self.frame.ix[:2, ['A', 'B']]
         self.frame.ix[-2:, ['A', 'B']] = piece.values
@@ -1562,10 +1619,30 @@ def setUp(self):
 
         self.frame = _frame.copy()
         self.frame2 = _frame2.copy()
-        self.intframe = _intframe.copy()
+
+        # force these all to int64 to avoid platform testing issues
+        self.intframe = DataFrame(dict([ (c,s) for c,s in _intframe.iteritems() ]), dtype = np.int64)
         self.tsframe = _tsframe.copy()
         self.mixed_frame = _mixed_frame.copy()
-
+        self.mixed_float  = DataFrame({ 'A': _frame['A'].copy().astype('float32'),
+                                        'B': _frame['B'].copy().astype('float32'),
+                                        'C': _frame['C'].copy().astype('float16'),
+                                        'D': _frame['D'].copy().astype('float64') })
+        self.mixed_float2 = DataFrame({ 'A': _frame2['A'].copy().astype('float32'),
+                                        'B': _frame2['B'].copy().astype('float32'),
+                                        'C': _frame2['C'].copy().astype('float16'),
+                                        'D': _frame2['D'].copy().astype('float64') })
+        self.mixed_int    = DataFrame({ 'A': _intframe['A'].copy().astype('int32'),
+                                        'B': np.ones(len(_intframe['B']),dtype='uint64'),
+                                        'C': _intframe['C'].copy().astype('uint8'),
+                                        'D': _intframe['D'].copy().astype('int64') })
+        self.all_mixed    = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float32' : np.array([1.]*10,dtype='float32'), 
+                                       'int32' : np.array([1]*10,dtype='int32'),
+                                       }, index=np.arange(10))
+        #self.all_mixed    = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float32' : np.array([1.]*10,dtype='float32'), 
+        #                               'int32' : np.array([1]*10,dtype='int32'), 'timestamp' : Timestamp('20010101'),
+        #                               }, index=np.arange(10))
+        
         self.ts1 = tm.makeTimeSeries()
         self.ts2 = tm.makeTimeSeries()[5:]
         self.ts3 = tm.makeTimeSeries()[-5:]
@@ -1806,6 +1883,44 @@ def test_constructor_dtype_list_data(self):
         self.assert_(df.ix[1, 0] is None)
         self.assert_(df.ix[0, 1] == '2')
 
+    def test_constructor_mixed_dtypes(self):
+
+        def _make_mixed_dtypes_df(typ, ad = None):
+
+            if typ == 'int':
+                dtypes = MIXED_INT_DTYPES
+                arrays = [ np.array(np.random.rand(10), dtype = d) for d in dtypes ]
+            elif typ == 'float':
+                dtypes = MIXED_FLOAT_DTYPES
+                arrays = [ np.array(np.random.randint(10, size=10), dtype = d) for d in dtypes ]
+        
+            zipper = zip(dtypes,arrays)
+            for d,a in zipper:
+                assert(a.dtype == d)
+            if ad is None:
+                ad = dict()
+            ad.update(dict([ (d,a) for d,a in zipper ]))
+            return DataFrame(ad)
+
+        def _check_mixed_dtypes(df, dtypes = None):
+            if dtypes is None:
+                dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES
+            for d in dtypes:
+                if d in df:
+                    assert(df.dtypes[d] == d)
+
+        # mixed floating and integer coexinst in the same frame
+        df     = _make_mixed_dtypes_df('float')
+        _check_mixed_dtypes(df)
+
+        # add lots of types
+        df     = _make_mixed_dtypes_df('float', dict(A = 1, B = 'foo', C = 'bar'))
+        _check_mixed_dtypes(df)
+
+        # GH 622
+        df     = _make_mixed_dtypes_df('int')
+        _check_mixed_dtypes(df)
+        
     def test_constructor_rec(self):
         rec = self.frame.to_records(index=False)
 
@@ -1975,7 +2090,7 @@ def test_constructor_dict_of_tuples(self):
 
         result = DataFrame(data)
         expected = DataFrame(dict((k, list(v)) for k, v in data.iteritems()))
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result, expected, check_dtype=False)
 
     def test_constructor_ndarray(self):
         mat = np.zeros((2, 3), dtype=float)
@@ -1988,7 +2103,7 @@ def test_constructor_ndarray(self):
 
         # cast type
         frame = DataFrame(mat, columns=['A', 'B', 'C'],
-                          index=[1, 2], dtype=int)
+                          index=[1, 2], dtype=np.int64)
         self.assert_(frame.values.dtype == np.int64)
 
         # 1-D input
@@ -2040,7 +2155,7 @@ def test_constructor_maskedarray(self):
 
         # cast type
         frame = DataFrame(mat, columns=['A', 'B', 'C'],
-                          index=[1, 2], dtype=int)
+                          index=[1, 2], dtype=np.int64)
         self.assert_(frame.values.dtype == np.int64)
 
         # Check non-masked values
@@ -2098,7 +2213,7 @@ def test_constructor_maskedarray_nonfloat(self):
 
         # cast type
         frame = DataFrame(mat, columns=['A', 'B', 'C'],
-                          index=[1, 2], dtype=float)
+                          index=[1, 2], dtype=np.float64)
         self.assert_(frame.values.dtype == np.float64)
 
         # Check non-masked values
@@ -2174,9 +2289,9 @@ def test_constructor_scalar_inference(self):
                 'float': 3., 'complex': 4j, 'object': 'foo'}
         df = DataFrame(data, index=np.arange(10))
 
-        self.assert_(df['int'].dtype == np.int64)
+        self.assert_(df['int'].dtype == np.int_)
         self.assert_(df['bool'].dtype == np.bool_)
-        self.assert_(df['float'].dtype == np.float64)
+        self.assert_(df['float'].dtype == np.float_)
         self.assert_(df['complex'].dtype == np.complex128)
         self.assert_(df['object'].dtype == np.object_)
 
@@ -2192,7 +2307,7 @@ def test_constructor_DataFrame(self):
         df = DataFrame(self.frame)
         assert_frame_equal(df, self.frame)
 
-        df_casted = DataFrame(self.frame, dtype=int)
+        df_casted = DataFrame(self.frame, dtype=np.int64)
         self.assert_(df_casted.values.dtype == np.int64)
 
     def test_constructor_more(self):
@@ -2229,7 +2344,7 @@ def test_constructor_more(self):
 
         # int cast
         dm = DataFrame({'A': np.ones(10, dtype=int),
-                        'B': np.ones(10, dtype=float)},
+                        'B': np.ones(10, dtype=np.float64)},
                        index=np.arange(10))
 
         self.assertEqual(len(dm.columns), 2)
@@ -2339,7 +2454,7 @@ def test_constructor_scalar(self):
         idx = Index(range(3))
         df = DataFrame({"a": 0}, index=idx)
         expected = DataFrame({"a": [0, 0, 0]}, index=idx)
-        assert_frame_equal(df, expected)
+        assert_frame_equal(df, expected, check_dtype=False)
 
     def test_constructor_Series_copy_bug(self):
         df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A'])
@@ -2523,6 +2638,12 @@ def test_astype(self):
                              columns=self.frame.columns)
         assert_frame_equal(casted, expected)
 
+        casted = self.frame.astype(np.int32)
+        expected = DataFrame(self.frame.values.astype(np.int32),
+                             index=self.frame.index,
+                             columns=self.frame.columns)
+        assert_frame_equal(casted, expected)
+
         self.frame['foo'] = '5'
         casted = self.frame.astype(int)
         expected = DataFrame(self.frame.values.astype(int),
@@ -2530,6 +2651,81 @@ def test_astype(self):
                              columns=self.frame.columns)
         assert_frame_equal(casted, expected)
 
+        # mixed casting
+        def _check_cast(df, v):
+            self.assert_(list(set([ s.dtype.name for _, s in df.iteritems() ]))[0] == v)
+
+        mn = self.all_mixed._get_numeric_data().copy()
+        mn['little_float'] = np.array(12345.,dtype='float16')
+        mn['big_float']    = np.array(123456789101112.,dtype='float64')
+
+        casted = mn.astype('float64')
+        _check_cast(casted, 'float64')
+
+        casted = mn.astype('int64')
+        _check_cast(casted, 'int64')
+
+        casted = self.mixed_float.reindex(columns = ['A','B']).astype('float32')
+        _check_cast(casted, 'float32')
+
+        casted = mn.reindex(columns = ['little_float']).astype('float16')
+        _check_cast(casted, 'float16')
+
+        casted = self.mixed_float.reindex(columns = ['A','B']).astype('float16')
+        _check_cast(casted, 'float16')
+
+        casted = mn.astype('float32')
+        _check_cast(casted, 'float32')
+
+        casted = mn.astype('int32')
+        _check_cast(casted, 'int32')
+
+        # to object
+        casted = mn.astype('O')
+        _check_cast(casted, 'object')
+
+    def test_astype_with_exclude_string(self):
+        df = self.frame.copy()
+        expected = self.frame.astype(int)
+        df['string'] = 'foo'
+        casted = df.astype(int, raise_on_error = False)
+    
+        expected['string'] = 'foo'
+        assert_frame_equal(casted, expected)
+
+        df = self.frame.copy()
+        expected = self.frame.astype(np.int32)
+        df['string'] = 'foo'
+        casted = df.astype(np.int32, raise_on_error = False)
+
+        expected['string'] = 'foo'
+        assert_frame_equal(casted, expected)
+
+    def test_astype_with_view(self):
+
+        tf = self.mixed_float.reindex(columns = ['A','B','C'])
+        self.assertRaises(TypeError, self.frame.astype, np.int32, copy = False)
+
+        self.assertRaises(TypeError, tf, np.int32, copy = False)
+
+        self.assertRaises(TypeError, tf, np.int64, copy = False)
+        casted = tf.astype(np.int64)
+
+        self.assertRaises(TypeError, tf, np.float32, copy = False)
+        casted = tf.astype(np.float32)
+
+        # this is the only real reason to do it this way
+        tf = np.round(self.frame).astype(np.int32)
+        casted = tf.astype(np.float32, copy = False)
+        #self.assert_(casted.values.data == tf.values.data)
+
+        tf = self.frame.astype(np.float64)
+        casted = tf.astype(np.int64, copy = False)
+        #self.assert_(casted.values.data == tf.values.data)
+
+        # can't view to an object array
+        self.assertRaises(Exception, self.frame.astype, 'O', copy = False)
+
     def test_astype_cast_nan_int(self):
         df = DataFrame(data={"Values": [1.0, 2.0, 3.0, np.nan]})
         self.assertRaises(ValueError, df.astype, np.int64)
@@ -2634,7 +2830,7 @@ def _check_all_orients(df, dtype=None):
         # dtypes
         _check_all_orients(DataFrame(biggie, dtype=np.float64),
                            dtype=np.float64)
-        _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int)
+        _check_all_orients(DataFrame(biggie, dtype=np.int64), dtype=np.int64)
         _check_all_orients(DataFrame(biggie, dtype='<U3'), dtype='<U3')
 
         # empty
@@ -2760,17 +2956,20 @@ def test_from_records_nones(self):
         self.assert_(np.isnan(df['c'][0]))
 
     def test_from_records_iterator(self):
-        arr = np.array([(1.0, 2), (3.0, 4), (5., 6), (7., 8)],
-                       dtype=[('x', float), ('y', int)])
+        arr = np.array([(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5., 5., 6, 6), (7., 7., 8, 8)],
+                       dtype=[('x', np.float64), ('u', np.float32), ('y', np.int64), ('z', np.int32) ])
         df = DataFrame.from_records(iter(arr), nrows=2)
-        xp = DataFrame({'x': np.array([1.0, 3.0], dtype=float),
-                        'y': np.array([2, 4], dtype=int)})
-        assert_frame_equal(df, xp)
+        xp = DataFrame({'x': np.array([1.0, 3.0], dtype=np.float64),
+                        'u': np.array([1.0, 3.0], dtype=np.float32),
+                        'y': np.array([2, 4], dtype=np.int64),
+                        'z': np.array([2, 4], dtype=np.int32)})
+        assert_frame_equal(df.reindex_like(xp), xp)
 
+        # no dtypes specified here, so just compare with the default
         arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)]
         df = DataFrame.from_records(iter(arr), columns=['x', 'y'],
                                     nrows=2)
-        assert_frame_equal(df, xp)
+        assert_frame_equal(df, xp.reindex(columns=['x','y']), check_dtype=False)
 
     def test_from_records_columns_not_modified(self):
         tuples = [(1, 2, 3),
@@ -2867,30 +3066,60 @@ def test_join_str_datetime(self):
         self.assert_(len(tst.columns) == 3)
 
     def test_from_records_sequencelike(self):
-        df = DataFrame({'A': np.random.randn(6),
-                        'B': np.arange(6),
-                        'C': ['foo'] * 6,
-                        'D': np.array([True, False] * 3, dtype=bool)})
-
-        tuples = [tuple(x) for x in df.values]
-        lists = [list(x) for x in tuples]
-        asdict = dict((x, y) for x, y in df.iteritems())
-
-        result = DataFrame.from_records(tuples, columns=df.columns)
-        result2 = DataFrame.from_records(lists, columns=df.columns)
-        result3 = DataFrame.from_records(asdict, columns=df.columns)
-
-        assert_frame_equal(result, df)
+        df = DataFrame({'A' : np.array(np.random.randn(6), dtype = np.float64),
+                        'A1': np.array(np.random.randn(6), dtype = np.float64),
+                        'B' : np.array(np.arange(6), dtype = np.int64),
+                        'C' : ['foo'] * 6,
+                        'D' : np.array([True, False] * 3, dtype=bool),
+                        'E' : np.array(np.random.randn(6), dtype = np.float32),
+                        'E1': np.array(np.random.randn(6), dtype = np.float32),
+                        'F' : np.array(np.arange(6), dtype = np.int32) })
+
+        # this is actually tricky to create the recordlike arrays and have the dtypes be intact
+        blocks = df.blocks
+        tuples = []
+        columns = []
+        dtypes  = []
+        for dtype, b in blocks.iteritems():
+            columns.extend(b.columns)
+            dtypes.extend([ (c,np.dtype(dtype).descr[0][1]) for c in b.columns ])
+        for i in xrange(len(df.index)):
+            tup = []
+            for _, b in blocks.iteritems():
+                tup.extend(b.irow(i).values)
+            tuples.append(tuple(tup))
+       
+        recarray  = np.array(tuples, dtype=dtypes).view(np.recarray)
+        recarray2 = df.to_records()
+        lists     = [list(x) for x in tuples]
+
+        # tuples (lose the dtype info)
+        result  = DataFrame.from_records(tuples,    columns=columns).reindex(columns=df.columns)
+
+        # created recarray and with to_records recarray (have dtype info)
+        result2 = DataFrame.from_records(recarray,  columns=columns).reindex(columns=df.columns)
+        result3 = DataFrame.from_records(recarray2, columns=columns).reindex(columns=df.columns)
+
+        # list of tupels (no dtype info)
+        result4 = DataFrame.from_records(lists,     columns=columns).reindex(columns=df.columns)
+
+        assert_frame_equal(result, df, check_dtype=False)
         assert_frame_equal(result2, df)
         assert_frame_equal(result3, df)
+        assert_frame_equal(result4, df, check_dtype=False)
 
+        # tuples is in the order of the columns
         result = DataFrame.from_records(tuples)
-        self.assert_(np.array_equal(result.columns, range(4)))
+        self.assert_(np.array_equal(result.columns, range(8)))
 
-        # test exclude parameter
-        result = DataFrame.from_records(tuples, exclude=[0, 1, 3])
-        result.columns = ['C']
-        assert_frame_equal(result, df[['C']])
+        # test exclude parameter & we are casting the results here (as we don't have dtype info to recover)
+        columns_to_test = [ columns.index('C'), columns.index('E1') ]
+
+        exclude = list(set(xrange(8))-set(columns_to_test))
+        result = DataFrame.from_records(tuples, exclude=exclude)
+        result.columns = [ columns[i] for i in sorted(columns_to_test) ]
+        assert_series_equal(result['C'], df['C'])
+        assert_series_equal(result['E1'], df['E1'].astype('float64'))
 
         # empty case
         result = DataFrame.from_records([], columns=['foo', 'bar', 'baz'])
@@ -2901,6 +3130,35 @@ def test_from_records_sequencelike(self):
         self.assertEqual(len(result), 0)
         self.assertEqual(len(result.columns), 0)
 
+    def test_from_records_dictlike(self):
+
+        # test the dict methods
+        df = DataFrame({'A' : np.array(np.random.randn(6), dtype = np.float64),
+                        'A1': np.array(np.random.randn(6), dtype = np.float64),
+                        'B' : np.array(np.arange(6), dtype = np.int64),
+                        'C' : ['foo'] * 6,
+                        'D' : np.array([True, False] * 3, dtype=bool),
+                        'E' : np.array(np.random.randn(6), dtype = np.float32),
+                        'E1': np.array(np.random.randn(6), dtype = np.float32),
+                        'F' : np.array(np.arange(6), dtype = np.int32) })
+
+        # columns is in a different order here than the actual items iterated from the dict
+        columns = []
+        for dtype, b in df.blocks.iteritems():
+            columns.extend(b.columns)
+
+        asdict    = dict((x, y) for x, y in df.iteritems())
+        asdict2   = dict((x, y.values) for x, y in df.iteritems())
+
+        # dict of series & dict of ndarrays (have dtype info)
+        results = []
+        results.append(DataFrame.from_records(asdict).reindex(columns=df.columns))
+        results.append(DataFrame.from_records(asdict,    columns=columns).reindex(columns=df.columns))
+        results.append(DataFrame.from_records(asdict2,   columns=columns).reindex(columns=df.columns))
+
+        for r in results:
+            assert_frame_equal(r, df)
+
     def test_from_records_with_index_data(self):
         df = DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
 
@@ -3127,6 +3385,22 @@ def test_insert(self):
         self.assert_(np.array_equal(df.columns, ['foo', 'c', 'bar', 'b', 'a']))
         assert_almost_equal(df['c'], df['bar'])
 
+        # diff dtype
+
+        # new item
+        df['x'] = df['a'].astype('float32')
+        result = Series(dict(float64 = 5, float32 = 1))
+        self.assert_((df.get_dtype_counts() == result).all() == True)
+
+        # replacing current (in different block)
+        df['a'] = df['a'].astype('float32')
+        result = Series(dict(float64 = 4, float32 = 2))
+        self.assert_((df.get_dtype_counts() == result).all() == True)
+
+        df['y'] = df['a'].astype('int32')
+        result = Series(dict(float64 = 4, float32 = 2, int32 = 1))
+        self.assert_((df.get_dtype_counts() == result).all() == True)
+
         self.assertRaises(Exception, df.insert, 1, 'a', df['b'])
         self.assertRaises(Exception, df.insert, 1, 'c', df['b'])
 
@@ -3285,10 +3559,13 @@ def _check_unary_op(op):
         _check_unary_op(operator.neg)
 
     def test_logical_typeerror(self):
-        self.assertRaises(TypeError, self.frame.__eq__, 'foo')
-        self.assertRaises(TypeError, self.frame.__lt__, 'foo')
-        self.assertRaises(TypeError, self.frame.__gt__, 'foo')
-        self.assertRaises(TypeError, self.frame.__ne__, 'foo')
+        if py3compat.PY3:
+            pass
+        else:
+            self.assertRaises(TypeError, self.frame.__eq__, 'foo')
+            self.assertRaises(TypeError, self.frame.__lt__, 'foo')
+            self.assertRaises(TypeError, self.frame.__gt__, 'foo')
+            self.assertRaises(TypeError, self.frame.__ne__, 'foo')
 
     def test_constructor_lists_to_object_dtype(self):
         # from #1074
@@ -3339,6 +3616,24 @@ def test_arith_flex_frame(self):
             exp = f(self.frame, 2 * self.frame)
             assert_frame_equal(result, exp)
 
+            # vs mix float
+            result = getattr(self.mixed_float, op)(2 * self.mixed_float)
+            exp = f(self.mixed_float, 2 * self.mixed_float)
+            assert_frame_equal(result, exp)
+            _check_mixed_float(result)
+
+            # vs mix int
+            if op in ['add','sub','mul']:
+                result = getattr(self.mixed_int, op)(2 + self.mixed_int)
+                exp = f(self.mixed_int, 2 + self.mixed_int)
+
+                # overflow in the uint
+                dtype = None
+                if op in ['sub']:
+                    dtype = dict(B = 'object')
+                assert_frame_equal(result, exp)
+                _check_mixed_int(result, dtype = dtype)
+
         # res_add = self.frame.add(self.frame)
         # res_sub = self.frame.sub(self.frame)
         # res_mul = self.frame.mul(self.frame)
@@ -3617,6 +3912,22 @@ def test_combineFrame(self):
 
         assert_frame_equal(reverse + self.frame, self.frame * 2)
 
+        # mix vs float64, upcast
+        added = self.frame + self.mixed_float
+        _check_mixed_float(added, dtype = 'float64')
+        added = self.mixed_float + self.frame
+        _check_mixed_float(added, dtype = 'float64')
+
+        # mix vs mix
+        added = self.mixed_float + self.mixed_float2
+        _check_mixed_float(added)
+        added = self.mixed_float2 + self.mixed_float
+        _check_mixed_float(added)
+
+        # with int
+        added = self.frame + self.mixed_int
+        _check_mixed_float(added, dtype = 'float64')
+
     def test_combineSeries(self):
 
         # Series
@@ -3637,6 +3948,20 @@ def test_combineSeries(self):
         self.assert_('E' in larger_added)
         self.assert_(np.isnan(larger_added['E']).all())
 
+        # vs mix (upcast) as needed
+        added = self.mixed_float + series
+        _check_mixed_float(added, dtype = 'float64')
+        added = self.mixed_float + series.astype('float32')
+        _check_mixed_float(added, dtype = dict(C = 'float32'))
+        added = self.mixed_float + series.astype('float16')
+        _check_mixed_float(added)
+
+        # vs int
+        added = self.mixed_int + (100*series).astype('int64')
+        _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = 'int64', D = 'int64'))
+        added = self.mixed_int + (100*series).astype('int32')
+        _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = 'int32', D = 'int64'))
+
         # TimeSeries
         import sys
 
@@ -3678,6 +4003,12 @@ def test_combineFunc(self):
         result = self.frame * 2
         self.assert_(np.array_equal(result.values, self.frame.values * 2))
 
+        # vs mix
+        result = self.mixed_float * 2
+        for c, s in result.iteritems():
+            self.assert_(np.array_equal(s.values, self.mixed_float[c].values * 2))
+        _check_mixed_float(result)
+
         result = self.empty * 2
         self.assert_(result.index is self.empty.index)
         self.assertEqual(len(result.columns), 0)
@@ -4075,11 +4406,40 @@ def test_dtypes(self):
         assert_series_equal(result, expected)
 
     def test_convert_objects(self):
+
         oops = self.mixed_frame.T.T
         converted = oops.convert_objects()
         assert_frame_equal(converted, self.mixed_frame)
         self.assert_(converted['A'].dtype == np.float64)
 
+        # force numeric conversion
+        self.mixed_frame['H'] = '1.'
+        self.mixed_frame['I'] = '1'
+
+        # add in some items that will be nan
+        l = len(self.mixed_frame)
+        self.mixed_frame['J'] = '1.'
+        self.mixed_frame['K'] = '1'
+        self.mixed_frame.ix[0:5,['J','K']] = 'garbled'
+        converted = self.mixed_frame.convert_objects(convert_numeric=True)
+        self.assert_(converted['H'].dtype == 'float64')
+        self.assert_(converted['I'].dtype == 'int64')
+        self.assert_(converted['J'].dtype == 'float64')
+        self.assert_(converted['K'].dtype == 'float64')
+        self.assert_(len(converted['J'].dropna()) == l-5)
+        self.assert_(len(converted['K'].dropna()) == l-5)
+
+        # via astype
+        converted = self.mixed_frame.copy()
+        converted['H'] = converted['H'].astype('float64')
+        converted['I'] = converted['I'].astype('int64')
+        self.assert_(converted['H'].dtype == 'float64')
+        self.assert_(converted['I'].dtype == 'int64')
+
+        # via astype, but errors
+        converted = self.mixed_frame.copy()
+        self.assertRaises(Exception, converted['H'].astype, 'int32')
+
     def test_convert_objects_no_conversion(self):
         mixed1 = DataFrame(
             {'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']})
@@ -4198,6 +4558,11 @@ def test_as_matrix_duplicates(self):
 
         self.assertTrue(np.array_equal(result, expected))
 
+    def test_as_blocks(self):
+        frame = self.mixed_float
+        mat = frame.blocks
+        self.assert_(set([ x.name for x in frame.dtypes.values ]) == set(mat.keys()))
+
     def test_values(self):
         self.frame.values[:, 0] = 5.
         self.assert_((self.frame.values[:, 0] == 5).all())
@@ -4664,12 +5029,27 @@ def test_fillna(self):
         # mixed type
         self.mixed_frame['foo'][5:20] = nan
         self.mixed_frame['A'][-10:] = nan
-
         result = self.mixed_frame.fillna(value=0)
+        result = self.mixed_frame.fillna(method='pad')
 
         self.assertRaises(ValueError, self.tsframe.fillna)
         self.assertRaises(ValueError, self.tsframe.fillna, 5, method='ffill')
 
+        # mixed numeric (but no float16)
+        mf = self.mixed_float.reindex(columns=['A','B','D'])
+        mf['A'][-10:] = nan
+        result = mf.fillna(value=0)
+        _check_mixed_float(result, dtype = dict(C = None))
+
+        result = mf.fillna(method='pad')
+        _check_mixed_float(result, dtype = dict(C = None))
+
+        # empty frame (GH #2778)
+        df = DataFrame(columns=['x'])
+        for m in ['pad','backfill']:
+            df.x.fillna(method=m,inplace=1)
+            df.x.fillna(method=m)
+
     def test_ffill(self):
         self.tsframe['A'][:5] = nan
         self.tsframe['A'][-5:] = nan
@@ -4845,6 +5225,27 @@ def test_replace_interpolate(self):
         expected = self.tsframe.fillna(method='bfill')
         assert_frame_equal(result, expected)
 
+    def test_replace_for_new_dtypes(self):
+
+        # dtypes
+        tsframe = self.tsframe.copy().astype(np.float32)
+        tsframe['A'][:5] = nan
+        tsframe['A'][-5:] = nan
+
+        zero_filled = tsframe.replace(nan, -1e8)
+        assert_frame_equal(zero_filled, tsframe.fillna(-1e8))
+        assert_frame_equal(zero_filled.replace(-1e8, nan), tsframe)
+
+        tsframe['A'][:5] = nan
+        tsframe['A'][-5:] = nan
+        tsframe['B'][:5] = -1e8
+
+        b = tsframe['B']
+        b[b == -1e8] = nan
+        tsframe['B'] = b
+        result = tsframe.fillna(method='bfill')
+        assert_frame_equal(result, tsframe.fillna(method='bfill'))
+
     def test_replace_dtypes(self):
         # int
         df = DataFrame({'ints': [1, 2, 3]})
@@ -4852,6 +5253,16 @@ def test_replace_dtypes(self):
         expected = DataFrame({'ints': [0, 2, 3]})
         assert_frame_equal(result, expected)
 
+        df = DataFrame({'ints': [1, 2, 3]}, dtype=np.int32)
+        result = df.replace(1, 0)
+        expected = DataFrame({'ints': [0, 2, 3]}, dtype=np.int32)
+        assert_frame_equal(result, expected)
+
+        df = DataFrame({'ints': [1, 2, 3]}, dtype=np.int16)
+        result = df.replace(1, 0)
+        expected = DataFrame({'ints': [0, 2, 3]}, dtype=np.int16)
+        assert_frame_equal(result, expected)
+
         # bools
         df = DataFrame({'bools': [True, False, True]})
         result = df.replace(False, True)
@@ -4961,6 +5372,16 @@ def test_replace_limit(self):
         assert_frame_equal(padded, self.tsframe.fillna(method='bfill',
                                                        axis=1, limit=2))
 
+    def test_combine_multiple_frames_dtypes(self):
+        from pandas import concat
+
+        # GH 2759
+        A = DataFrame(data=np.ones((10, 2)), columns=['foo', 'bar'], dtype=np.float64)
+        B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
+        results = concat((A, B), axis=1).get_dtype_counts()
+        expected = Series(dict( float64 = 2, float32 = 2 ))
+        assert_series_equal(results,expected)
+
     def test_truncate(self):
         offset = datetools.bday
 
@@ -5324,6 +5745,15 @@ def test_align(self):
                                   method=None, fill_value=0)
         self.assert_(bf.index.equals(Index([])))
 
+        # mixed floats/ints
+        af, bf = self.mixed_float.align(other.ix[:, 0], join='inner', axis=1,
+                                        method=None, fill_value=0)
+        self.assert_(bf.index.equals(Index([])))
+
+        af, bf = self.mixed_int.align(other.ix[:, 0], join='inner', axis=1,
+                                        method=None, fill_value=0)
+        self.assert_(bf.index.equals(Index([])))
+
         # try to align dataframe to series along bad axis
         self.assertRaises(ValueError, self.frame.align, af.ix[0, :3],
                           join='inner', axis=2)
@@ -5405,8 +5835,9 @@ def _check_align_fill(self, kind, meth, ax, fax):
 
     def test_align_int_fill_bug(self):
         # GH #910
-        X = np.random.rand(10, 10)
+        X = np.arange(10*10, dtype='float64').reshape(10, 10)
         Y = np.ones((10, 1), dtype=int)
+
         df1 = DataFrame(X)
         df1['0.X'] = Y.squeeze()
 
@@ -5417,45 +5848,130 @@ def test_align_int_fill_bug(self):
         assert_frame_equal(result, expected)
 
     def test_where(self):
-        df = DataFrame(np.random.randn(5, 3))
-        cond = df > 0
-
-        other1 = df + 1
-        rs = df.where(cond, other1)
-        rs2 = df.where(cond.values, other1)
-        for k, v in rs.iteritems():
-            assert_series_equal(v, np.where(cond[k], df[k], other1[k]))
-        assert_frame_equal(rs, rs2)
-
-        # it works!
-        rs = df.where(cond[1:], other1)
-
-        other2 = (df + 1).values
-        rs = df.where(cond, other2)
-        for k, v in rs.iteritems():
-            assert_series_equal(v, np.where(cond[k], df[k], other2[:, k]))
-
-        other5 = np.nan
-        rs = df.where(cond, other5)
-        for k, v in rs.iteritems():
-            assert_series_equal(v, np.where(cond[k], df[k], other5))
+        default_frame = DataFrame(np.random.randn(5, 3),columns=['A','B','C'])
+
+        def _safe_add(df):
+            # only add to the numeric items
+            return DataFrame(dict([ (c,s+1) if issubclass(s.dtype.type, (np.integer,np.floating)) else (c,s) for c, s in df.iteritems() ]))
+
+        def _check_get(df, cond, check_dtypes = True):
+            other1 = _safe_add(df)
+            rs = df.where(cond, other1)
+            rs2 = df.where(cond.values, other1)
+            for k, v in rs.iteritems():
+                assert_series_equal(v, np.where(cond[k], df[k], other1[k]))
+            assert_frame_equal(rs, rs2)
+
+            # dtypes
+            if check_dtypes:
+                self.assert_((rs.dtypes == df.dtypes).all() == True)
+
+
+        # check getting
+        for df in [ default_frame, self.mixed_frame, self.mixed_float, self.mixed_int ]:
+            cond = df > 0
+            _check_get(df, cond)
+
+        # aligning
+        def _check_align(df, cond, other, check_dtypes = True):
+            rs = df.where(cond, other)
+            for i, k in enumerate(rs.columns):
+                v = rs[k]
+                d = df[k].values
+                c = cond[k].reindex(df[k].index).fillna(False).values
+
+                if np.isscalar(other):
+                    o = other
+                else:
+                    if isinstance(other,np.ndarray):
+                        o = Series(other[:,i],index=v.index).values
+                    else:
+                        o = other[k].values
+                        
+                assert_series_equal(v, Series(np.where(c, d, o),index=v.index))
+            
+            # dtypes
+            # can't check dtype when other is an ndarray
+            if check_dtypes and not isinstance(other,np.ndarray):
+                self.assert_((rs.dtypes == df.dtypes).all() == True)
+
+        for df in [ self.mixed_frame, self.mixed_float, self.mixed_int ]:
+
+            # other is a frame
+            cond = (df > 0)[1:]
+            _check_align(df, cond, _safe_add(df))
+
+            # check other is ndarray
+            cond = df > 0
+            _check_align(df, cond, (_safe_add(df).values))
+
+            # integers are upcast, so don't check the dtypes
+            cond = df > 0
+            check_dtypes = all([ not issubclass(s.type,np.integer) for s in df.dtypes ])
+            _check_align(df, cond, np.nan, check_dtypes = check_dtypes)
 
+        # invalid conditions
+        df = default_frame
         err1 = (df + 1).values[0:2, :]
         self.assertRaises(ValueError, df.where, cond, err1)
 
         err2 = cond.ix[:2, :].values
+        other1 = _safe_add(df)
         self.assertRaises(ValueError, df.where, err2, other1)
 
-        # invalid conditions
         self.assertRaises(ValueError, df.mask, True)
         self.assertRaises(ValueError, df.mask, 0)
 
         # where inplace
-        df = DataFrame(np.random.randn(5, 3))
+        def _check_set(df, cond, check_dtypes = True):
+            dfi = df.copy()
+            econd = cond.reindex_like(df).fillna(True)
+            expected = dfi.mask(~econd)
+            dfi.where(cond, np.nan, inplace=True)
+            assert_frame_equal(dfi, expected)
 
-        expected = df.mask(df < 0)
-        df.where(df >= 0, np.nan, inplace=True)
-        assert_frame_equal(df, expected)
+            # dtypes (and confirm upcasts)x
+            if check_dtypes:
+                for k, v in df.dtypes.iteritems():
+                    if issubclass(v.type,np.integer):
+                        v = np.dtype('float64')
+                    self.assert_(dfi[k].dtype == v)
+
+        for df in [ default_frame, self.mixed_frame, self.mixed_float, self.mixed_int ]:
+
+            cond = df > 0
+            _check_set(df, cond)
+
+            cond = df >= 0
+            _check_set(df, cond)
+
+            # aligining
+            cond = (df >= 0)[1:]
+            _check_set(df, cond)
+
+    def test_where_bug(self):
+
+        # GH 2793
+
+        df = DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [4.0, 3.0, 2.0, 1.0]}, dtype = 'float64')
+        expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [4.0, 3.0, np.nan, np.nan]}, dtype = 'float64')
+        result   = df.where(df > 2, np.nan)
+        assert_frame_equal(result, expected)
+
+        result = df.copy()
+        result.where(result > 2, np.nan, inplace=True)
+        assert_frame_equal(result, expected)
+
+        # mixed
+        for dtype in ['int16','int8','int32','int64']:
+            df = DataFrame({'a': np.array([1, 2, 3, 4],dtype=dtype), 'b': np.array([4.0, 3.0, 2.0, 1.0], dtype = 'float64') })
+            expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [4.0, 3.0, np.nan, np.nan]}, dtype = 'float64')
+            result   = df.where(df > 2, np.nan)
+            assert_frame_equal(result, expected)
+            
+            result = df.copy()
+            result.where(result > 2, np.nan, inplace=True)
+            assert_frame_equal(result, expected)
 
     def test_mask(self):
         df = DataFrame(np.random.randn(5, 3))
@@ -5568,6 +6084,13 @@ def test_diff(self):
         rs = DataFrame({'s': s}).diff()
         self.assertEqual(rs.s[1], 1)
 
+        # mixed numeric
+        tf = self.tsframe.astype('float32')
+        the_diff = tf.diff(1)
+        assert_series_equal(the_diff['A'],
+                            tf['A'] - tf['A'].shift(1))
+
+
     def test_diff_mixed_dtype(self):
         df = DataFrame(np.random.randn(5, 3))
         df['A'] = np.array([1, 2, 3, 4, 5], dtype=object)
@@ -5938,7 +6461,7 @@ def test_apply_convert_objects(self):
                           'F': np.random.randn(11)})
 
         result = data.apply(lambda x: x, axis=1)
-        assert_frame_equal(result, data)
+        assert_frame_equal(result.convert_objects(), data)
 
     def test_apply_attach_name(self):
         result = self.frame.apply(lambda x: x.name)
@@ -6484,11 +7007,16 @@ def test_get_X_columns(self):
                                     ['a', 'e']))
 
     def test_get_numeric_data(self):
-        df = DataFrame({'a': 1., 'b': 2, 'c': 'foo'},
+
+        #df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'd' : np.array(1.*10.,dtype='float32'), 'e' : np.array(1*10,dtype='int32')},
+        #               index=np.arange(10))
+        df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'd' : np.array([1.]*10,dtype='float32'), 
+                        'e' : np.array([1]*10,dtype='int32'),
+                        'f' : np.array([1]*10,dtype='int16')},
                        index=np.arange(10))
 
         result = df._get_numeric_data()
-        expected = df.ix[:, ['a', 'b']]
+        expected = df.ix[:, ['a', 'b','d','e','f']]
         assert_frame_equal(result, expected)
 
         only_obj = df.ix[:, ['c']]
@@ -6500,7 +7028,8 @@ def test_count(self):
         f = lambda s: notnull(s).sum()
         self._check_stat_op('count', f,
                             has_skipna=False,
-                            has_numeric_only=True)
+                            has_numeric_only=True,
+                            check_dtypes=False)
 
         # corner case
         frame = DataFrame()
@@ -6529,6 +7058,11 @@ def test_count(self):
     def test_sum(self):
         self._check_stat_op('sum', np.sum, has_numeric_only=True)
 
+    def test_sum_mixed_numeric(self):
+        raise nose.SkipTest
+        # mixed types
+        self._check_stat_op('sum', np.sum, frame = self.mixed_float, has_numeric_only=True)
+
     def test_stat_operators_attempt_obj_array(self):
         data = {
             'a': [-0.00049987540199591344, -0.0016467257772919831,
@@ -6679,7 +7213,7 @@ def alt(x):
         assert_series_equal(df.kurt(), df.kurt(level=0).xs('bar'))
 
     def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
-                       has_numeric_only=False):
+                       has_numeric_only=False, check_dtypes=True):
         if frame is None:
             frame = self.frame
             # set some NAs
@@ -6713,6 +7247,12 @@ def wrapper(x):
         assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
                             check_dtype=False)
 
+        # check dtypes
+        if check_dtypes:
+            lcd_dtype = frame.values.dtype
+            self.assert_(lcd_dtype == result0.dtype)
+            self.assert_(lcd_dtype == result1.dtype)
+
         # result = f(axis=1)
         # comp = frame.apply(alternative, axis=1).reindex(result.index)
         # assert_series_equal(result, comp)
@@ -6788,7 +7328,7 @@ def wrapper(x):
                 return np.nan
             return np.median(x)
 
-        self._check_stat_op('median', wrapper, frame=self.intframe)
+        self._check_stat_op('median', wrapper, frame=self.intframe, check_dtypes=False)
 
     def test_quantile(self):
         from pandas.compat.scipy import scoreatpercentile
@@ -6856,6 +7396,11 @@ def test_cumprod(self):
         df.cumprod(0)
         df.cumprod(1)
 
+        # ints32
+        df = self.tsframe.fillna(0).astype(np.int32)
+        df.cumprod(0)
+        df.cumprod(1)
+
     def test_rank(self):
         from pandas.compat.scipy import rankdata
 
@@ -7367,6 +7912,40 @@ def test_as_matrix_numeric_cols(self):
         values = self.frame.as_matrix(['A', 'B', 'C', 'D'])
         self.assert_(values.dtype == np.float64)
 
+    def test_as_matrix_lcd(self):
+
+        # mixed lcd
+        values = self.mixed_float.as_matrix(['A', 'B', 'C', 'D'])
+        self.assert_(values.dtype == np.float64)
+
+        values = self.mixed_float.as_matrix(['A', 'B', 'C' ])
+        self.assert_(values.dtype == np.float32)
+
+        values = self.mixed_float.as_matrix(['C'])
+        self.assert_(values.dtype == np.float16)
+
+        values = self.mixed_int.as_matrix(['A','B','C','D'])
+        self.assert_(values.dtype == np.uint64)
+
+        values = self.mixed_int.as_matrix(['A','D'])
+        self.assert_(values.dtype == np.int64)
+
+        # guess all ints are cast to uints....
+        values = self.mixed_int.as_matrix(['A','B','C'])
+        self.assert_(values.dtype == np.uint64)
+
+        values = self.mixed_int.as_matrix(['A','C'])
+        self.assert_(values.dtype == np.int32)
+
+        values = self.mixed_int.as_matrix(['C','D'])
+        self.assert_(values.dtype == np.int64)
+
+        values = self.mixed_int.as_matrix(['A'])
+        self.assert_(values.dtype == np.int32)
+
+        values = self.mixed_int.as_matrix(['C'])
+        self.assert_(values.dtype == np.uint8)
+
     def test_constructor_frame_copy(self):
         cop = DataFrame(self.frame, copy=True)
         cop['A'] = 5
@@ -7404,6 +7983,10 @@ def test_cast_internals(self):
         expected = DataFrame(self.frame._series, dtype=int)
         assert_frame_equal(casted, expected)
 
+        casted = DataFrame(self.frame._data, dtype=np.int32)
+        expected = DataFrame(self.frame._series, dtype=np.int32)
+        assert_frame_equal(casted, expected)
+
     def test_consolidate(self):
         self.frame['E'] = 7.
         consolidated = self.frame.consolidate()
@@ -7475,7 +8058,7 @@ def test_xs_view(self):
 
     def test_boolean_indexing(self):
         idx = range(3)
-        cols = range(3)
+        cols = ['A','B','C']
         df1 = DataFrame(index=idx, columns=cols,
                         data=np.array([[0.0, 0.5, 1.0],
                                        [1.5, 2.0, 2.5],
@@ -7512,15 +8095,29 @@ def test_take(self):
         # mixed-dtype
         #----------------------------------------
         order = [4, 1, 2, 0, 3]
+        for df in [self.mixed_frame]:
 
-        result = self.mixed_frame.take(order, axis=0)
-        expected = self.mixed_frame.reindex(self.mixed_frame.index.take(order))
-        assert_frame_equal(result, expected)
+            result = df.take(order, axis=0)
+            expected = df.reindex(df.index.take(order))
+            assert_frame_equal(result, expected)
+            
+            # axis = 1
+            result = df.take(order, axis=1)
+            expected = df.ix[:, ['foo', 'B', 'C', 'A', 'D']]
+            assert_frame_equal(result, expected)
 
-        # axis = 1
-        result = self.mixed_frame.take(order, axis=1)
-        expected = self.mixed_frame.ix[:, ['foo', 'B', 'C', 'A', 'D']]
-        assert_frame_equal(result, expected)
+        # by dtype
+        order = [1, 2, 0, 3]
+        for df in [self.mixed_float,self.mixed_int]:
+
+            result = df.take(order, axis=0)
+            expected = df.reindex(df.index.take(order))
+            assert_frame_equal(result, expected)
+            
+            # axis = 1
+            result = df.take(order, axis=1)
+            expected = df.ix[:, ['B', 'C', 'A', 'D']]
+            assert_frame_equal(result, expected)
 
     def test_iterkv_names(self):
         for k, v in self.mixed_frame.iterkv():
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 114697bc5c8cd..54d29263b2308 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -62,6 +62,13 @@ def setUp(self):
                              'C': np.random.randn(8),
                              'D': np.random.randn(8)})
 
+        self.df_mixed_floats = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                                                'foo', 'bar', 'foo', 'foo'],
+                                          'B': ['one', 'one', 'two', 'three',
+                                                'two', 'two', 'one', 'three'],
+                                          'C': np.random.randn(8),
+                                          'D': np.array(np.random.randn(8),dtype='float32')})
+
         index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                    ['one', 'two', 'three']],
                            labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
@@ -155,6 +162,25 @@ def test_first_last_nth(self):
         self.assert_(com.isnull(grouped['B'].last()['foo']))
         self.assert_(com.isnull(grouped['B'].nth(0)['foo']))
 
+    def test_first_last_nth_dtypes(self):
+        # tests for first / last / nth
+
+        grouped = self.df_mixed_floats.groupby('A')
+        first = grouped.first()
+        expected = self.df_mixed_floats.ix[[1, 0], ['B', 'C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(first, expected)
+
+        last = grouped.last()
+        expected = self.df_mixed_floats.ix[[5, 7], ['B', 'C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(last, expected)
+
+        nth = grouped.nth(1)
+        expected = self.df_mixed_floats.ix[[3, 2], ['B', 'C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(nth, expected)
+
     def test_grouper_iter(self):
         self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo'])
 
@@ -478,16 +504,30 @@ def test_transform_function_aliases(self):
 
     def test_with_na(self):
         index = Index(np.arange(10))
-        values = Series(np.ones(10), index)
-        labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
-                         'bar', nan, 'foo'], index=index)
 
-        grouped = values.groupby(labels)
-        agged = grouped.agg(len)
-        expected = Series([4, 2], index=['bar', 'foo'])
+        for dtype in ['float64','float32','int64','int32','int16','int8']:
+            values = Series(np.ones(10), index, dtype=dtype)
+            labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar',
+                             'bar', nan, 'foo'], index=index)
+
+
+            # this SHOULD be an int
+            grouped = values.groupby(labels)
+            agged = grouped.agg(len)
+            expected = Series([4, 2], index=['bar', 'foo'])
+            
+            assert_series_equal(agged, expected, check_dtype=False)
+            #self.assert_(issubclass(agged.dtype.type, np.integer))
 
-        assert_series_equal(agged, expected, check_dtype=False)
-        self.assert_(issubclass(agged.dtype.type, np.integer))
+            # explicity return a float from my function
+            def f(x):
+                return float(len(x))
+
+            agged = grouped.agg(f)
+            expected = Series([4, 2], index=['bar', 'foo'])
+            
+            assert_series_equal(agged, expected, check_dtype=False)
+            self.assert_(issubclass(agged.dtype.type, np.dtype(dtype).type))
 
     def test_attr_wrapper(self):
         grouped = self.ts.groupby(lambda x: x.weekday())
@@ -1596,6 +1636,7 @@ def test_series_grouper_noncontig_index(self):
         grouped.agg(f)
 
     def test_convert_objects_leave_decimal_alone(self):
+
         from decimal import Decimal
 
         s = Series(range(5))
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index 9deddb802d1bf..f39a6d3b3feec 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -20,20 +20,20 @@ def assert_block_equal(left, right):
     assert(left.ref_items.equals(right.ref_items))
 
 
-def get_float_mat(n, k):
-    return np.repeat(np.atleast_2d(np.arange(k, dtype=float)), n, axis=0)
+def get_float_mat(n, k, dtype):
+    return np.repeat(np.atleast_2d(np.arange(k, dtype=dtype)), n, axis=0)
 
 TEST_COLS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
 N = 10
 
 
-def get_float_ex(cols=['a', 'c', 'e']):
-    floats = get_float_mat(N, len(cols)).T
+def get_float_ex(cols=['a', 'c', 'e'], dtype = np.float_):
+    floats = get_float_mat(N, len(cols), dtype = dtype).T
     return make_block(floats, cols, TEST_COLS)
 
 
 def get_complex_ex(cols=['h']):
-    complexes = (get_float_mat(N, 1).T * 1j).astype(np.complex128)
+    complexes = (get_float_mat(N, 1, dtype = np.float_).T * 1j).astype(np.complex128)
     return make_block(complexes, cols, TEST_COLS)
 
 
@@ -49,13 +49,8 @@ def get_bool_ex(cols=['f']):
     return make_block(mat.T, cols, TEST_COLS)
 
 
-def get_int_ex(cols=['g']):
-    mat = randn(N, 1).astype(int)
-    return make_block(mat.T, cols, TEST_COLS)
-
-
-def get_int32_ex(cols):
-    mat = randn(N, 1).astype(np.int32)
+def get_int_ex(cols=['g'], dtype = np.int_):
+    mat = randn(N, 1).astype(dtype)
     return make_block(mat.T, cols, TEST_COLS)
 
 
@@ -63,6 +58,16 @@ def get_dt_ex(cols=['h']):
     mat = randn(N, 1).astype(int).astype('M8[ns]')
     return make_block(mat.T, cols, TEST_COLS)
 
+def create_blockmanager(blocks):
+    l = []
+    for b in blocks:
+        l.extend(b.items)
+    items = Index(l)
+    for b in blocks:
+        b.ref_items = items
+
+    index_sz = blocks[0].values.shape[1]
+    return BlockManager(blocks, [items, np.arange(index_sz)])
 
 class TestBlock(unittest.TestCase):
 
@@ -76,8 +81,8 @@ def setUp(self):
         self.int_block = get_int_ex()
 
     def test_constructor(self):
-        int32block = get_int32_ex(['a'])
-        self.assert_(int32block.dtype == np.int64)
+        int32block = get_int_ex(['a'],dtype = np.int32)
+        self.assert_(int32block.dtype == np.int32)
 
     def test_pickle(self):
         import pickle
@@ -235,12 +240,7 @@ def test_attrs(self):
     def test_is_mixed_dtype(self):
         self.assert_(self.mgr.is_mixed_dtype())
 
-        items = Index(['a', 'b'])
-        blocks = [get_bool_ex(['a']), get_bool_ex(['b'])]
-        for b in blocks:
-            b.ref_items = items
-
-        mgr = BlockManager(blocks, [items, np.arange(N)])
+        mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])])
         self.assert_(not mgr.is_mixed_dtype())
 
     def test_is_indexed_like(self):
@@ -254,9 +254,12 @@ def test_block_id_vector_item_dtypes(self):
         assert_almost_equal(expected, result)
 
         result = self.mgr.item_dtypes
+
+        # as the platform may not exactly match this, pseudo match
         expected = ['float64', 'object', 'float64', 'object', 'float64',
                     'bool', 'int64', 'complex128']
-        self.assert_(np.array_equal(result, expected))
+        for e, r in zip(expected, result):
+            np.dtype(e).kind == np.dtype(r).kind
 
     def test_duplicate_item_failure(self):
         items = Index(['a', 'a'])
@@ -315,7 +318,7 @@ def test_set_change_dtype(self):
         self.assert_(mgr2.get('baz').dtype == np.object_)
 
         mgr2.set('quux', randn(N).astype(int))
-        self.assert_(mgr2.get('quux').dtype == np.int64)
+        self.assert_(mgr2.get('quux').dtype == np.int_)
 
         mgr2.set('quux', randn(N))
         self.assert_(mgr2.get('quux').dtype == np.float_)
@@ -326,36 +329,110 @@ def test_copy(self):
         for cp_blk, blk in zip(shallow.blocks, self.mgr.blocks):
             self.assert_(cp_blk.values is blk.values)
 
-    def test_as_matrix(self):
-        pass
+    def test_as_matrix_float(self):
+
+        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16), get_float_ex(['e'],np.float64)])
+        self.assert_(mgr.as_matrix().dtype == np.float64)
+
+        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16)])
+        self.assert_(mgr.as_matrix().dtype == np.float32)
 
     def test_as_matrix_int_bool(self):
-        items = Index(['a', 'b'])
 
-        blocks = [get_bool_ex(['a']), get_bool_ex(['b'])]
-        for b in blocks:
-            b.ref_items = items
-        index_sz = blocks[0].values.shape[1]
-        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
+        mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])])
         self.assert_(mgr.as_matrix().dtype == np.bool_)
 
-        blocks = [get_int_ex(['a']), get_int_ex(['b'])]
-        for b in blocks:
-            b.ref_items = items
-
-        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
+        mgr = create_blockmanager([get_int_ex(['a'],np.int64), get_int_ex(['b'],np.int64), get_int_ex(['c'],np.int32), get_int_ex(['d'],np.int16), get_int_ex(['e'],np.uint8) ])
         self.assert_(mgr.as_matrix().dtype == np.int64)
 
-    def test_as_matrix_datetime(self):
-        items = Index(['h', 'g'])
-        blocks = [get_dt_ex(['h']), get_dt_ex(['g'])]
-        for b in blocks:
-            b.ref_items = items
+        mgr = create_blockmanager([get_int_ex(['c'],np.int32), get_int_ex(['d'],np.int16), get_int_ex(['e'],np.uint8) ])
+        self.assert_(mgr.as_matrix().dtype == np.int32)
 
-        index_sz = blocks[0].values.shape[1]
-        mgr = BlockManager(blocks, [items, np.arange(index_sz)])
+    def test_as_matrix_datetime(self):
+        mgr = create_blockmanager([get_dt_ex(['h']), get_dt_ex(['g'])])
         self.assert_(mgr.as_matrix().dtype == 'M8[ns]')
 
+    def test_astype(self):
+
+        # coerce all
+        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16), get_float_ex(['e'],np.float64)])
+
+        for t in ['float16','float32','float64','int32','int64']:
+            tmgr = mgr.astype(t)
+            self.assert_(tmgr.as_matrix().dtype == np.dtype(t))
+
+        # mixed
+        mgr = create_blockmanager([get_obj_ex(['a','b']),get_bool_ex(['c']),get_dt_ex(['d']),get_float_ex(['e'],np.float32), get_float_ex(['f'],np.float16), get_float_ex(['g'],np.float64)])
+        for t in ['float16','float32','float64','int32','int64']:
+            tmgr = mgr.astype(t, raise_on_error = False).get_numeric_data()
+            self.assert_(tmgr.as_matrix().dtype == np.dtype(t))
+
+    def test_convert(self):
+        
+        def _compare(old_mgr, new_mgr):
+            """ compare the blocks, numeric compare ==, object don't """
+            old_blocks = set(old_mgr.blocks)
+            new_blocks = set(new_mgr.blocks)
+            self.assert_(len(old_blocks) == len(new_blocks))
+
+            # compare non-numeric
+            for b in old_blocks:
+                found = False
+                for nb in new_blocks:
+                    if (b.values == nb.values).all():
+                        found = True
+                        break
+                self.assert_(found == True)
+
+            for b in new_blocks:
+                found = False
+                for ob in old_blocks:
+                    if (b.values == ob.values).all():
+                        found = True
+                        break
+                self.assert_(found == True)
+
+        # noops
+        mgr = create_blockmanager([get_int_ex(['f']), get_float_ex(['g'])])
+        new_mgr = mgr.convert()
+        _compare(mgr,new_mgr)
+
+        mgr = create_blockmanager([get_obj_ex(['a','b']), get_int_ex(['f']), get_float_ex(['g'])])
+        new_mgr = mgr.convert()
+        _compare(mgr,new_mgr)
+
+        # there could atcually be multiple dtypes resulting
+        def _check(new_mgr,block_type, citems):
+            items = set()
+            for b in new_mgr.blocks:
+                if isinstance(b,block_type):
+                    for i in list(b.items):
+                        items.add(i)
+            self.assert_(items == set(citems))
+
+        # convert
+        mat = np.empty((N, 3), dtype=object)
+        mat[:, 0] = '1'
+        mat[:, 1] = '2.'
+        mat[:, 2] = 'foo'
+        b = make_block(mat.T, ['a','b','foo'], TEST_COLS)
+
+        mgr = create_blockmanager([b, get_int_ex(['f']), get_float_ex(['g'])])
+        new_mgr = mgr.convert(convert_numeric = True)
+
+        _check(new_mgr,FloatBlock,['b','g'])
+        _check(new_mgr,IntBlock,['a','f'])
+
+        mgr = create_blockmanager([b, get_int_ex(['f'],np.int32), get_bool_ex(['bool']), get_dt_ex(['dt']), 
+                                   get_int_ex(['i'],np.int64), get_float_ex(['g'],np.float64), get_float_ex(['h'],np.float16)])
+        new_mgr = mgr.convert(convert_numeric = True)
+
+        _check(new_mgr,FloatBlock,['b','g','h'])
+        _check(new_mgr,IntBlock,['a','f','i'])
+        _check(new_mgr,ObjectBlock,['foo'])
+        _check(new_mgr,BoolBlock,['bool'])
+        _check(new_mgr,DatetimeBlock,['dt'])
+
     def test_xs(self):
         pass
 
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index a4df141fefef9..87b820faa3dc8 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -616,12 +616,22 @@ def test_sortlevel(self):
         assert_frame_equal(rs, self.frame.sortlevel(0))
 
     def test_sortlevel_large_cardinality(self):
-        # #2684
+
+        # #2684 (int64)
+        index = MultiIndex.from_arrays([np.arange(4000)]*3)
+        df = DataFrame(np.random.randn(4000), index=index, dtype = np.int64)
+
+        # it works!
+        result = df.sortlevel(0)
+        self.assertTrue(result.index.lexsort_depth == 3)
+
+        # #2684 (int32)
         index = MultiIndex.from_arrays([np.arange(4000)]*3)
-        df = DataFrame(np.random.randn(4000), index=index)
+        df = DataFrame(np.random.randn(4000), index=index, dtype = np.int32)
 
         # it works!
         result = df.sortlevel(0)
+        self.assert_((result.dtypes.values == df.dtypes.values).all() == True)
         self.assertTrue(result.index.lexsort_depth == 3)
 
     def test_delevel_infer_dtype(self):
@@ -723,7 +733,7 @@ def test_count_level_corner(self):
         df = self.frame[:0]
         result = df.count(level=0)
         expected = DataFrame({}, index=s.index.levels[0],
-                             columns=df.columns).fillna(0).astype(int)
+                             columns=df.columns).fillna(0).astype(np.int64)
         assert_frame_equal(result, expected)
 
     def test_unstack(self):
@@ -734,6 +744,9 @@ def test_unstack(self):
         # test that ints work
         unstacked = self.ymd.astype(int).unstack()
 
+        # test that int32 work
+        unstacked = self.ymd.astype(np.int32).unstack()
+
     def test_unstack_multiple_no_empty_columns(self):
         index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0),
                                         (1, 'baz', 1), (1, 'qux', 1)])
diff --git a/pandas/tests/test_ndframe.py b/pandas/tests/test_ndframe.py
index e017bf07039d7..0c004884c5559 100644
--- a/pandas/tests/test_ndframe.py
+++ b/pandas/tests/test_ndframe.py
@@ -24,7 +24,10 @@ def test_ndim(self):
 
     def test_astype(self):
         casted = self.ndf.astype(int)
-        self.assert_(casted.values.dtype == np.int64)
+        self.assert_(casted.values.dtype == np.int_)
+
+        casted = self.ndf.astype(np.int32)
+        self.assert_(casted.values.dtype == np.int32)
 
 if __name__ == '__main__':
     import nose
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 235b3e153574c..07a02f18d8337 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -418,7 +418,7 @@ def test_setitem(self):
         # scalar
         self.panel['ItemG'] = 1
         self.panel['ItemE'] = True
-        self.assert_(self.panel['ItemG'].values.dtype == np.int64)
+        self.assert_(self.panel['ItemG'].values.dtype == np.int_)
         self.assert_(self.panel['ItemE'].values.dtype == np.bool_)
 
         # object dtype
@@ -782,6 +782,13 @@ def test_constructor_cast(self):
         assert_almost_equal(casted.values, exp_values)
         assert_almost_equal(casted2.values, exp_values)
 
+        casted = Panel(zero_filled._data, dtype=np.int32)
+        casted2 = Panel(zero_filled.values, dtype=np.int32)
+
+        exp_values = zero_filled.values.astype(np.int32)
+        assert_almost_equal(casted.values, exp_values)
+        assert_almost_equal(casted2.values, exp_values)
+
         # can't cast
         data = [[['foo', 'bar', 'baz']]]
         self.assertRaises(ValueError, Panel, data, dtype=float)
@@ -798,6 +805,30 @@ def test_constructor_observe_dtype(self):
                       minor_axis=range(3), dtype='O')
         self.assert_(panel.values.dtype == np.object_)
 
+    def test_constructor_dtypes(self):
+        # GH #797
+
+        def _check_dtype(panel, dtype):
+            for i in panel.items:
+                self.assert_(panel[i].values.dtype.name == dtype)
+
+        # only nan holding types allowed here
+        for dtype in ['float64','float32','object']:
+            panel = Panel(items=range(2),major_axis=range(10),minor_axis=range(5),dtype=dtype)
+            _check_dtype(panel,dtype)
+
+        for dtype in ['float64','float32','int64','int32','object']:
+            panel = Panel(np.array(np.random.randn(2,10,5),dtype=dtype),items=range(2),major_axis=range(10),minor_axis=range(5),dtype=dtype)
+            _check_dtype(panel,dtype)
+
+        for dtype in ['float64','float32','int64','int32','object']:
+            panel = Panel(np.array(np.random.randn(2,10,5),dtype='O'),items=range(2),major_axis=range(10),minor_axis=range(5),dtype=dtype)
+            _check_dtype(panel,dtype)
+
+        for dtype in ['float64','float32','int64','int32','object']:
+            panel = Panel(np.random.randn(2,10,5),items=range(2),major_axis=range(10),minor_axis=range(5),dtype=dtype)
+            _check_dtype(panel,dtype)
+
     def test_consolidate(self):
         self.assert_(self.panel._data.is_consolidated())
 
@@ -845,6 +876,11 @@ def test_ctor_dict(self):
                               for k, v in dcasted.iteritems()))
         assert_panel_equal(result, expected)
 
+        result = Panel(dcasted, dtype=np.int32)
+        expected = Panel(dict((k, v.astype(np.int32))
+                              for k, v in dcasted.iteritems()))
+        assert_panel_equal(result, expected)
+
     def test_constructor_dict_mixed(self):
         data = dict((k, v.values) for k, v in self.panel.iterkv())
         result = Panel(data)
diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py
index e0180f475ca45..87bfba7c55cce 100644
--- a/pandas/tests/test_panel4d.py
+++ b/pandas/tests/test_panel4d.py
@@ -358,7 +358,7 @@ def test_setitem(self):
         # scalar
         self.panel4d['lG'] = 1
         self.panel4d['lE'] = True
-        self.assert_(self.panel4d['lG'].values.dtype == np.int64)
+        self.assert_(self.panel4d['lG'].values.dtype == np.int_)
         self.assert_(self.panel4d['lE'].values.dtype == np.bool_)
 
         # object dtype
@@ -592,6 +592,13 @@ def test_constructor_cast(self):
         assert_almost_equal(casted.values, exp_values)
         assert_almost_equal(casted2.values, exp_values)
 
+        casted = Panel4D(zero_filled._data, dtype=np.int32)
+        casted2 = Panel4D(zero_filled.values, dtype=np.int32)
+
+        exp_values = zero_filled.values.astype(np.int32)
+        assert_almost_equal(casted.values, exp_values)
+        assert_almost_equal(casted2.values, exp_values)
+
         # can't cast
         data = [[['foo', 'bar', 'baz']]]
         self.assertRaises(ValueError, Panel, data, dtype=float)
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 896c7dc34901f..3bae492d5cb81 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -137,7 +137,7 @@ def test_multilevel_name_print(self):
                     "qux    one       7",
                     "       two       8",
                     "       three     9",
-                    "Name: sth"]
+                    "Name: sth, Dtype: int64"]
         expected = "\n".join(expected)
         self.assertEquals(repr(s), expected)
 
@@ -2705,6 +2705,64 @@ def test_apply_dont_convert_dtype(self):
         result = s.apply(f, convert_dtype=False)
         self.assert_(result.dtype == object)
 
+    def test_convert_objects(self):
+
+        s = Series([1., 2, 3],index=['a','b','c'])
+        result = s.convert_objects(convert_dates=False,convert_numeric=True)
+        assert_series_equal(s,result)
+
+        # force numeric conversion
+        r = s.copy().astype('O')
+        r['a'] = '1'
+        result = r.convert_objects(convert_dates=False,convert_numeric=True)
+        assert_series_equal(s,result)
+
+        r = s.copy().astype('O')
+        r['a'] = '1.'
+        result = r.convert_objects(convert_dates=False,convert_numeric=True)
+        assert_series_equal(s,result)
+
+        r = s.copy().astype('O')
+        r['a'] = 'garbled'
+        expected = s.copy()
+        expected['a'] = np.nan
+        result = r.convert_objects(convert_dates=False,convert_numeric=True)
+        assert_series_equal(expected,result)
+
+        # dates
+        s = Series([datetime(2001,1,1,0,0), datetime(2001,1,2,0,0), datetime(2001,1,3,0,0) ])
+        s2 = Series([datetime(2001,1,1,0,0), datetime(2001,1,2,0,0), datetime(2001,1,3,0,0), 'foo', 1.0, 1, Timestamp('20010104'), '20010105'],dtype='O')
+
+        result = s.convert_objects(convert_dates=True,convert_numeric=False)
+        expected = Series([Timestamp('20010101'),Timestamp('20010102'),Timestamp('20010103')],dtype='M8[ns]')
+        assert_series_equal(expected,result)
+        
+        result = s.convert_objects(convert_dates='coerce',convert_numeric=False)
+        assert_series_equal(expected,result)
+        result = s.convert_objects(convert_dates='coerce',convert_numeric=True)
+        assert_series_equal(expected,result)
+
+        expected = Series([Timestamp('20010101'),Timestamp('20010102'),Timestamp('20010103'),lib.NaT,lib.NaT,lib.NaT,Timestamp('20010104'),Timestamp('20010105')],dtype='M8[ns]')
+        result = s2.convert_objects(convert_dates='coerce',convert_numeric=False)
+        assert_series_equal(expected,result)
+        result = s2.convert_objects(convert_dates='coerce',convert_numeric=True)
+        assert_series_equal(expected,result)
+
+        # preserver all-nans (if convert_dates='coerce')
+        s = Series(['foo','bar',1,1.0],dtype='O')
+        result = s.convert_objects(convert_dates='coerce',convert_numeric=False)
+        assert_series_equal(result,s)
+
+        # preserver if non-object
+        s = Series([1],dtype='float32')
+        result = s.convert_objects(convert_dates='coerce',convert_numeric=False)
+        assert_series_equal(result,s)
+
+        #r = s.copy()
+        #r[0] = np.nan
+        #result = r.convert_objects(convert_dates=True,convert_numeric=False)
+        #self.assert_(result.dtype == 'M8[ns]')
+
     def test_apply_args(self):
         s = Series(['foo,bar'])
 
diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
index 7e5341fd5b311..eaeb3325685ec 100644
--- a/pandas/tests/test_tseries.py
+++ b/pandas/tests/test_tseries.py
@@ -458,7 +458,9 @@ def test_generate_bins(self):
                           values, [-3, -1], 'right')
 
     def test_group_bin_functions(self):
-        funcs = ['add', 'mean', 'prod', 'min', 'max', 'var']
+
+        dtypes = ['float32','float64']
+        funcs  = ['add', 'mean', 'prod', 'min', 'max', 'var']
 
         np_funcs = {
             'add': np.sum,
@@ -470,71 +472,82 @@ def test_group_bin_functions(self):
         }
 
         for fname in funcs:
-            args = [getattr(algos, 'group_%s' % fname),
-                    getattr(algos, 'group_%s_bin' % fname),
-                    np_funcs[fname]]
-            self._check_versions(*args)
-
-    def _check_versions(self, irr_func, bin_func, np_func):
-        obj = self.obj
+            for d in dtypes:
+                check_less_precise = False
+                if d == 'float32':
+                    check_less_precise = True
+                args = [getattr(algos, 'group_%s_%s' % (fname,d)),
+                        getattr(algos, 'group_%s_bin_%s' % (fname,d)),
+                        np_funcs[fname],
+                        d,
+                        check_less_precise]
+                self._check_versions(*args)
+
+    def _check_versions(self, irr_func, bin_func, np_func, dtype, check_less_precise):
+        obj = self.obj.astype(dtype)
 
         cts = np.zeros(3, dtype=np.int64)
-        exp = np.zeros((3, 1), np.float64)
+        exp = np.zeros((3, 1), dtype)
         irr_func(exp, cts, obj, self.labels)
 
         # bin-based version
         bins = np.array([3, 6], dtype=np.int64)
-        out = np.zeros((3, 1), np.float64)
+        out = np.zeros((3, 1), dtype)
         counts = np.zeros(len(out), dtype=np.int64)
         bin_func(out, counts, obj, bins)
 
-        assert_almost_equal(out, exp)
+        assert_almost_equal(out, exp, check_less_precise=check_less_precise)
 
         bins = np.array([3, 9, 10], dtype=np.int64)
-        out = np.zeros((3, 1), np.float64)
+        out = np.zeros((3, 1), dtype)
         counts = np.zeros(len(out), dtype=np.int64)
         bin_func(out, counts, obj, bins)
         exp = np.array([np_func(obj[:3]), np_func(obj[3:9]),
                         np_func(obj[9:])],
-                       dtype=np.float64)
-        assert_almost_equal(out.squeeze(), exp)
+                       dtype=dtype)
+        assert_almost_equal(out.squeeze(), exp, check_less_precise=check_less_precise)
 
         # duplicate bins
         bins = np.array([3, 6, 10, 10], dtype=np.int64)
-        out = np.zeros((4, 1), np.float64)
+        out = np.zeros((4, 1), dtype)
         counts = np.zeros(len(out), dtype=np.int64)
         bin_func(out, counts, obj, bins)
         exp = np.array([np_func(obj[:3]), np_func(obj[3:6]),
                         np_func(obj[6:10]), np.nan],
-                       dtype=np.float64)
-        assert_almost_equal(out.squeeze(), exp)
+                       dtype=dtype)
+        assert_almost_equal(out.squeeze(), exp, check_less_precise=check_less_precise)
 
 
 def test_group_ohlc():
-    obj = np.random.randn(20)
 
-    bins = np.array([6, 12], dtype=np.int64)
-    out = np.zeros((3, 4), np.float64)
-    counts = np.zeros(len(out), dtype=np.int64)
+    def _check(dtype):
+        obj = np.array(np.random.randn(20),dtype=dtype)
 
-    algos.group_ohlc(out, counts, obj[:, None], bins)
+        bins = np.array([6, 12], dtype=np.int64)
+        out = np.zeros((3, 4), dtype)
+        counts = np.zeros(len(out), dtype=np.int64)
+        
+        func = getattr(algos,'group_ohlc_%s' % dtype)
+        func(out, counts, obj[:, None], bins)
 
-    def _ohlc(group):
-        if isnull(group).all():
-            return np.repeat(nan, 4)
-        return [group[0], group.max(), group.min(), group[-1]]
+        def _ohlc(group):
+            if isnull(group).all():
+                return np.repeat(nan, 4)
+            return [group[0], group.max(), group.min(), group[-1]]
 
-    expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]),
-                         _ohlc(obj[12:])])
+        expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]),
+                             _ohlc(obj[12:])])
 
-    assert_almost_equal(out, expected)
-    assert_almost_equal(counts, [6, 6, 8])
+        assert_almost_equal(out, expected)
+        assert_almost_equal(counts, [6, 6, 8])
 
-    obj[:6] = nan
-    algos.group_ohlc(out, counts, obj[:, None], bins)
-    expected[0] = nan
-    assert_almost_equal(out, expected)
+        obj[:6] = nan
+        func(out, counts, obj[:, None], bins)
+        expected[0] = nan
+        assert_almost_equal(out, expected)
 
+    _check('float32')
+    _check('float64')
 
 def test_try_parse_dates():
     from dateutil.parser import parse
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index c058580ab0f45..3adfb38e6144b 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -2,6 +2,7 @@
 SQL-style merge routines
 """
 
+import itertools
 import numpy as np
 
 from pandas.core.categorical import Factor
@@ -658,7 +659,7 @@ def _prepare_blocks(self):
             join_blocks = unit.get_upcasted_blocks()
             type_map = {}
             for blk in join_blocks:
-                type_map.setdefault(type(blk), []).append(blk)
+                type_map.setdefault(blk.dtype, []).append(blk)
             blockmaps.append((unit, type_map))
 
         return blockmaps
@@ -985,7 +986,8 @@ def _prepare_blocks(self):
         blockmaps = []
         for data in reindexed_data:
             data = data.consolidate()
-            type_map = dict((type(blk), blk) for blk in data.blocks)
+
+            type_map = dict((blk.dtype, blk) for blk in data.blocks)
             blockmaps.append(type_map)
         return blockmaps, reindexed_data
 
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 47ab02d892c3f..8820d43975885 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -287,7 +287,7 @@ def test_join_index_mixed(self):
         df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
                         index=np.arange(10),
                         columns=['A', 'B', 'C', 'D'])
-        self.assert_(df1['B'].dtype == np.int64)
+        self.assert_(df1['B'].dtype == np.int)
         self.assert_(df1['D'].dtype == np.bool_)
 
         df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},
@@ -422,23 +422,27 @@ def test_join_hierarchical_mixed(self):
         self.assertTrue('b' in result)
 
     def test_join_float64_float32(self):
-        a = DataFrame(randn(10, 2), columns=['a', 'b'])
-        b = DataFrame(randn(10, 1), columns=['c']).astype(np.float32)
-        joined = a.join(b)
-        expected = a.join(b.astype('f8'))
-        assert_frame_equal(joined, expected)
 
-        joined = b.join(a)
-        assert_frame_equal(expected, joined.reindex(columns=['a', 'b', 'c']))
+        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype = np.float64)
+        b = DataFrame(randn(10, 1), columns=['c'], dtype = np.float32)
+        joined = a.join(b)
+        self.assert_(joined.dtypes['a'] == 'float64')
+        self.assert_(joined.dtypes['b'] == 'float64')
+        self.assert_(joined.dtypes['c'] == 'float32')
 
-        a = np.random.randint(0, 5, 100)
-        b = np.random.random(100).astype('Float64')
-        c = np.random.random(100).astype('Float32')
+        a = np.random.randint(0, 5, 100).astype('int64')
+        b = np.random.random(100).astype('float64')
+        c = np.random.random(100).astype('float32')
         df = DataFrame({'a': a, 'b': b, 'c': c})
-        xpdf = DataFrame({'a': a, 'b': b, 'c': c.astype('Float64')})
-        s = DataFrame(np.random.random(5).astype('f'), columns=['md'])
+        xpdf = DataFrame({'a': a, 'b': b, 'c': c })
+        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
         rs = df.merge(s, left_on='a', right_index=True)
-        xp = xpdf.merge(s.astype('f8'), left_on='a', right_index=True)
+        self.assert_(rs.dtypes['a'] == 'int64')
+        self.assert_(rs.dtypes['b'] == 'float64')
+        self.assert_(rs.dtypes['c'] == 'float32')
+        self.assert_(rs.dtypes['md'] == 'float32')
+
+        xp = xpdf.merge(s, left_on='a', right_index=True)
         assert_frame_equal(rs, xp)
 
     def test_join_many_non_unique_index(self):
@@ -591,7 +595,7 @@ def test_intelligently_handle_join_key(self):
                                                  np.nan, np.nan]),
                               'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
                              columns=['value', 'key', 'rvalue'])
-        assert_frame_equal(joined, expected)
+        assert_frame_equal(joined, expected, check_dtype=False)
 
         self.assert_(joined._data.is_consolidated())
 
@@ -801,7 +805,25 @@ def test_left_join_index_preserve_order(self):
 
         left = DataFrame({'k1': [0, 1, 2] * 8,
                           'k2': ['foo', 'bar'] * 12,
-                          'v': np.arange(24)})
+                          'v': np.array(np.arange(24),dtype=np.int64) })
+
+        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
+        right = DataFrame({'v2': [5, 7]}, index=index)
+
+        result = left.join(right, on=['k1', 'k2'])
+
+        expected = left.copy()
+        expected['v2'] = np.nan
+        expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
+        expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7
+
+        tm.assert_frame_equal(result, expected)
+
+        # test join with multi dtypes blocks
+        left = DataFrame({'k1': [0, 1, 2] * 8,
+                          'k2': ['foo', 'bar'] * 12,
+                          'k3' : np.array([0, 1, 2]*8, dtype=np.float32),
+                          'v': np.array(np.arange(24),dtype=np.int32) })
 
         index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
         right = DataFrame({'v2': [5, 7]}, index=index)
@@ -820,6 +842,33 @@ def test_left_join_index_preserve_order(self):
                        right_on=['k1', 'k2'], how='right')
         tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
 
+    def test_join_multi_dtypes(self):
+
+        # test with multi dtypes in the join index
+        def _test(dtype1,dtype2):
+            left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1),
+                              'k2': ['foo', 'bar'] * 12,
+                              'v': np.array(np.arange(24),dtype=np.int64) })
+            
+            index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
+            right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index)
+            
+            result = left.join(right, on=['k1', 'k2'])
+            
+            expected = left.copy()
+
+            if dtype2.kind == 'i':
+                dtype2 = np.dtype('float64')
+            expected['v2'] = np.array(np.nan,dtype=dtype2)
+            expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
+            expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7
+            
+            tm.assert_frame_equal(result, expected)
+
+        for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]:
+            for d2 in [np.int64,np.float64,np.float32,np.float16]:
+                _test(np.dtype(d1),np.dtype(d2))
+
     def test_left_merge_na_buglet(self):
         left = DataFrame({'id': list('abcde'), 'v1': randn(5),
                           'v2': randn(5), 'dummy': list('abcde'),
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 4d81119bd4a34..29b844d330af2 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -550,17 +550,20 @@ def test_resample_not_monotonic(self):
         assert_series_equal(result, exp)
 
     def test_resample_median_bug_1688(self):
-        df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0),
-                                      datetime(2012, 1, 1, 0, 5, 0)])
-
-        result = df.resample("T", how=lambda x: x.mean())
-        exp = df.asfreq('T')
-        tm.assert_frame_equal(result, exp)
-
-        result = df.resample("T", how="median")
-        exp = df.asfreq('T')
-        tm.assert_frame_equal(result, exp)
 
+        for dtype in ['int64','int32','float64','float32']:
+            df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0),
+                                          datetime(2012, 1, 1, 0, 5, 0)],
+                           dtype = dtype)
+
+            result = df.resample("T", how=lambda x: x.mean())
+            exp = df.asfreq('T')
+            tm.assert_frame_equal(result, exp)
+            
+            result = df.resample("T", how="median")
+            exp = df.asfreq('T')
+            tm.assert_frame_equal(result, exp)
+            
     def test_how_lambda_functions(self):
         ts = _simple_ts('1/1/2000', '4/1/2000')
 
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index aa12d6142d6d8..861a8aa9d3a95 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -567,7 +567,8 @@ def test_series_repr_nat(self):
         expected = ('0          1970-01-01 00:00:00\n'
                     '1   1970-01-01 00:00:00.000001\n'
                     '2   1970-01-01 00:00:00.000002\n'
-                    '3                          NaT')
+                    '3                          NaT\n'
+                    'Dtype: datetime64[ns]')
         self.assertEquals(result, expected)
 
     def test_fillna_nat(self):
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index bbbe090225b83..200ab632e094e 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -774,7 +774,7 @@ def datetime_to_datetime64(ndarray[object] values):
 
 
 def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
-                      format=None, utc=None):
+                      format=None, utc=None, coerce=False):
     cdef:
         Py_ssize_t i, n = len(values)
         object val
@@ -813,14 +813,16 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
                 _check_dts_bounds(iresult[i], &dts)
             elif util.is_datetime64_object(val):
                 iresult[i] = _get_datetime64_nanos(val)
-            elif util.is_integer_object(val):
+
+            # if we are coercing, dont' allow integers
+            elif util.is_integer_object(val) and not coerce:
                 iresult[i] = val
             else:
-                if len(val) == 0:
-                    iresult[i] = iNaT
-                    continue
-
                 try:
+                    if len(val) == 0:
+                       iresult[i] = iNaT
+                       continue
+
                     _string_to_dts(val, &dts)
                     iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns,
                                                                    &dts)
@@ -829,10 +831,19 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False,
                     try:
                         result[i] = parse(val, dayfirst=dayfirst)
                     except Exception:
+                        if coerce:
+                           iresult[i] = iNaT
+                           continue
                         raise TypeError
                     pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns,
                                                       &dts)
                     _check_dts_bounds(iresult[i], &dts)
+                except:
+                    if coerce:
+                        iresult[i] = iNaT
+                        continue
+                    raise
+
         return result
     except TypeError:
         oresult = np.empty(n, dtype=object)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index a9a6bab893ac1..702ae7d5c72ef 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -88,7 +88,7 @@ def isiterable(obj):
     return hasattr(obj, '__iter__')
 
 
-def assert_almost_equal(a, b):
+def assert_almost_equal(a, b, check_less_precise = False):
     if isinstance(a, dict) or isinstance(b, dict):
         return assert_dict_equal(a, b)
 
@@ -103,7 +103,7 @@ def assert_almost_equal(a, b):
             return True
         else:
             for i in xrange(len(a)):
-                assert_almost_equal(a[i], b[i])
+                assert_almost_equal(a[i], b[i], check_less_precise)
         return True
 
     err_msg = lambda a, b: 'expected %.5f but got %.5f' % (a, b)
@@ -112,16 +112,29 @@ def assert_almost_equal(a, b):
         np.testing.assert_(isnull(b))
         return
 
-    if isinstance(a, (bool, float, int)):
+    if isinstance(a, (bool, float, int, np.float32)):
+        decimal = 5
+
+        # deal with differing dtypes
+        if check_less_precise:
+            dtype_a = np.dtype(a)
+            dtype_b = np.dtype(b)
+            if dtype_a.kind == 'i' and dtype_b == 'i':
+                pass
+            if dtype_a.kind == 'f' and dtype_b == 'f':
+                if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4:
+                    decimal = 3
+
         if np.isinf(a):
             assert np.isinf(b), err_msg(a, b)
+
         # case for zero
         elif abs(a) < 1e-5:
             np.testing.assert_almost_equal(
-                a, b, decimal=5, err_msg=err_msg(a, b), verbose=False)
+                a, b, decimal=decimal, err_msg=err_msg(a, b), verbose=False)
         else:
             np.testing.assert_almost_equal(
-                1, a / b, decimal=5, err_msg=err_msg(a, b), verbose=False)
+                1, a / b, decimal=decimal, err_msg=err_msg(a, b), verbose=False)
     else:
         assert(a == b)
 
@@ -144,10 +157,11 @@ def assert_dict_equal(a, b, compare_keys=True):
 def assert_series_equal(left, right, check_dtype=True,
                         check_index_type=False,
                         check_index_freq=False,
-                        check_series_type=False):
+                        check_series_type=False,
+                        check_less_precise=False):
     if check_series_type:
         assert(type(left) == type(right))
-    assert_almost_equal(left.values, right.values)
+    assert_almost_equal(left.values, right.values, check_less_precise)
     if check_dtype:
         assert(left.dtype == right.dtype)
     assert(left.index.equals(right.index))
@@ -160,9 +174,11 @@ def assert_series_equal(left, right, check_dtype=True,
                getattr(right, 'freqstr', None))
 
 
-def assert_frame_equal(left, right, check_index_type=False,
+def assert_frame_equal(left, right, check_dtype=True, 
+                       check_index_type=False,
                        check_column_type=False,
-                       check_frame_type=False):
+                       check_frame_type=False,
+                       check_less_precise=False):
     if check_frame_type:
         assert(type(left) == type(right))
     assert(isinstance(left, DataFrame))
@@ -175,7 +191,10 @@ def assert_frame_equal(left, right, check_index_type=False,
         assert(col in right)
         lcol = left.icol(i)
         rcol = right.icol(i)
-        assert_series_equal(lcol, rcol)
+        assert_series_equal(lcol, rcol, 
+                            check_dtype=check_dtype,
+                            check_index_type=check_index_type,
+                            check_less_precise=check_less_precise)
 
     if check_index_type:
         assert(type(left.index) == type(right.index))
@@ -187,7 +206,9 @@ def assert_frame_equal(left, right, check_index_type=False,
         assert(left.columns.inferred_type == right.columns.inferred_type)
 
 
-def assert_panel_equal(left, right, check_panel_type=False):
+def assert_panel_equal(left, right, 
+                       check_panel_type=False,
+                       check_less_precise=False):
     if check_panel_type:
         assert(type(left) == type(right))
 
@@ -197,13 +218,14 @@ def assert_panel_equal(left, right, check_panel_type=False):
 
     for col, series in left.iterkv():
         assert(col in right)
-        assert_frame_equal(series, right[col])
+        assert_frame_equal(series, right[col], check_less_precise=check_less_precise)
 
     for col in right:
         assert(col in left)
 
 
-def assert_panel4d_equal(left, right):
+def assert_panel4d_equal(left, right,
+                         check_less_precise=False):
     assert(left.labels.equals(right.labels))
     assert(left.items.equals(right.items))
     assert(left.major_axis.equals(right.major_axis))
@@ -211,7 +233,7 @@ def assert_panel4d_equal(left, right):
 
     for col, series in left.iterkv():
         assert(col in right)
-        assert_panel_equal(series, right[col])
+        assert_panel_equal(series, right[col], check_less_precise=check_less_precise)
 
     for col in right:
         assert(col in left)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index 502752d9ec6a6..caa09c219a866 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -177,15 +177,24 @@ def f():
 data = Series(randn(len(labels)))
 data[::3] = np.nan
 data[1::3] = np.nan
+data2 = Series(randn(len(labels)),dtype='float32')
+data2[::3] = np.nan
+data2[1::3] = np.nan
 labels = labels.take(np.random.permutation(len(labels)))
 """
 
 groupby_first = Benchmark('data.groupby(labels).first()', setup,
                           start_date=datetime(2012, 5, 1))
 
+groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup,
+                          start_date=datetime(2013, 1, 1))
+
 groupby_last = Benchmark('data.groupby(labels).last()', setup,
                          start_date=datetime(2012, 5, 1))
 
+groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
+                         start_date=datetime(2013, 1, 1))
+
 
 #----------------------------------------------------------------------
 # groupby_indices replacement, chop up Series
diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py
index 2f675636ee928..acf8f6f043bad 100644
--- a/vb_suite/reindex.py
+++ b/vb_suite/reindex.py
@@ -56,6 +56,7 @@
 ts = Series(np.random.randn(len(rng)), index=rng)
 ts2 = ts[::2]
 ts3 = ts2.reindex(ts.index)
+ts4 = ts3.astype('float32')
 
 def pad():
     try:
@@ -81,9 +82,16 @@ def backfill():
                                name="reindex_fillna_pad",
                                start_date=datetime(2011, 3, 1))
 
+reindex_fillna_pad_float32 = Benchmark("ts4.fillna(method='pad')", setup,
+                                       name="reindex_fillna_pad_float32",
+                                       start_date=datetime(2013, 1, 1))
+
 reindex_fillna_backfill = Benchmark("ts3.fillna(method='backfill')", setup,
                                     name="reindex_fillna_backfill",
                                     start_date=datetime(2011, 3, 1))
+reindex_fillna_backfill_float32 = Benchmark("ts4.fillna(method='backfill')", setup,
+                                            name="reindex_fillna_backfill_float32",
+                                            start_date=datetime(2013, 1, 1))
 
 #----------------------------------------------------------------------
 # align on level