pandas-dev · jreback · Sep 29, 2017 · Sep 25, 2017 · Sep 25, 2017 · Sep 25, 2017
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -105,7 +105,7 @@ from tslibs.timezones cimport (
     is_utc, is_tzlocal, is_fixed_offset,
     treat_tz_as_dateutil, treat_tz_as_pytz,
     get_timezone, get_utcoffset, maybe_get_tz,
-    get_dst_info
+    get_dst_info, _infer_dst
     )
 
 
@@ -4003,48 +4003,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
             result_b[i] = v
 
     if infer_dst:
-        dst_hours = np.empty(n, dtype=np.int64)
-        dst_hours.fill(NPY_NAT)
-
-        # Get the ambiguous hours (given the above, these are the hours
-        # where result_a != result_b and neither of them are NAT)
-        both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT)
-        both_eq = result_a == result_b
-        trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq)))
-        if trans_idx.size == 1:
-            stamp = Timestamp(vals[trans_idx])
-            raise pytz.AmbiguousTimeError(
-                "Cannot infer dst time from %s as there "
-                "are no repeated times" % stamp)
-        # Split the array into contiguous chunks (where the difference between
-        # indices is 1).  These are effectively dst transitions in different
-        # years which is useful for checking that there is not an ambiguous
-        # transition in an individual year.
-        if trans_idx.size > 0:
-            one_diff = np.where(np.diff(trans_idx) != 1)[0] +1
-            trans_grp = np.array_split(trans_idx, one_diff)
-
-            # Iterate through each day, if there are no hours where the
-            # delta is negative (indicates a repeat of hour) the switch
-            # cannot be inferred
-            for grp in trans_grp:
-
-                delta = np.diff(result_a[grp])
-                if grp.size == 1 or np.all(delta > 0):
-                    stamp = Timestamp(vals[grp[0]])
-                    raise pytz.AmbiguousTimeError(stamp)
-
-                # Find the index for the switch and pull from a for dst and b
-                # for standard
-                switch_idx = (delta <= 0).nonzero()[0]
-                if switch_idx.size > 1:
-                    raise pytz.AmbiguousTimeError(
-                        "There are %i dst switches when "
-                        "there should only be 1." % switch_idx.size)
-                switch_idx = switch_idx[0] + 1 # Pull the only index and adjust
-                a_idx = grp[:switch_idx]
-                b_idx = grp[switch_idx:]
-                dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx]))
+        dst_hours = _infer_dst(vals, result_a, result_b)
 
     for i in range(n):
         left = result_a[i]

diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # cython: profile=False
 
-from numpy cimport ndarray
+from numpy cimport ndarray, int64_t
 
 cdef bint is_utc(object tz)
 cdef bint is_tzlocal(object tz)
@@ -16,3 +16,7 @@ cpdef get_utcoffset(tzinfo, obj)
 cdef bint is_fixed_offset(object tz)
 
 cdef object get_dst_info(object tz)
+
+cdef ndarray[int64_t] _infer_dst(ndarray[int64_t] vals,
+                                 ndarray[int64_t] result_a,
+                                 ndarray[int64_t] result_b)
diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx
@@ -275,3 +275,84 @@ cdef object get_dst_info(object tz):
         dst_cache[cache_key] = (trans, deltas, typ)
 
     return dst_cache[cache_key]
+
+
+def _infer_tzinfo(start, end):
+    """Return tzinfo of `start`, else `end`; AssertionError if they differ."""
+    def _checked_tz(primary, other):
+        # `other` is only compared when it is set and carries a tzinfo
+        tz = primary.tzinfo
+        if not (other and other.tzinfo):
+            return tz
+        if get_timezone(tz) == get_timezone(other.tzinfo):
+            return tz
+        raise AssertionError('Inputs must both have the same timezone,'
+                             ' {timezone1} != {timezone2}'
+                             .format(timezone1=tz, timezone2=other.tzinfo))
+
+    if start is not None:
+        return _checked_tz(start, end)
+    return _checked_tz(end, start) if end is not None else None
+
+
+cdef ndarray[int64_t] _infer_dst(ndarray[int64_t] vals,
+                                 ndarray[int64_t] result_a,
+                                 ndarray[int64_t] result_b):
+    """
+    Choose between `result_a` and `result_b` for ambiguous (dst) entries.
+
+    Where `result_a` and `result_b` differ and neither is NPY_NAT, each run
+    of consecutive indices takes `result_a` before the single non-positive
+    step of `result_a` and `result_b` from it onward; every other entry of
+    the returned array is NPY_NAT.  pytz.AmbiguousTimeError is raised for a
+    run with one entry, no such step, or several.  `vals` gives the output
+    length and the stamps quoted in errors.
+    """
+    cdef:
+        Py_ssize_t n = len(vals)
+        ndarray[int64_t] dst_hours
+
+    dst_hours = np.empty(n, dtype=np.int64)
+    dst_hours.fill(NPY_NAT)
+
+    # Get the ambiguous hours: positions where result_a != result_b and
+    # neither of them is NAT (`both_nat` is True where *neither* is NAT)
+    both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT)
+    both_eq = result_a == result_b
+    trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq)))
+    if trans_idx.size == 1:
+        stamp = np.int64(vals[trans_idx]).astype('datetime64[ns]')
+        # e.g. '2017-08-30T07:59:23.123456789' -> '2017-08-30 07:59:23.123456'
+        stamp = str(stamp).replace('T', ' ')[:-3]
+        raise pytz.AmbiguousTimeError(
+            "Cannot infer dst time from %s as there "
+            "are no repeated times" % stamp)
+
+    # Split into contiguous chunks (index difference of 1).  These are
+    # effectively dst transitions in different years, checked one by one.
+    if trans_idx.size > 0:
+        one_diff = np.where(np.diff(trans_idx) != 1)[0] +1
+        trans_grp = np.array_split(trans_idx, one_diff)
+
+        # A group with no negative delta (no repeated hour) cannot be inferred
+        for grp in trans_grp:
+            delta = np.diff(result_a[grp])
+            if grp.size == 1 or np.all(delta > 0):
+                stamp = np.int64(vals[grp[0]]).astype('datetime64[ns]')
+                # same rendering as above: 'T' -> ' ', drop the last 3 digits
+                stamp = str(stamp).replace('T', ' ')[:-3]
+                raise pytz.AmbiguousTimeError(stamp)
+
+            # Find the switch index; pull from a for dst and b for standard
+            switch_idx = (delta <= 0).nonzero()[0]
+            if switch_idx.size > 1:
+                raise pytz.AmbiguousTimeError(
+                    "There are %i dst switches when "
+                    "there should only be 1." % switch_idx.size)
+
+            switch_idx = switch_idx[0] + 1 # Pull the only index and adjust
+            a_idx = grp[:switch_idx]
+            b_idx = grp[switch_idx:]
+            dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx]))
+
+    return dst_hours
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -4,7 +4,7 @@
 
 from pandas._libs import lib, tslib
 from pandas._libs.tslibs.strptime import array_strptime
-from pandas._libs.tslibs.timezones import get_timezone
+from pandas._libs.tslibs.timezones import get_timezone, _infer_tzinfo  # noqa
 
 from pandas.core.dtypes.common import (
     _ensure_object,
@@ -42,22 +42,6 @@ def _lexer_split_from_str(dt_str):
     pass
 
 
-def _infer_tzinfo(start, end):
-    def _infer(a, b):
-        tz = a.tzinfo
-        if b and b.tzinfo:
-            if not (get_timezone(tz) == get_timezone(b.tzinfo)):
-                raise AssertionError('Inputs must both have the same timezone,'
-                                     ' {timezone1} != {timezone2}'
-                                     .format(timezone1=tz, timezone2=b.tzinfo))
-        return tz
-
-    tz = None
-    if start is not None:
-        tz = _infer(start, end)
-    elif end is not None:
-        tz = _infer(end, start)
-    return tz
 
 
 def _guess_datetime_format(dt_str, dayfirst=False,