Skip to content

Commit

Permalink
Add asvs, modify tests for caches
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Sep 25, 2017
1 parent 2a1a064 commit 89ce37f
Show file tree
Hide file tree
Showing 3 changed files with 308 additions and 241 deletions.
53 changes: 19 additions & 34 deletions asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,27 +346,21 @@ class ToDatetime(object):

def setup(self):
    """Build the fixtures used by the to_datetime benchmarks.

    The diff left both the pre- and post-change assignments in place
    (``stringsD``/``strings`` were each assigned twice, and the old
    ``dup_*_10_{3,5,7}`` fixtures are dead); only the final state is kept.
    """
    # Daily range -> compact YYYYMMDD strings via vectorized strftime.
    self.rng = date_range(start='1/1/2000', periods=10000, freq='D')
    self.stringsD = Series(self.rng.strftime('%Y%m%d'))

    # Hourly range -> ISO-ish strings with and without date separators.
    self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
    self.strings = self.rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
    self.strings_nosep = self.rng.strftime('%Y%m%d %H:%M:%S').tolist()
    self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
                             for x in self.rng]

    # Mixed-format strings for the exact/non-exact format benchmarks.
    self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000))
    self.s2 = self.s.str.replace(':\\S+$', '')

    # Inputs for the to_datetime cache benchmarks: one all-unique input
    # and several fully-duplicated inputs of 10k elements each.
    self.unique_numeric_seconds = range(10000)
    self.dup_numeric_seconds = [1000] * 10000
    self.dup_string_dates = ['2000-02-11'] * 10000
    self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * 10000

def time_format_YYYYMMDD(self):
    """Benchmark parsing compact YYYYMMDD strings with an explicit format."""
    fmt = '%Y%m%d'
    to_datetime(self.stringsD, format=fmt)
Expand All @@ -392,32 +386,23 @@ def time_format_exact(self):
def time_format_no_exact(self):
    """Benchmark non-exact format matching on mixed-length date strings."""
    to_datetime(self.s, exact=False, format='%d%b%y')

def time_cache_with_unique_seconds_and_unit(self):
    """Benchmark to_datetime caching on an all-unique integer-seconds input.

    NOTE: the original added line spelled the name with a space
    (``..._and unit``), which is a SyntaxError; fixed here.
    """
    to_datetime(self.unique_numeric_seconds, unit='s')

def time_cache_with_dup_seconds_and_unit(self):
    """Benchmark to_datetime caching on fully-duplicated integer seconds."""
    to_datetime(self.dup_numeric_seconds, unit='s')

def time_cache_with_dup_string_dates(self):
    """Benchmark to_datetime caching on fully-duplicated date strings."""
    to_datetime(self.dup_string_dates)

def time_cache_with_dup_string_dates_and_format(self):
    """Benchmark caching on duplicated date strings with an explicit format."""
    to_datetime(self.dup_string_dates, format='%Y-%m-%d')

def time_cache_with_dup_string_tzoffset_dates(self):
    """Benchmark caching on duplicated strings carrying a UTC offset."""
    to_datetime(self.dup_string_with_tz)

def time_cache_with_dup_string_tzoffset_dates_and_format(self):
    """Benchmark caching on duplicated tz-offset strings with a format.

    The original called ``to_datetim`` (NameError typo); fixed to
    ``to_datetime``.
    """
    to_datetime(self.dup_string_with_tz, format='%Y-%m-%d %H:%M:%S%z')


class Offsets(object):
Expand Down
21 changes: 7 additions & 14 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
origin.
.. versionadded: 0.20.0
cache_datetime : boolean, default False
cache : boolean, default False
If True, use a cache of unique, converted dates to apply the datetime
conversion. Produces significant speed-ups when parsing duplicate dates.
Expand Down Expand Up @@ -355,7 +355,6 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,

def _convert_listlike(arg, box, format, name=None, tz=tz):

import pdb; pdb.set_trace()
if isinstance(arg, (list, tuple)):
arg = np.array(arg, dtype='O')

Expand Down Expand Up @@ -523,18 +522,12 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):

convert_cache = None
if cache and is_list_like(arg):
# Create a cache only if there are more than 10k values and the user
# passes in datestrings
#min_cache_threshold = 10**5
#if len(arg) >= min_cache_threshold and is_string_dtype(arg):
# unique currently cannot determine dates that are out of bounds
# recursion errors with datetime
unique_dates = algorithms.unique(arg)
# Essentially they need to all be the same value
if len(unique_dates) != len(arg):
from pandas import Series
cache_data = _convert_listlike(unique_dates, False, format)
convert_cache = Series(cache_data, index=unique_dates)
if len(arg) >= 1000:
unique_dates = algorithms.unique(arg)
if len(unique_dates) != len(arg):
from pandas import Series
cache_dates = _convert_listlike(unique_dates, False, format)
convert_cache = Series(cache_dates, index=unique_dates)

if isinstance(arg, tslib.Timestamp):
result = arg
Expand Down
Loading

0 comments on commit 89ce37f

Please sign in to comment.