From 86e136042d169d509247b14d0440f2ad227a65aa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Jul 2018 03:12:03 -0700 Subject: [PATCH] Move FrequencyInferer out of libresolution (#21992) --- pandas/_libs/algos.pyx | 41 ++++ pandas/_libs/tslibs/resolution.pyx | 289 +---------------------------- pandas/tseries/frequencies.py | 265 +++++++++++++++++++++++++- 3 files changed, 304 insertions(+), 291 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a418e54e4da9b..ecfc7355dddfc 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -22,6 +22,11 @@ cnp.import_array() cimport util from util cimport numeric, get_nat +from khash cimport (khiter_t, + kh_destroy_int64, kh_put_int64, + kh_init_int64, kh_int64_t, + kh_resize_int64, kh_get_int64) + import missing cdef float64_t FP_ERR = 1e-13 @@ -71,6 +76,42 @@ class NegInfinity(object): __ge__ = lambda self, other: isinstance(other, NegInfinity) +cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr): + """ + Efficiently find the unique first-differences of the given array. + + Parameters + ---------- + arr : ndarray[in64_t] + + Returns + ------- + result : ndarray[int64_t] + result is sorted + """ + cdef: + Py_ssize_t i, n = len(arr) + int64_t val + khiter_t k + kh_int64_t *table + int ret = 0 + list uniques = [] + + table = kh_init_int64() + kh_resize_int64(table, 10) + for i in range(n - 1): + val = arr[i + 1] - arr[i] + k = kh_get_int64(table, val) + if k == table.n_buckets: + kh_put_int64(table, val, &ret) + uniques.append(val) + kh_destroy_int64(table) + + result = np.array(uniques, dtype=np.int64) + result.sort() + return result + + @cython.wraparound(False) @cython.boundscheck(False) def is_lexsorted(list list_of_arrays): diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 0835a43411783..4b90c669eebba 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # cython: profile=False +cimport cython from cython cimport Py_ssize_t import numpy as np @@ -10,23 +11,12 @@ cnp.import_array() from util cimport is_string_object, get_nat -from pandas._libs.khash cimport (khiter_t, - kh_destroy_int64, kh_put_int64, - kh_init_int64, kh_int64_t, - kh_resize_int64, kh_get_int64) - from np_datetime cimport npy_datetimestruct, dt64_to_dtstruct from frequencies cimport get_freq_code from timezones cimport (is_utc, is_tzlocal, maybe_get_tz, get_dst_info) -from fields import build_field_sarray -from conversion import tz_convert from conversion cimport tz_convert_utc_to_tzlocal -from ccalendar import MONTH_ALIASES, int_to_weekday from ccalendar cimport get_days_in_month -from timestamps import Timestamp - -from pandas._libs.properties import cache_readonly # ---------------------------------------------------------------------- # Constants @@ -41,13 +31,6 @@ cdef int RESO_MIN = 4 cdef int RESO_HR = 5 cdef int RESO_DAY = 6 -_ONE_MICRO = 1000L -_ONE_MILLI = (_ONE_MICRO * 1000) -_ONE_SECOND = (_ONE_MILLI * 1000) -_ONE_MINUTE = (60 * _ONE_SECOND) -_ONE_HOUR = (60 * _ONE_MINUTE) -_ONE_DAY = (24 * _ONE_HOUR) - # ---------------------------------------------------------------------- cpdef resolution(ndarray[int64_t] stamps, tz=None): @@ -331,31 +314,7 @@ class Resolution(object): # ---------------------------------------------------------------------- # Frequency Inference -cdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr): - cdef: - Py_ssize_t i, n = len(arr) - int64_t val - khiter_t k - kh_int64_t *table - int ret = 0 - list uniques = [] - - table = kh_init_int64() - kh_resize_int64(table, 10) - for i in range(n - 1): - val = arr[i + 1] - arr[i] - k = kh_get_int64(table, val) - if k == table.n_buckets: - kh_put_int64(table, val, &ret) - uniques.append(val) - kh_destroy_int64(table) - - result = np.array(uniques, dtype=np.int64) - result.sort() - return result - - -cdef object month_position_check(fields, weekdays): +def month_position_check(fields, weekdays): cdef: int32_t daysinmonth, y, m, d bint calendar_end = True @@ -397,247 +356,3 @@ cdef object month_position_check(fields, weekdays): return 'bs' else: return None - - -cdef inline bint _is_multiple(int64_t us, int64_t mult): - return us % mult == 0 - - -cdef inline str _maybe_add_count(str base, int64_t count): - if count != 1: - return '{count}{base}'.format(count=count, base=base) - else: - return base - - -cdef class _FrequencyInferer(object): - """ - Not sure if I can avoid the state machine here - """ - cdef public: - object index - object values - bint warn - bint is_monotonic - dict _cache - - def __init__(self, index, warn=True): - self.index = index - self.values = np.asarray(index).view('i8') - - # This moves the values, which are implicitly in UTC, to the - # the timezone so they are in local time - if hasattr(index, 'tz'): - if index.tz is not None: - self.values = tz_convert(self.values, 'UTC', index.tz) - - self.warn = warn - - if len(index) < 3: - raise ValueError('Need at least 3 dates to infer frequency') - - self.is_monotonic = (self.index.is_monotonic_increasing or - self.index.is_monotonic_decreasing) - - @cache_readonly - def deltas(self): - return unique_deltas(self.values) - - @cache_readonly - def deltas_asi8(self): - return unique_deltas(self.index.asi8) - - @cache_readonly - def is_unique(self): - return len(self.deltas) == 1 - - @cache_readonly - def is_unique_asi8(self): - return len(self.deltas_asi8) == 1 - - def get_freq(self): - if not self.is_monotonic or not self.index.is_unique: - return None - - delta = self.deltas[0] - if _is_multiple(delta, _ONE_DAY): - return self._infer_daily_rule() - else: - # Business hourly, maybe. 17: one day / 65: one weekend - if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): - return 'BH' - # Possibly intraday frequency. Here we use the - # original .asi8 values as the modified values - # will not work around DST transitions. See #8772 - elif not self.is_unique_asi8: - return None - delta = self.deltas_asi8[0] - if _is_multiple(delta, _ONE_HOUR): - # Hours - return _maybe_add_count('H', delta / _ONE_HOUR) - elif _is_multiple(delta, _ONE_MINUTE): - # Minutes - return _maybe_add_count('T', delta / _ONE_MINUTE) - elif _is_multiple(delta, _ONE_SECOND): - # Seconds - return _maybe_add_count('S', delta / _ONE_SECOND) - elif _is_multiple(delta, _ONE_MILLI): - # Milliseconds - return _maybe_add_count('L', delta / _ONE_MILLI) - elif _is_multiple(delta, _ONE_MICRO): - # Microseconds - return _maybe_add_count('U', delta / _ONE_MICRO) - else: - # Nanoseconds - return _maybe_add_count('N', delta) - - @cache_readonly - def day_deltas(self): - return [x / _ONE_DAY for x in self.deltas] - - @cache_readonly - def hour_deltas(self): - return [x / _ONE_HOUR for x in self.deltas] - - @cache_readonly - def fields(self): - return build_field_sarray(self.values) - - @cache_readonly - def rep_stamp(self): - return Timestamp(self.values[0]) - - cdef object month_position_check(self): - return month_position_check(self.fields, self.index.dayofweek) - - @cache_readonly - def mdiffs(self): - nmonths = self.fields['Y'] * 12 + self.fields['M'] - return unique_deltas(nmonths.astype('i8')) - - @cache_readonly - def ydiffs(self): - return unique_deltas(self.fields['Y'].astype('i8')) - - cdef _infer_daily_rule(self): - annual_rule = self._get_annual_rule() - if annual_rule: - nyears = self.ydiffs[0] - month = MONTH_ALIASES[self.rep_stamp.month] - alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month) - return _maybe_add_count(alias, nyears) - - quarterly_rule = self._get_quarterly_rule() - if quarterly_rule: - nquarters = self.mdiffs[0] / 3 - mod_dict = {0: 12, 2: 11, 1: 10} - month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]] - alias = '{prefix}-{month}'.format(prefix=quarterly_rule, - month=month) - return _maybe_add_count(alias, nquarters) - - monthly_rule = self._get_monthly_rule() - if monthly_rule: - return _maybe_add_count(monthly_rule, self.mdiffs[0]) - - if self.is_unique: - days = self.deltas[0] / _ONE_DAY - if days % 7 == 0: - # Weekly - day = int_to_weekday[self.rep_stamp.weekday()] - return _maybe_add_count('W-{day}'.format(day=day), days / 7) - else: - return _maybe_add_count('D', days) - - if self._is_business_daily(): - return 'B' - - wom_rule = self._get_wom_rule() - if wom_rule: - return wom_rule - - cdef _get_annual_rule(self): - if len(self.ydiffs) > 1: - return None - - # lazy import to prevent circularity - # TODO: Avoid non-cython dependency - from pandas.core.algorithms import unique - - if len(unique(self.fields['M'])) > 1: - return None - - pos_check = self.month_position_check() - return {'cs': 'AS', 'bs': 'BAS', - 'ce': 'A', 'be': 'BA'}.get(pos_check) - - cdef _get_quarterly_rule(self): - if len(self.mdiffs) > 1: - return None - - if not self.mdiffs[0] % 3 == 0: - return None - - pos_check = self.month_position_check() - return {'cs': 'QS', 'bs': 'BQS', - 'ce': 'Q', 'be': 'BQ'}.get(pos_check) - - cdef _get_monthly_rule(self): - if len(self.mdiffs) > 1: - return None - pos_check = self.month_position_check() - return {'cs': 'MS', 'bs': 'BMS', - 'ce': 'M', 'be': 'BM'}.get(pos_check) - - cdef bint _is_business_daily(self): - # quick check: cannot be business daily - if self.day_deltas != [1, 3]: - return False - - # probably business daily, but need to confirm - first_weekday = self.index[0].weekday() - shifts = np.diff(self.index.asi8) - shifts = np.floor_divide(shifts, _ONE_DAY) - weekdays = np.mod(first_weekday + np.cumsum(shifts), 7) - return np.all(((weekdays == 0) & (shifts == 3)) | - ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))) - - cdef _get_wom_rule(self): - # wdiffs = unique(np.diff(self.index.week)) - # We also need -47, -49, -48 to catch index spanning year boundary - # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): - # return None - - # lazy import to prevent circularity - # TODO: Avoid non-cython dependency - from pandas.core.algorithms import unique - - weekdays = unique(self.index.weekday) - if len(weekdays) > 1: - return None - - week_of_months = unique((self.index.day - 1) // 7) - # Only attempt to infer up to WOM-4. See #9425 - week_of_months = week_of_months[week_of_months < 4] - if len(week_of_months) == 0 or len(week_of_months) > 1: - return None - - # get which week - week = week_of_months[0] + 1 - wd = int_to_weekday[weekdays[0]] - - return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) - - -cdef class _TimedeltaFrequencyInferer(_FrequencyInferer): - - cdef _infer_daily_rule(self): - if self.is_unique: - days = self.deltas[0] / _ONE_DAY - if days % 7 == 0: - # Weekly - wd = int_to_weekday[self.rep_stamp.weekday()] - alias = 'W-{weekday}'.format(weekday=wd) - return _maybe_add_count(alias, days / 7) - else: - return _maybe_add_count('D', days) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 59cd4743f857b..d6e4824575468 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -6,25 +6,32 @@ import numpy as np +from pandas.util._decorators import cache_readonly + from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.common import ( is_period_arraylike, is_timedelta64_dtype, is_datetime64_dtype) +from pandas.core.algorithms import unique + from pandas.tseries.offsets import DateOffset -from pandas._libs.tslibs import Timedelta +from pandas._libs.tslibs import Timedelta, Timestamp import pandas._libs.tslibs.frequencies as libfreqs from pandas._libs.tslibs.frequencies import ( # noqa, semi-public API get_freq, get_base_alias, get_to_timestamp_base, get_freq_code, FreqGroup, is_subperiod, is_superperiod) +from pandas._libs.tslibs.ccalendar import MONTH_ALIASES, int_to_weekday +import pandas._libs.tslibs.resolution as libresolution +from pandas._libs.tslibs.resolution import Resolution +from pandas._libs.tslibs.fields import build_field_sarray +from pandas._libs.tslibs.conversion import tz_convert -from pandas._libs.tslibs.resolution import (Resolution, - _FrequencyInferer, - _TimedeltaFrequencyInferer) +from pandas._libs.algos import unique_deltas from pytz import AmbiguousTimeError @@ -37,6 +44,13 @@ RESO_HR = 5 RESO_DAY = 6 +_ONE_MICRO = 1000 +_ONE_MILLI = (_ONE_MICRO * 1000) +_ONE_SECOND = (_ONE_MILLI * 1000) +_ONE_MINUTE = (60 * _ONE_SECOND) +_ONE_HOUR = (60 * _ONE_MINUTE) +_ONE_DAY = (24 * _ONE_HOUR) + # --------------------------------------------------------------------- # Offset names ("time rules") and related functions @@ -269,3 +283,246 @@ def infer_freq(index, warn=True): inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() + + +class _FrequencyInferer(object): + """ + Not sure if I can avoid the state machine here + """ + + def __init__(self, index, warn=True): + self.index = index + self.values = np.asarray(index).view('i8') + + # This moves the values, which are implicitly in UTC, to the + # the timezone so they are in local time + if hasattr(index, 'tz'): + if index.tz is not None: + self.values = tz_convert(self.values, 'UTC', index.tz) + + self.warn = warn + + if len(index) < 3: + raise ValueError('Need at least 3 dates to infer frequency') + + self.is_monotonic = (self.index.is_monotonic_increasing or + self.index.is_monotonic_decreasing) + + @cache_readonly + def deltas(self): + return unique_deltas(self.values) + + @cache_readonly + def deltas_asi8(self): + return unique_deltas(self.index.asi8) + + @cache_readonly + def is_unique(self): + return len(self.deltas) == 1 + + @cache_readonly + def is_unique_asi8(self): + return len(self.deltas_asi8) == 1 + + def get_freq(self): # noqa:F811 + """ + Find the appropriate frequency string to describe the inferred + frequency of self.values + + Returns + ------- + freqstr : str or None + """ + if not self.is_monotonic or not self.index.is_unique: + return None + + delta = self.deltas[0] + if _is_multiple(delta, _ONE_DAY): + return self._infer_daily_rule() + + # Business hourly, maybe. 17: one day / 65: one weekend + if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): + return 'BH' + # Possibly intraday frequency. Here we use the + # original .asi8 values as the modified values + # will not work around DST transitions. See #8772 + elif not self.is_unique_asi8: + return None + + delta = self.deltas_asi8[0] + if _is_multiple(delta, _ONE_HOUR): + # Hours + return _maybe_add_count('H', delta / _ONE_HOUR) + elif _is_multiple(delta, _ONE_MINUTE): + # Minutes + return _maybe_add_count('T', delta / _ONE_MINUTE) + elif _is_multiple(delta, _ONE_SECOND): + # Seconds + return _maybe_add_count('S', delta / _ONE_SECOND) + elif _is_multiple(delta, _ONE_MILLI): + # Milliseconds + return _maybe_add_count('L', delta / _ONE_MILLI) + elif _is_multiple(delta, _ONE_MICRO): + # Microseconds + return _maybe_add_count('U', delta / _ONE_MICRO) + else: + # Nanoseconds + return _maybe_add_count('N', delta) + + @cache_readonly + def day_deltas(self): + return [x / _ONE_DAY for x in self.deltas] + + @cache_readonly + def hour_deltas(self): + return [x / _ONE_HOUR for x in self.deltas] + + @cache_readonly + def fields(self): + return build_field_sarray(self.values) + + @cache_readonly + def rep_stamp(self): + return Timestamp(self.values[0]) + + def month_position_check(self): + return libresolution.month_position_check(self.fields, + self.index.dayofweek) + + @cache_readonly + def mdiffs(self): + nmonths = self.fields['Y'] * 12 + self.fields['M'] + return unique_deltas(nmonths.astype('i8')) + + @cache_readonly + def ydiffs(self): + return unique_deltas(self.fields['Y'].astype('i8')) + + def _infer_daily_rule(self): + annual_rule = self._get_annual_rule() + if annual_rule: + nyears = self.ydiffs[0] + month = MONTH_ALIASES[self.rep_stamp.month] + alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month) + return _maybe_add_count(alias, nyears) + + quarterly_rule = self._get_quarterly_rule() + if quarterly_rule: + nquarters = self.mdiffs[0] / 3 + mod_dict = {0: 12, 2: 11, 1: 10} + month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]] + alias = '{prefix}-{month}'.format(prefix=quarterly_rule, + month=month) + return _maybe_add_count(alias, nquarters) + + monthly_rule = self._get_monthly_rule() + if monthly_rule: + return _maybe_add_count(monthly_rule, self.mdiffs[0]) + + if self.is_unique: + days = self.deltas[0] / _ONE_DAY + if days % 7 == 0: + # Weekly + day = int_to_weekday[self.rep_stamp.weekday()] + return _maybe_add_count( + 'W-{day}'.format(day=day), days / 7) + else: + return _maybe_add_count('D', days) + + if self._is_business_daily(): + return 'B' + + wom_rule = self._get_wom_rule() + if wom_rule: + return wom_rule + + def _get_annual_rule(self): + if len(self.ydiffs) > 1: + return None + + if len(unique(self.fields['M'])) > 1: + return None + + pos_check = self.month_position_check() + return {'cs': 'AS', 'bs': 'BAS', + 'ce': 'A', 'be': 'BA'}.get(pos_check) + + def _get_quarterly_rule(self): + if len(self.mdiffs) > 1: + return None + + if not self.mdiffs[0] % 3 == 0: + return None + + pos_check = self.month_position_check() + return {'cs': 'QS', 'bs': 'BQS', + 'ce': 'Q', 'be': 'BQ'}.get(pos_check) + + def _get_monthly_rule(self): + if len(self.mdiffs) > 1: + return None + pos_check = self.month_position_check() + return {'cs': 'MS', 'bs': 'BMS', + 'ce': 'M', 'be': 'BM'}.get(pos_check) + + def _is_business_daily(self): + # quick check: cannot be business daily + if self.day_deltas != [1, 3]: + return False + + # probably business daily, but need to confirm + first_weekday = self.index[0].weekday() + shifts = np.diff(self.index.asi8) + shifts = np.floor_divide(shifts, _ONE_DAY) + weekdays = np.mod(first_weekday + np.cumsum(shifts), 7) + return np.all(((weekdays == 0) & (shifts == 3)) | + ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))) + + def _get_wom_rule(self): + # wdiffs = unique(np.diff(self.index.week)) + # We also need -47, -49, -48 to catch index spanning year boundary + # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): + # return None + + weekdays = unique(self.index.weekday) + if len(weekdays) > 1: + return None + + week_of_months = unique((self.index.day - 1) // 7) + # Only attempt to infer up to WOM-4. See #9425 + week_of_months = week_of_months[week_of_months < 4] + if len(week_of_months) == 0 or len(week_of_months) > 1: + return None + + # get which week + week = week_of_months[0] + 1 + wd = int_to_weekday[weekdays[0]] + + return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) + + +class _TimedeltaFrequencyInferer(_FrequencyInferer): + + def _infer_daily_rule(self): + if self.is_unique: + days = self.deltas[0] / _ONE_DAY + if days % 7 == 0: + # Weekly + wd = int_to_weekday[self.rep_stamp.weekday()] + alias = 'W-{weekday}'.format(weekday=wd) + return _maybe_add_count(alias, days / 7) + else: + return _maybe_add_count('D', days) + + +def _is_multiple(us, mult): + return us % mult == 0 + + +def _maybe_add_count(base, count): + if count != 1: + assert count == int(count) + count = int(count) + return '{count}{base}'.format(count=count, base=base) + else: + return base