Move FrequencyInferer out of libresolution #21992

Merged: 12 commits, Jul 25, 2018
29 changes: 29 additions & 0 deletions pandas/_libs/algos.pyx
@@ -22,6 +22,11 @@ cnp.import_array()
cimport util
from util cimport numeric, get_nat

from khash cimport (khiter_t,
                    kh_destroy_int64, kh_put_int64,
                    kh_init_int64, kh_int64_t,
                    kh_resize_int64, kh_get_int64)

import missing

cdef float64_t FP_ERR = 1e-13
@@ -71,6 +76,30 @@ class NegInfinity(object):
__ge__ = lambda self, other: isinstance(other, NegInfinity)


cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
Contributor: as an aside, I think we usually call this `diff` elsewhere, so maybe we can share code, and might think about renaming (a future PR to think about).

Contributor: can you add a doc-string?

    cdef:
        Py_ssize_t i, n = len(arr)
        int64_t val
        khiter_t k
        kh_int64_t *table
        int ret = 0
        list uniques = []

    table = kh_init_int64()
    kh_resize_int64(table, 10)
    for i in range(n - 1):
        val = arr[i + 1] - arr[i]
        k = kh_get_int64(table, val)
        if k == table.n_buckets:
            kh_put_int64(table, val, &ret)
            uniques.append(val)
    kh_destroy_int64(table)

    result = np.array(uniques, dtype=np.int64)
    result.sort()
    return result

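For intuition, `unique_deltas` computes the sorted unique first-differences of an int64 array; the khash table is just a fast membership check. A rough pure-NumPy sketch of the same result (illustrative only — `unique_deltas_reference` is a hypothetical name, not part of this PR):

```python
import numpy as np

def unique_deltas_reference(arr):
    # np.diff gives arr[i + 1] - arr[i]; np.unique sorts and deduplicates,
    # matching the khash-based Cython version above.
    return np.unique(np.diff(np.asarray(arr, dtype=np.int64)))

# e.g. stamps spaced 5, then 10, then 5 units apart -> deltas [5, 10]
assert unique_deltas_reference([0, 5, 15, 20]).tolist() == [5, 10]
```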

@cython.wraparound(False)
@cython.boundscheck(False)
def is_lexsorted(list list_of_arrays):
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/period.pyx
@@ -927,7 +927,8 @@ def extract_freq(ndarray[object] values):
# -----------------------------------------------------------------------
# period helpers


@cython.wraparound(False)
@cython.boundscheck(False)
cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps,
                                                 int freq, object tz):
    cdef:
289 changes: 2 additions & 287 deletions pandas/_libs/tslibs/resolution.pyx
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# cython: profile=False

cimport cython
from cython cimport Py_ssize_t

import numpy as np
@@ -10,23 +11,12 @@ cnp.import_array()

from util cimport is_string_object, get_nat

from pandas._libs.khash cimport (khiter_t,
                                 kh_destroy_int64, kh_put_int64,
                                 kh_init_int64, kh_int64_t,
                                 kh_resize_int64, kh_get_int64)

from np_datetime cimport npy_datetimestruct, dt64_to_dtstruct
from frequencies cimport get_freq_code
from timezones cimport (is_utc, is_tzlocal,
                        maybe_get_tz, get_dst_info)
from fields import build_field_sarray
from conversion import tz_convert
from conversion cimport tz_convert_utc_to_tzlocal
from ccalendar import MONTH_ALIASES, int_to_weekday
from ccalendar cimport get_days_in_month
from timestamps import Timestamp

from pandas._libs.properties import cache_readonly

# ----------------------------------------------------------------------
# Constants
@@ -41,13 +31,6 @@ cdef int RESO_MIN = 4
cdef int RESO_HR = 5
cdef int RESO_DAY = 6

_ONE_MICRO = <int64_t>1000L
_ONE_MILLI = <int64_t>(_ONE_MICRO * 1000)
_ONE_SECOND = <int64_t>(_ONE_MILLI * 1000)
_ONE_MINUTE = <int64_t>(60 * _ONE_SECOND)
_ONE_HOUR = <int64_t>(60 * _ONE_MINUTE)
_ONE_DAY = <int64_t>(24 * _ONE_HOUR)

# ----------------------------------------------------------------------

cpdef resolution(ndarray[int64_t] stamps, tz=None):
@@ -331,31 +314,7 @@ class Resolution(object):
# ----------------------------------------------------------------------
# Frequency Inference

cdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
    cdef:
        Py_ssize_t i, n = len(arr)
        int64_t val
        khiter_t k
        kh_int64_t *table
        int ret = 0
        list uniques = []

    table = kh_init_int64()
    kh_resize_int64(table, 10)
    for i in range(n - 1):
        val = arr[i + 1] - arr[i]
        k = kh_get_int64(table, val)
        if k == table.n_buckets:
            kh_put_int64(table, val, &ret)
            uniques.append(val)
    kh_destroy_int64(table)

    result = np.array(uniques, dtype=np.int64)
    result.sort()
    return result


cdef object month_position_check(fields, weekdays):
def month_position_check(fields, weekdays):

Contributor: doc-string in future; does this need cdef?

    cdef:
        int32_t daysinmonth, y, m, d
        bint calendar_end = True

@@ -397,247 +356,3 @@ cdef object month_position_check(fields, weekdays):
        return 'bs'
    else:
        return None


cdef inline bint _is_multiple(int64_t us, int64_t mult):
    return us % mult == 0


cdef inline str _maybe_add_count(str base, int64_t count):
    if count != 1:
        return '{count}{base}'.format(count=count, base=base)
    else:
        return base

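`_maybe_add_count` prepends the multiple only when it is not 1, so a 3-hour spacing becomes '3H' while a 1-hour spacing stays plain 'H'. A plain-Python sketch of the behavior (the cdef helper itself is not importable from Python, so the name here is hypothetical):

```python
def maybe_add_count(base, count):
    # mirror of _maybe_add_count: '3H' for count=3, bare 'H' for count=1
    return '{count}{base}'.format(count=count, base=base) if count != 1 else base

assert maybe_add_count('H', 3) == '3H'
assert maybe_add_count('H', 1) == 'H'
```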

cdef class _FrequencyInferer(object):
    """
    Not sure if I can avoid the state machine here
    """
    cdef public:
        object index
        object values
        bint warn
        bint is_monotonic
        dict _cache

    def __init__(self, index, warn=True):
        self.index = index
        self.values = np.asarray(index).view('i8')

        # This moves the values, which are implicitly in UTC, to
        # the timezone so they are in local time
        if hasattr(index, 'tz'):
            if index.tz is not None:
                self.values = tz_convert(self.values, 'UTC', index.tz)

        self.warn = warn

        if len(index) < 3:
            raise ValueError('Need at least 3 dates to infer frequency')

        self.is_monotonic = (self.index.is_monotonic_increasing or
                             self.index.is_monotonic_decreasing)

    @cache_readonly
    def deltas(self):
        return unique_deltas(self.values)

    @cache_readonly
    def deltas_asi8(self):
        return unique_deltas(self.index.asi8)

    @cache_readonly
    def is_unique(self):
        return len(self.deltas) == 1

    @cache_readonly
    def is_unique_asi8(self):
        return len(self.deltas_asi8) == 1

    def get_freq(self):
        if not self.is_monotonic or not self.index.is_unique:
            return None

        delta = self.deltas[0]
        if _is_multiple(delta, _ONE_DAY):
            return self._infer_daily_rule()
        else:
            # Business hourly, maybe. 17: one day / 65: one weekend
            if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
                return 'BH'
            # Possibly intraday frequency. Here we use the
            # original .asi8 values as the modified values
            # will not work around DST transitions. See #8772
            elif not self.is_unique_asi8:
                return None

            delta = self.deltas_asi8[0]
            if _is_multiple(delta, _ONE_HOUR):
                # Hours
                return _maybe_add_count('H', delta / _ONE_HOUR)
            elif _is_multiple(delta, _ONE_MINUTE):
                # Minutes
                return _maybe_add_count('T', delta / _ONE_MINUTE)
            elif _is_multiple(delta, _ONE_SECOND):
                # Seconds
                return _maybe_add_count('S', delta / _ONE_SECOND)
            elif _is_multiple(delta, _ONE_MILLI):
                # Milliseconds
                return _maybe_add_count('L', delta / _ONE_MILLI)
            elif _is_multiple(delta, _ONE_MICRO):
                # Microseconds
                return _maybe_add_count('U', delta / _ONE_MICRO)
            else:
                # Nanoseconds
                return _maybe_add_count('N', delta)

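For sub-daily spacings, `get_freq` walks the units from hours down to nanoseconds and emits the first one that divides the single unique delta (in nanoseconds) evenly. A hypothetical standalone sketch of that cascade (the constants mirror the module's `_ONE_*` values; `infer_subdaily` is not part of the PR):

```python
ONE_MICRO = 1000
ONE_MILLI = ONE_MICRO * 1000
ONE_SECOND = ONE_MILLI * 1000
ONE_MINUTE = 60 * ONE_SECOND
ONE_HOUR = 60 * ONE_MINUTE

def infer_subdaily(delta):
    # the first unit (largest to smallest) that evenly divides delta wins;
    # the 1-ns terminal case always matches
    for unit, alias in [(ONE_HOUR, 'H'), (ONE_MINUTE, 'T'), (ONE_SECOND, 'S'),
                        (ONE_MILLI, 'L'), (ONE_MICRO, 'U'), (1, 'N')]:
        if delta % unit == 0:
            count = delta // unit
            return alias if count == 1 else '{}{}'.format(count, alias)

assert infer_subdaily(15 * ONE_MINUTE) == '15T'
assert infer_subdaily(90 * ONE_SECOND) == '90S'
```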
    @cache_readonly
    def day_deltas(self):
        return [x / _ONE_DAY for x in self.deltas]

    @cache_readonly
    def hour_deltas(self):
        return [x / _ONE_HOUR for x in self.deltas]

    @cache_readonly
    def fields(self):
        return build_field_sarray(self.values)

    @cache_readonly
    def rep_stamp(self):
        return Timestamp(self.values[0])

    cdef object month_position_check(self):
        return month_position_check(self.fields, self.index.dayofweek)

    @cache_readonly
    def mdiffs(self):
        nmonths = self.fields['Y'] * 12 + self.fields['M']
        return unique_deltas(nmonths.astype('i8'))

    @cache_readonly
    def ydiffs(self):
        return unique_deltas(self.fields['Y'].astype('i8'))

    cdef _infer_daily_rule(self):
        annual_rule = self._get_annual_rule()
        if annual_rule:
            nyears = self.ydiffs[0]
            month = MONTH_ALIASES[self.rep_stamp.month]
            alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month)
            return _maybe_add_count(alias, nyears)

        quarterly_rule = self._get_quarterly_rule()
        if quarterly_rule:
            nquarters = self.mdiffs[0] / 3
            mod_dict = {0: 12, 2: 11, 1: 10}
            month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
            alias = '{prefix}-{month}'.format(prefix=quarterly_rule,
                                              month=month)
            return _maybe_add_count(alias, nquarters)

        monthly_rule = self._get_monthly_rule()
        if monthly_rule:
            return _maybe_add_count(monthly_rule, self.mdiffs[0])

        if self.is_unique:
            days = self.deltas[0] / _ONE_DAY
            if days % 7 == 0:
                # Weekly
                day = int_to_weekday[self.rep_stamp.weekday()]
                return _maybe_add_count('W-{day}'.format(day=day), days / 7)
            else:
                return _maybe_add_count('D', days)

        if self._is_business_daily():
            return 'B'

        wom_rule = self._get_wom_rule()
        if wom_rule:
            return wom_rule

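A worked example of the quarterly branch above: stamps at 2017-02-28, 2017-05-31, 2017-08-31, 2017-11-30 give `mdiffs == [3]`, and `rep_stamp.month % 3 == 2` maps through `mod_dict` to month 11, so a calendar quarter-end series is reported as 'Q-NOV'. A hypothetical sketch of just that lookup:

```python
MONTH_ALIASES = {10: 'OCT', 11: 'NOV', 12: 'DEC'}  # subset needed here

def quarterly_anchor(rep_month):
    # months {1,4,7,10} -> OCT, {2,5,8,11} -> NOV, {3,6,9,12} -> DEC
    mod_dict = {0: 12, 2: 11, 1: 10}
    return MONTH_ALIASES[mod_dict[rep_month % 3]]

assert quarterly_anchor(2) == 'NOV'   # Feb/May/Aug/Nov quarter ends
assert quarterly_anchor(12) == 'DEC'  # Mar/Jun/Sep/Dec quarter ends
```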
    cdef _get_annual_rule(self):
        if len(self.ydiffs) > 1:
            return None

        # lazy import to prevent circularity
        # TODO: Avoid non-cython dependency
        from pandas.core.algorithms import unique

        if len(unique(self.fields['M'])) > 1:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'AS', 'bs': 'BAS',
                'ce': 'A', 'be': 'BA'}.get(pos_check)

    cdef _get_quarterly_rule(self):
        if len(self.mdiffs) > 1:
            return None

        if not self.mdiffs[0] % 3 == 0:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'QS', 'bs': 'BQS',
                'ce': 'Q', 'be': 'BQ'}.get(pos_check)

    cdef _get_monthly_rule(self):
        if len(self.mdiffs) > 1:
            return None
        pos_check = self.month_position_check()
        return {'cs': 'MS', 'bs': 'BMS',
                'ce': 'M', 'be': 'BM'}.get(pos_check)

    cdef bint _is_business_daily(self):
        # quick check: cannot be business daily
        if self.day_deltas != [1, 3]:
            return False

        # probably business daily, but need to confirm
        first_weekday = self.index[0].weekday()
        shifts = np.diff(self.index.asi8)
        shifts = np.floor_divide(shifts, _ONE_DAY)
        weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
        return np.all(((weekdays == 0) & (shifts == 3)) |
                      ((weekdays > 0) & (weekdays <= 4) & (shifts == 1)))

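`_is_business_daily` first screens on `day_deltas == [1, 3]` (one-day steps within the week, three-day jumps over weekends), then confirms that every 3-day jump lands on a Monday and every 1-day step lands on Tuesday through Friday. A hypothetical standalone sketch of that confirmation step:

```python
import numpy as np

ONE_DAY_NS = 24 * 60 * 60 * 10**9  # mirrors the module's _ONE_DAY

def is_business_daily(asi8, first_weekday):
    # shifts in whole days between consecutive stamps
    shifts = np.floor_divide(np.diff(np.asarray(asi8, dtype=np.int64)),
                             ONE_DAY_NS)
    # weekday (Mon=0) of each stamp after the first
    weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
    # 3-day jumps must land on Monday; 1-day steps on Tue-Fri
    return bool(np.all(((weekdays == 0) & (shifts == 3)) |
                       ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))))

# Thu, Fri, Mon, Tue: shifts [1, 3, 1] land on weekdays [4, 0, 1]
assert is_business_daily([0, 1 * ONE_DAY_NS, 4 * ONE_DAY_NS, 5 * ONE_DAY_NS], 3)
```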
    cdef _get_wom_rule(self):
        # wdiffs = unique(np.diff(self.index.week))
        # We also need -47, -49, -48 to catch index spanning year boundary
        # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
        #     return None

        # lazy import to prevent circularity
        # TODO: Avoid non-cython dependency
        from pandas.core.algorithms import unique

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) == 0 or len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = int_to_weekday[weekdays[0]]

        return 'WOM-{week}{weekday}'.format(week=week, weekday=wd)

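For week-of-month inference, all stamps must share a single weekday and a single `(day - 1) // 7` bucket below 4; for instance, third Fridays always fall on month days 15-21, giving bucket 2 and hence 'WOM-3FRI'. A small illustrative check (the day values below are hypothetical third-Friday dates):

```python
days = [20, 17, 17, 21]                # .day values, one per month
week_of_months = {(d - 1) // 7 for d in days}
assert week_of_months == {2}           # single bucket -> week = 2 + 1
print('WOM-{week}FRI'.format(week=2 + 1))  # -> WOM-3FRI
```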

cdef class _TimedeltaFrequencyInferer(_FrequencyInferer):

    cdef _infer_daily_rule(self):
        if self.is_unique:
            days = self.deltas[0] / _ONE_DAY
            if days % 7 == 0:
                # Weekly
                wd = int_to_weekday[self.rep_stamp.weekday()]
                alias = 'W-{weekday}'.format(weekday=wd)
                return _maybe_add_count(alias, days / 7)
            else:
                return _maybe_add_count('D', days)