Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Price repair fixes #1688

Merged
merged 1 commit into from
Sep 22, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 91 additions & 33 deletions yfinance/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,8 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):

@utils.log_indent_decorator
def _fix_unit_mixups(self, df, interval, tz_exchange, prepost):
if df.empty:
return df
df2 = self._fix_unit_switch(df, interval, tz_exchange)
df3 = self._fix_unit_random_mixups(df2, interval, tz_exchange, prepost)
return df3
Expand All @@ -842,6 +844,9 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
# - a sudden switch between $<->cents at some date
# This function fixes the first.

if df.empty:
return df

# Easy to detect and fix, just look for outliers = ~100x local median
logger = utils.get_yf_logger()

Expand Down Expand Up @@ -885,7 +890,11 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
ratio = df2_data / median
ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20
f = ratio_rounded == 100
if not f.any():
ratio_rcp = 1.0/ratio
ratio_rcp_rounded = (ratio_rcp / 20).round() * 20 # round ratio to nearest 20
f_rcp = (ratio_rounded == 100) | (ratio_rcp_rounded == 100)
f_either = f | f_rcp
if not f_either.any():
logger.info("price-repair-100x: No sporadic 100x errors")
if "Repaired?" not in df.columns:
df["Repaired?"] = False
Expand All @@ -894,7 +903,7 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
# Mark values to send for repair
tag = -1.0
for i in range(len(data_cols)):
fi = f[:, i]
fi = f_either[:, i]
c = data_cols[i]
df2.loc[fi, c] = tag

Expand All @@ -906,35 +915,43 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
if n_after > 0:
# This second pass will *crudely* "fix" any remaining errors in High/Low
# simply by ensuring they don't contradict e.g. Low = 100x High.
f = df2_tagged
f = (df2[data_cols].to_numpy() == tag) & f
for i in range(f.shape[0]):
fi = f[i, :]
if not fi.any():
continue
idx = df2.index[i]

c = "Open"
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df.loc[idx, c] * 0.01
#
c = "Close"
j = data_cols.index(c)
for c in ['Open', 'Close']:
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df.loc[idx, c] * 0.01

c = "High" ; j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df.loc[idx, c] * 0.01
#
c = "Adj Close"
j = data_cols.index(c)
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max()

c = "Low" ; j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df.loc[idx, c] * 0.01
#
c = "High"
j = data_cols.index(c)
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min()

f_rcp = (df2[data_cols].to_numpy() == tag) & f_rcp
for i in range(f_rcp.shape[0]):
fi = f_rcp[i, :]
if not fi.any():
continue
idx = df2.index[i]

for c in ['Open', 'Close']:
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df.loc[idx, c] * 100.0

c = "High" ; j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max()
#
c = "Low"
j = data_cols.index(c)

c = "Low" ; j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min()

Expand All @@ -953,9 +970,9 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
logger.info('price-repair-100x: ' + report_msg)

# Restore original values where repair failed
f = df2_tagged
f_either = df2[data_cols].to_numpy() == tag
for j in range(len(data_cols)):
fj = f[:, j]
fj = f_either[:, j]
if fj.any():
c = data_cols[j]
df2.loc[fj, c] = df.loc[fj, c]
Expand All @@ -977,14 +994,6 @@ def _fix_unit_switch(self, df, interval, tz_exchange):
# This function fixes the second.
# Eventually Yahoo fixes but could take them 2 weeks.

# To detect, use 'bad split adjustment' algorithm. But only correct
# if no stock splits in data

f_splits = df['Stock Splits'].to_numpy() != 0.0
if f_splits.any():
utils.get_yf_logger().debug('price-repair-100x: Cannot check for chunked 100x errors because splits present')
return df

return self._fix_prices_sudden_change(df, interval, tz_exchange, 100.0)

@utils.log_indent_decorator
Expand All @@ -993,6 +1002,9 @@ def _fix_zeroes(self, df, interval, tz_exchange, prepost):
# But most times when prices=0 or NaN returned is because no trades.
# Impossible to distinguish, so only attempt repair if few or rare.

if df.empty:
return df

logger = utils.get_yf_logger()

if df.shape[0] == 0:
Expand Down Expand Up @@ -1101,6 +1113,9 @@ def _fix_missing_div_adjust(self, df, interval, tz_exchange):
# Easy to detect and correct BUT ONLY IF the data 'df' includes today's dividend.
# E.g. if fetching historic prices before todays dividend, then cannot fix.

if df.empty:
return df

logger = utils.get_yf_logger()

if df is None or df.empty:
Expand Down Expand Up @@ -1173,6 +1188,9 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange):
# which direction to reverse adjustment - have to analyse prices and detect.
# Not difficult.

if df.empty:
return df

logger = utils.get_yf_logger()

interday = interval in ['1d', '1wk', '1mo', '3mo']
Expand All @@ -1198,6 +1216,9 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange):

@utils.log_indent_decorator
def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_volume=False):
if df.empty:
return df

logger = utils.get_yf_logger()

df = df.sort_index(ascending=False)
Expand Down Expand Up @@ -1262,11 +1283,25 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
# Avoid using 'Low' and 'High'. For multiday intervals, these can be
# very volatile so reduce ability to detect genuine stock split errors
_1d_change_x = np.full((n, 2), 1.0)
price_data = df2[['Open','Close']].replace(0.0, 1.0).to_numpy()
price_data = df2[['Open','Close']].to_numpy()
f_zero = price_data == 0.0
else:
_1d_change_x = np.full((n, 4), 1.0)
price_data = df2[OHLC].replace(0.0, 1.0).to_numpy()
price_data = df2[OHLC].to_numpy()
f_zero = price_data == 0.0
if f_zero.any():
price_data[f_zero] = 1.0

# Update: if a VERY large dividend is paid out, then can be mistaken for a 1:2 stock split.
# Fix = use adjusted prices
adj = df2['Adj Close'].to_numpy() / df2['Close'].to_numpy()
for j in range(price_data.shape[1]):
price_data[:,j] *= adj

_1d_change_x[1:] = price_data[1:, ] / price_data[:-1, ]
f_zero_num_denom = f_zero | np.roll(f_zero, 1, axis=0)
if f_zero_num_denom.any():
_1d_change_x[f_zero_num_denom] = 1.0
if interday and interval != '1d':
# average change
_1d_change_minx = np.average(_1d_change_x, axis=1)
Expand Down Expand Up @@ -1365,6 +1400,29 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
logger.info(f'price-repair-split: No {fix_type}s detected')
return df

# Update: if any 100x changes are soon after a stock split, so could be confused with split error, then abort
threshold_days = 30
f_splits = df['Stock Splits'].to_numpy() != 0.0
if change in [100.0, 0.01] and f_splits.any():
indices_A = np.where(f_splits)[0]
indices_B = np.where(f)[0]
if not len(indices_A) or not len(indices_B):
return None
gaps = indices_B[:, None] - indices_A
# Because data is sorted in DEscending order, need to flip gaps
gaps *= -1
f_pos = gaps > 0
if f_pos.any():
gap_min = gaps[f_pos].min()
gap_td = utils._interval_to_timedelta(interval) * gap_min
if isinstance(gap_td, _dateutil.relativedelta.relativedelta):
threshold = _dateutil.relativedelta.relativedelta(days=threshold_days)
else:
threshold = _datetime.timedelta(days=threshold_days)
if gap_td < threshold:
logger.info(f'price-repair-split: 100x changes are too soon after stock split events, aborting')
return df

# if logger.isEnabledFor(logging.DEBUG):
# df_debug['i'] = list(range(0, df_debug.shape[0]))
# df_debug['i_rev'] = df_debug.shape[0]-1 - df_debug['i']
Expand Down