From ec3de0710d664e9a86691459c2c37687a79506ba Mon Sep 17 00:00:00 2001
From: Alessandro Di Felice
Date: Fri, 1 Sep 2023 14:07:53 -0500
Subject: [PATCH 1/4] Fix pandas warning when retrieving quotes

---
 yfinance/utils.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/yfinance/utils.py b/yfinance/utils.py
index a9a16adb2..86eaa99b2 100644
--- a/yfinance/utils.py
+++ b/yfinance/utils.py
@@ -628,22 +628,22 @@ def fix_Yahoo_returning_live_separate(quotes, interval, tz_exchange):
             # Yahoo is not returning live data (phew!)
             return quotes
         if _np.isnan(quotes.loc[idx2, "Open"]):
-            quotes.loc[idx2, "Open"] = quotes["Open"][n - 1]
+            quotes.loc[idx2, "Open"] = quotes["Open"].iloc[n - 1]
         # Note: nanmax() & nanmin() ignores NaNs, but still need to check not all are NaN to avoid warnings
-        if not _np.isnan(quotes["High"][n - 1]):
-            quotes.loc[idx2, "High"] = _np.nanmax([quotes["High"][n - 1], quotes["High"][n - 2]])
+        if not _np.isnan(quotes["High"].iloc[n - 1]):
+            quotes.loc[idx2, "High"] = _np.nanmax([quotes["High"].iloc[n - 1], quotes["High"].iloc[n - 2]])
             if "Adj High" in quotes.columns:
-                quotes.loc[idx2, "Adj High"] = _np.nanmax([quotes["Adj High"][n - 1], quotes["Adj High"][n - 2]])
+                quotes.loc[idx2, "Adj High"] = _np.nanmax([quotes["Adj High"].iloc[n - 1], quotes["Adj High"].iloc[n - 2]])
 
-        if not _np.isnan(quotes["Low"][n - 1]):
-            quotes.loc[idx2, "Low"] = _np.nanmin([quotes["Low"][n - 1], quotes["Low"][n - 2]])
+        if not _np.isnan(quotes["Low"].iloc[n - 1]):
+            quotes.loc[idx2, "Low"] = _np.nanmin([quotes["Low"].iloc[n - 1], quotes["Low"].iloc[n - 2]])
             if "Adj Low" in quotes.columns:
-                quotes.loc[idx2, "Adj Low"] = _np.nanmin([quotes["Adj Low"][n - 1], quotes["Adj Low"][n - 2]])
+                quotes.loc[idx2, "Adj Low"] = _np.nanmin([quotes["Adj Low"].iloc[n - 1], quotes["Adj Low"].iloc[n - 2]])
 
-        quotes.loc[idx2, "Close"] = quotes["Close"][n - 1]
+        quotes.loc[idx2, "Close"] = quotes["Close"].iloc[n - 1]
         if "Adj Close" in quotes.columns:
-            quotes.loc[idx2, "Adj Close"] = quotes["Adj Close"][n - 1]
-        quotes.loc[idx2, "Volume"] += quotes["Volume"][n - 1]
+            quotes.loc[idx2, "Adj Close"] = quotes["Adj Close"].iloc[n - 1]
+            quotes.loc[idx2, "Volume"] += quotes["Volume"].iloc[n - 1]
         quotes = quotes.drop(quotes.index[n - 1])
     return quotes
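For context on the warning [PATCH 1/4] silences: pandas 2.1 deprecated
Series.__getitem__ treating integer keys as positions when the index is not
integer-typed. The quotes table carries a DatetimeIndex, so every
quotes["Open"][n - 1] lookup is positional and emits a FutureWarning. A
minimal sketch reproducing the warning and the .iloc fix (assumes
pandas >= 2.1; the two-row frame is illustrative, not real Yahoo data):

    import pandas as pd

    quotes = pd.DataFrame(
        {"Open": [1.0, 2.0], "Close": [1.5, 2.5]},
        index=pd.to_datetime(["2023-09-01 09:30", "2023-09-01 10:00"]),
    )
    n = quotes.shape[0]

    # Warns: "Series.__getitem__ treating keys as positions is deprecated"
    last_open = quotes["Open"][n - 1]

    # Positional access is what .iloc is for; same value, no warning:
    assert quotes["Open"].iloc[n - 1] == last_open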
From 937386f3ef1050248b7b5408f199d4781fcd6317 Mon Sep 17 00:00:00 2001
From: 조다니엘 (Daniel Cho)
Date: Tue, 19 Sep 2023 10:02:28 +0900
Subject: [PATCH 2/4] Fix error when calling enable_debug_mode twice

---
 yfinance/utils.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/yfinance/utils.py b/yfinance/utils.py
index 86eaa99b2..f83744303 100644
--- a/yfinance/utils.py
+++ b/yfinance/utils.py
@@ -169,14 +169,15 @@ def setup_debug_formatting():
         yf_logger.warning("logging mode not set to 'DEBUG', so not setting up debug formatting")
         return
 
-    if yf_logger.handlers is None or len(yf_logger.handlers) == 0:
-        h = logging.StreamHandler()
-        # Ensure different level strings don't interfere with indentation
-        formatter = MultiLineFormatter(fmt='%(levelname)-8s %(message)s')
-        h.setFormatter(formatter)
-        yf_logger.addHandler(h)
-
     global yf_log_indented
+    if not yf_log_indented:
+        if yf_logger.handlers is None or len(yf_logger.handlers) == 0:
+            h = logging.StreamHandler()
+            # Ensure different level strings don't interfere with indentation
+            formatter = MultiLineFormatter(fmt='%(levelname)-8s %(message)s')
+            h.setFormatter(formatter)
+            yf_logger.addHandler(h)
+
     yf_log_indented = True
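The fix in [PATCH 2/4] is the usual idempotence guard: record in a
module-level flag that setup already ran, and make later calls no-ops. A
sketch of the pattern with illustrative names (the real code guards
yfinance's MultiLineFormatter setup behind the yf_log_indented flag):

    import logging

    _debug_configured = False

    def setup_debug_formatting():
        global _debug_configured
        if _debug_configured:
            return  # second call is a no-op
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(levelname)-8s %(message)s"))
        logging.getLogger("yfinance").addHandler(handler)
        _debug_configured = True

    setup_debug_formatting()
    setup_debug_formatting()  # safe: only one handler attached
    assert len(logging.getLogger("yfinance").handlers) == 1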
From d3dfb4c6a82716ca0abd303617cb4dbaf4b99b66 Mon Sep 17 00:00:00 2001
From: Value Raider
Date: Sun, 17 Sep 2023 21:44:06 +0100
Subject: [PATCH 3/4] Fix merging events with intraday prices

If Yahoo returned intraday price data with a dividend or stock-split event
in the future, the merge broke. The fix is to discard out-of-range events,
on the assumption that a user requesting intraday data isn't interested in
events.
---
 tests/prices.py   | 37 ++++++++++++++++++++++
 yfinance/utils.py | 81 ++++++++++++++++++++++++++---------------------
 2 files changed, 82 insertions(+), 36 deletions(-)

diff --git a/tests/prices.py b/tests/prices.py
index 8d9d49532..a3e485e18 100644
--- a/tests/prices.py
+++ b/tests/prices.py
@@ -114,6 +114,43 @@ def test_duplicatingWeekly(self):
         if not test_run:
             self.skipTest("Skipping test_duplicatingWeekly() because not possible to fail Monday/weekend")
 
+    def test_pricesEventsMerge(self):
+        # Test case: dividend occurs after last row in price data
+        tkr = 'INTC'
+        start_d = _dt.date(2022, 1, 1)
+        end_d = _dt.date(2023, 1, 1)
+        df = yf.Ticker(tkr, session=self.session).history(interval='1d', start=start_d, end=end_d)
+        div = 1.0
+        future_div_dt = df.index[-1] + _dt.timedelta(days=1)
+        if future_div_dt.weekday() in [5, 6]:
+            future_div_dt += _dt.timedelta(days=1) * (7 - future_div_dt.weekday())
+        divs = _pd.DataFrame(data={"Dividends":[div]}, index=[future_div_dt])
+        df2 = yf.utils.safe_merge_dfs(df.drop(['Dividends', 'Stock Splits'], axis=1), divs, '1d')
+        self.assertIn(future_div_dt, df2.index)
+        self.assertIn("Dividends", df2.columns)
+        self.assertEqual(df2['Dividends'].iloc[-1], div)
+
+    def test_pricesEventsMerge_bug(self):
+        # Reproduce exception when merging intraday prices with future dividend
+        tkr = 'S32.AX'
+        interval = '30m'
+        df_index = []
+        d = 13
+        for h in range(0, 16):
+            for m in [0, 30]:
+                df_index.append(_dt.datetime(2023, 9, d, h, m))
+        df_index.append(_dt.datetime(2023, 9, d, 16))
+        df = _pd.DataFrame(index=df_index)
+        df.index = _pd.to_datetime(df.index)
+        df['Close'] = 1.0
+
+        div = 1.0
+        future_div_dt = _dt.datetime(2023, 9, 14, 10)
+        divs = _pd.DataFrame(data={"Dividends":[div]}, index=[future_div_dt])
+
+        df2 = yf.utils.safe_merge_dfs(df, divs, interval)
+        # No exception = test pass
+
     def test_intraDayWithEvents(self):
         tkrs = ["BHP.AX", "IMP.JO", "BP.L", "PNL.L", "INTC"]
         test_run = False
diff --git a/yfinance/utils.py b/yfinance/utils.py
index 86eaa99b2..82c1e1a3e 100644
--- a/yfinance/utils.py
+++ b/yfinance/utils.py
@@ -650,8 +650,10 @@ def fix_Yahoo_returning_live_separate(quotes, interval, tz_exchange):
 
 
 def safe_merge_dfs(df_main, df_sub, interval):
-    if df_sub.shape[0] == 0:
+    if df_sub.empty:
         raise Exception("No data to merge")
+    if df_main.empty:
+        return df_main
 
     df_sub_backup = df_sub.copy()
     data_cols = [c for c in df_sub.columns if c not in df_main]
@@ -675,47 +677,54 @@ def safe_merge_dfs(df_main, df_sub, interval):
     else:
         indices = _np.searchsorted(_np.append(df_main.index, df_main.index[-1] + td), df_sub.index, side='right')
         indices -= 1  # Convert from [[i-1], [i]) to [[i], [i+1])
-    # Numpy.searchsorted does not handle out-of-range well, so handle manually:
-    for i in range(len(df_sub.index)):
-        dt = df_sub.index[i]
-        if dt < df_main.index[0] or dt >= df_main.index[-1] + td:
-            # Out-of-range
-            indices[i] = -1
+        # Numpy.searchsorted does not handle out-of-range well, so handle manually:
+        for i in range(len(df_sub.index)):
+            dt = df_sub.index[i]
+            if dt < df_main.index[0] or dt >= df_main.index[-1] + td:
+                # Out-of-range
+                indices[i] = -1
 
     f_outOfRange = indices == -1
-    if f_outOfRange.any() and not intraday:
-        empty_row_data = {c:[_np.nan] for c in const.price_colnames}|{'Volume':[0]}
-        if interval == '1d':
-            # For 1d, add all out-of-range event dates
-            for i in _np.where(f_outOfRange)[0]:
-                dt = df_sub.index[i]
-                get_yf_logger().debug(f"Adding out-of-range {data_col} @ {dt.date()} in new prices row of NaNs")
-                empty_row = _pd.DataFrame(data=empty_row_data, index=[dt])
-                df_main = _pd.concat([df_main, empty_row], sort=True)
+    if f_outOfRange.any():
+        if intraday:
+            # Discard out-of-range dividends in intraday data, assume user not interested
+            df_sub = df_sub[~f_outOfRange]
+            if df_sub.empty:
+                df_main['Dividends'] = 0.0
+                return df_main
         else:
-            # Else, only add out-of-range event dates if occurring in interval
-            # immediately after last pricfe row
-            last_dt = df_main.index[-1]
-            next_interval_start_dt = last_dt + td
-            next_interval_end_dt = next_interval_start_dt + td
-            for i in _np.where(f_outOfRange)[0]:
-                dt = df_sub.index[i]
-                if next_interval_start_dt <= dt < next_interval_end_dt:
-                    new_dt = next_interval_start_dt
+            empty_row_data = {c:[_np.nan] for c in const.price_colnames}|{'Volume':[0]}
+            if interval == '1d':
+                # For 1d, add all out-of-range event dates
+                for i in _np.where(f_outOfRange)[0]:
+                    dt = df_sub.index[i]
                     get_yf_logger().debug(f"Adding out-of-range {data_col} @ {dt.date()} in new prices row of NaNs")
                     empty_row = _pd.DataFrame(data=empty_row_data, index=[dt])
                     df_main = _pd.concat([df_main, empty_row], sort=True)
-        df_main = df_main.sort_index()
-
-        # Re-calculate indices
-        indices = _np.searchsorted(_np.append(df_main.index, df_main.index[-1] + td), df_sub.index, side='right')
-        indices -= 1  # Convert from [[i-1], [i]) to [[i], [i+1])
-        # Numpy.searchsorted does not handle out-of-range well, so handle manually:
-        for i in range(len(df_sub.index)):
-            dt = df_sub.index[i]
-            if dt < df_main.index[0] or dt >= df_main.index[-1] + td:
-                # Out-of-range
-                indices[i] = -1
+            else:
+                # Else, only add out-of-range event dates if occurring in interval
+                # immediately after last price row
+                last_dt = df_main.index[-1]
+                next_interval_start_dt = last_dt + td
+                next_interval_end_dt = next_interval_start_dt + td
+                for i in _np.where(f_outOfRange)[0]:
+                    dt = df_sub.index[i]
+                    if next_interval_start_dt <= dt < next_interval_end_dt:
+                        new_dt = next_interval_start_dt
+                        get_yf_logger().debug(f"Adding out-of-range {data_col} @ {dt.date()} in new prices row of NaNs")
+                        empty_row = _pd.DataFrame(data=empty_row_data, index=[dt])
+                        df_main = _pd.concat([df_main, empty_row], sort=True)
+            df_main = df_main.sort_index()
+
+            # Re-calculate indices
+            indices = _np.searchsorted(_np.append(df_main.index, df_main.index[-1] + td), df_sub.index, side='right')
+            indices -= 1  # Convert from [[i-1], [i]) to [[i], [i+1])
+            # Numpy.searchsorted does not handle out-of-range well, so handle manually:
+            for i in range(len(df_sub.index)):
+                dt = df_sub.index[i]
+                if dt < df_main.index[0] or dt >= df_main.index[-1] + td:
+                    # Out-of-range
+                    indices[i] = -1
 
     f_outOfRange = indices == -1
     if f_outOfRange.any():
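The bucketing that [PATCH 3/4] reworks maps each event timestamp to the
price row whose interval contains it, with -1 as the out-of-range sentinel
that intraday merging then discards. A self-contained sketch of that logic
over made-up 30-minute bars (not real S32.AX data):

    import numpy as np
    import pandas as pd

    bars = pd.date_range("2023-09-13 10:00", periods=4, freq="30min")
    td = pd.Timedelta("30min")
    events = pd.to_datetime(["2023-09-13 10:45",   # falls inside bar 1
                             "2023-09-14 10:00"])  # next day, out of range

    # Append one upper bound so the last interval is [last, last + td):
    edges = np.append(bars.values, (bars[-1] + td).to_numpy())
    indices = np.searchsorted(edges, events.values, side="right") - 1

    # searchsorted does not flag out-of-range hits, so mark them manually:
    f_oob = (events < bars[0]) | (events >= bars[-1] + td)
    indices[f_oob] = -1
    print(indices)  # -> [ 1 -1]; intraday merging drops the -1 event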
From 5208c8cf05df7174abc9ed984602fa5d442af31c Mon Sep 17 00:00:00 2001
From: Value Raider
Date: Tue, 19 Sep 2023 19:38:16 +0100
Subject: [PATCH 4/4] Price repair improvements

Price repair improvements:
- don't attempt repair of empty prices table
- random-mixups: fix 0.01x errors, not just 100x
- stop zeroes, big-dividends, and 100x errors triggering false split errors
---
 yfinance/base.py | 124 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 91 insertions(+), 33 deletions(-)

diff --git a/yfinance/base.py b/yfinance/base.py
index 46f6a09c4..8122a4a10 100644
--- a/yfinance/base.py
+++ b/yfinance/base.py
@@ -829,6 +829,8 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):
 
     @utils.log_indent_decorator
     def _fix_unit_mixups(self, df, interval, tz_exchange, prepost):
+        if df.empty:
+            return df
         df2 = self._fix_unit_switch(df, interval, tz_exchange)
         df3 = self._fix_unit_random_mixups(df2, interval, tz_exchange, prepost)
         return df3
@@ -842,6 +844,9 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
         # - a sudden switch between $<->cents at some date
         # This function fixes the first.
 
+        if df.empty:
+            return df
+
         # Easy to detect and fix, just look for outliers = ~100x local median
 
         logger = utils.get_yf_logger()
@@ -885,7 +890,11 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
             ratio = df2_data / median
             ratio_rounded = (ratio / 20).round() * 20  # round ratio to nearest 20
             f = ratio_rounded == 100
-            if not f.any():
+            ratio_rcp = 1.0/ratio
+            ratio_rcp_rounded = (ratio_rcp / 20).round() * 20  # round ratio to nearest 20
+            f_rcp = (ratio_rounded == 100) | (ratio_rcp_rounded == 100)
+            f_either = f | f_rcp
+            if not f_either.any():
                 logger.info("price-repair-100x: No sporadic 100x errors")
                 if "Repaired?" not in df.columns:
                     df["Repaired?"] = False
@@ -894,7 +903,7 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
             # Mark values to send for repair
             tag = -1.0
             for i in range(len(data_cols)):
-                fi = f[:, i]
+                fi = f_either[:, i]
                 c = data_cols[i]
                 df2.loc[fi, c] = tag
 
@@ -906,35 +915,43 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
             if n_after > 0:
                 # This second pass will *crudely* "fix" any remaining errors in High/Low
                 # simply by ensuring they don't contradict e.g. Low = 100x High.
- f = df2_tagged + f = (df2[data_cols].to_numpy() == tag) & f for i in range(f.shape[0]): fi = f[i, :] if not fi.any(): continue idx = df2.index[i] - c = "Open" - j = data_cols.index(c) - if fi[j]: - df2.loc[idx, c] = df.loc[idx, c] * 0.01 - # - c = "Close" - j = data_cols.index(c) + for c in ['Open', 'Close']: + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df.loc[idx, c] * 0.01 + + c = "High" ; j = data_cols.index(c) if fi[j]: - df2.loc[idx, c] = df.loc[idx, c] * 0.01 - # - c = "Adj Close" - j = data_cols.index(c) + df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max() + + c = "Low" ; j = data_cols.index(c) if fi[j]: - df2.loc[idx, c] = df.loc[idx, c] * 0.01 - # - c = "High" - j = data_cols.index(c) + df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min() + + f_rcp = (df2[data_cols].to_numpy() == tag) & f_rcp + for i in range(f_rcp.shape[0]): + fi = f_rcp[i, :] + if not fi.any(): + continue + idx = df2.index[i] + + for c in ['Open', 'Close']: + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df.loc[idx, c] * 100.0 + + c = "High" ; j = data_cols.index(c) if fi[j]: df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max() - # - c = "Low" - j = data_cols.index(c) + + c = "Low" ; j = data_cols.index(c) if fi[j]: df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min() @@ -953,9 +970,9 @@ def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost): logger.info('price-repair-100x: ' + report_msg) # Restore original values where repair failed - f = df2_tagged + f_either = df2[data_cols].to_numpy() == tag for j in range(len(data_cols)): - fj = f[:, j] + fj = f_either[:, j] if fj.any(): c = data_cols[j] df2.loc[fj, c] = df.loc[fj, c] @@ -977,14 +994,6 @@ def _fix_unit_switch(self, df, interval, tz_exchange): # This function fixes the second. # Eventually Yahoo fixes but could take them 2 weeks. - # To detect, use 'bad split adjustment' algorithm. But only correct - # if no stock splits in data - - f_splits = df['Stock Splits'].to_numpy() != 0.0 - if f_splits.any(): - utils.get_yf_logger().debug('price-repair-100x: Cannot check for chunked 100x errors because splits present') - return df - return self._fix_prices_sudden_change(df, interval, tz_exchange, 100.0) @utils.log_indent_decorator @@ -993,6 +1002,9 @@ def _fix_zeroes(self, df, interval, tz_exchange, prepost): # But most times when prices=0 or NaN returned is because no trades. # Impossible to distinguish, so only attempt repair if few or rare. + if df.empty: + return df + logger = utils.get_yf_logger() if df.shape[0] == 0: @@ -1101,6 +1113,9 @@ def _fix_missing_div_adjust(self, df, interval, tz_exchange): # Easy to detect and correct BUT ONLY IF the data 'df' includes today's dividend. # E.g. if fetching historic prices before todays dividend, then cannot fix. + if df.empty: + return df + logger = utils.get_yf_logger() if df is None or df.empty: @@ -1173,6 +1188,9 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange): # which direction to reverse adjustment - have to analyse prices and detect. # Not difficult. 
+        if df.empty:
+            return df
+
         logger = utils.get_yf_logger()
 
         interday = interval in ['1d', '1wk', '1mo', '3mo']
@@ -1198,6 +1216,9 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange):
 
     @utils.log_indent_decorator
     def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_volume=False):
+        if df.empty:
+            return df
+
         logger = utils.get_yf_logger()
 
         df = df.sort_index(ascending=False)
@@ -1262,11 +1283,25 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
             # Avoid using 'Low' and 'High'. For multiday intervals, these can be
             # very volatile so reduce ability to detect genuine stock split errors
             _1d_change_x = np.full((n, 2), 1.0)
-            price_data = df2[['Open','Close']].replace(0.0, 1.0).to_numpy()
+            price_data = df2[['Open','Close']].to_numpy()
+            f_zero = price_data == 0.0
         else:
             _1d_change_x = np.full((n, 4), 1.0)
-            price_data = df2[OHLC].replace(0.0, 1.0).to_numpy()
+            price_data = df2[OHLC].to_numpy()
+            f_zero = price_data == 0.0
+        if f_zero.any():
+            price_data[f_zero] = 1.0
+
+        # Update: if a VERY large dividend is paid out, it can be mistaken for a 1:2 stock split.
+        # Fix = use adjusted prices
+        adj = df2['Adj Close'].to_numpy() / df2['Close'].to_numpy()
+        for j in range(price_data.shape[1]):
+            price_data[:,j] *= adj
+
         _1d_change_x[1:] = price_data[1:, ] / price_data[:-1, ]
+        f_zero_num_denom = f_zero | np.roll(f_zero, 1, axis=0)
+        if f_zero_num_denom.any():
+            _1d_change_x[f_zero_num_denom] = 1.0
         if interday and interval != '1d':
             # average change
             _1d_change_minx = np.average(_1d_change_x, axis=1)
@@ -1365,6 +1400,29 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v
             logger.info(f'price-repair-split: No {fix_type}s detected')
             return df
 
+        # Update: if any 100x changes occur soon after a stock split, they could be confused with a split error, so abort
+        threshold_days = 30
+        f_splits = df['Stock Splits'].to_numpy() != 0.0
+        if change in [100.0, 0.01] and f_splits.any():
+            indices_A = np.where(f_splits)[0]
+            indices_B = np.where(f)[0]
+            if not len(indices_A) or not len(indices_B):
+                return None
+            gaps = indices_B[:, None] - indices_A
+            # Because data is sorted in DEscending order, need to flip gaps
+            gaps *= -1
+            f_pos = gaps > 0
+            if f_pos.any():
+                gap_min = gaps[f_pos].min()
+                gap_td = utils._interval_to_timedelta(interval) * gap_min
+                if isinstance(gap_td, _dateutil.relativedelta.relativedelta):
+                    threshold = _dateutil.relativedelta.relativedelta(days=threshold_days)
+                else:
+                    threshold = _datetime.timedelta(days=threshold_days)
+                if gap_td < threshold:
+                    logger.info(f'price-repair-split: 100x changes are too soon after stock split events, aborting')
+                    return df
+
         # if logger.isEnabledFor(logging.DEBUG):
         #     df_debug['i'] = list(range(0, df_debug.shape[0]))
         #     df_debug['i_rev'] = df_debug.shape[0]-1 - df_debug['i']
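For reference, the detection that the 100x repair in [PATCH 4/4]
generalises: round the price-to-local-median ratio to the nearest multiple
of 20 and flag rows landing on 100; the reciprocal test catches 0.01x rows
the same way. A toy illustration with made-up values (the real code works
per OHLC column against a local median of neighbouring rows and restores
any value it cannot confidently repair):

    import numpy as np

    close = np.array([10.2, 10.4, 1040.0, 10.3, 0.105, 10.6])
    median = np.median(close)                        # ~10.35, the local level

    ratio = close / median
    f_100x = (ratio / 20).round() * 20 == 100        # sporadic 100x error
    f_001x = (1.0 / ratio / 20).round() * 20 == 100  # sporadic 0.01x error

    repaired = close.copy()
    repaired[f_100x] *= 0.01
    repaired[f_001x] *= 100.0
    print(repaired)  # -> [10.2 10.4 10.4 10.3 10.5 10.6]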