From b49fd797fc29f234d5540cf90c03a275e01021fe Mon Sep 17 00:00:00 2001
From: ValueRaider
Date: Wed, 11 Jan 2023 17:41:34 +0000
Subject: [PATCH 01/10] Fix & improve price repair

Fix repair calibration & volume=0 repair ; Extend repair to sub-hour ;
Avoid attempting repair of mostly-NaN days
---
 yfinance/base.py | 169 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 117 insertions(+), 52 deletions(-)

diff --git a/yfinance/base.py b/yfinance/base.py
index 34deef9b0..68353e800 100644
--- a/yfinance/base.py
+++ b/yfinance/base.py
@@ -354,6 +354,21 @@ def history(self, period="1mo", interval="1d",
             else:
                 df["Capital Gains"] = 0.0
 
+        # # Drop any rows that are too close in time to previous row
+        # td = utils._interval_to_timedelta(interval)
+        # steps = _np.full(df.shape[0], td)
+        # steps[1:] = df.index[1:] - df.index[0:df.shape[0]-1]
+        # df["step"] = steps ; print(df) ; raise Exception("here")
+        # if td >= pd.Timedelta("1d"):
+        #     # Allow for DST
+        #     f_drop = steps < (td-pd.Timedelta('1h'))
+        # else:
+        #     f_drop = steps < td
+        # if f_drop.any():
+        #     print(df)
+        #     raise Exception("Dropping too-close rows @", df.index[f_drop])
+        #     df = df[~f_drop].copy()
+
         if repair:
             # Do this before auto/back adjust
             df = self._fix_zeroes(df, interval, tz_exchange)
@@ -407,25 +422,27 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1):
 
         # Reconstruct values in df using finer-grained price data. Delimiter marks what to reconstruct
 
+        debug = False
+        # debug = True
+
         price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df]
         data_cols = price_cols + ["Volume"]
 
         # If interval is weekly then can construct with daily. But if smaller intervals then
         # restricted to recent times:
-        # - daily = hourly restricted to last 730 days
-        sub_interval = None
-        td_range = None
-        if interval == "1wk":
-            # Correct by fetching week of daily data
-            sub_interval = "1d"
-            td_range = _datetime.timedelta(days=7)
-        elif interval == "1d":
-            # Correct by fetching day of hourly data
-            sub_interval = "1h"
-            td_range = _datetime.timedelta(days=1)
-        elif interval == "1h":
-            sub_interval = "30m"
-            td_range = _datetime.timedelta(hours=1)
+        intervals = ["1wk", "1d", "1h", "30m", "15m", "5m", "2m", "1m"]
+        itds = {i:utils._interval_to_timedelta(i) for i in intervals}
+        nexts = {intervals[i]:intervals[i+1] for i in range(len(intervals)-1)}
+        min_lookbacks = {"1wk":None, "1d":None, "1h":_datetime.timedelta(days=730)}
+        for i in ["30m", "15m", "5m", "2m"]:
+            min_lookbacks[i] = _datetime.timedelta(days=60)
+        min_lookbacks["1m"] = _datetime.timedelta(days=30)
+        # Hopefully never have to use max_lengths, because complicates fetch logic
+        # max_lengths = {i:None for i in intervals}
+        # max_lengths["1m"] = _datetime.timedelta(days=7)
+        if interval in nexts:
+            sub_interval = nexts[interval]
+            td_range = itds[interval]
         else:
             print("WARNING: Have not implemented repair for '{}' interval. 
Contact developers".format(interval)) raise Exception("why here") @@ -437,15 +454,13 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): f_repair_rows = f_repair.any(axis=1) # Ignore old intervals for which Yahoo won't return finer data: - if sub_interval == "1h": - f_recent = _datetime.date.today() - df.index.date < _datetime.timedelta(days=730) + m = min_lookbacks[sub_interval] + if m is not None: + f_recent = _datetime.date.today() - df.index.date < m f_repair_rows = f_repair_rows & f_recent - elif sub_interval in ["30m", "15m"]: - f_recent = _datetime.date.today() - df.index.date < _datetime.timedelta(days=60) - f_repair_rows = f_repair_rows & f_recent - if not f_repair_rows.any(): - print("data too old to fix") - return df + if not f_repair_rows.any(): + print("data too old to fix") + return df dts_to_repair = df.index[f_repair_rows] indices_to_repair = _np.where(f_repair_rows)[0] @@ -454,7 +469,9 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): return df df_v2 = df.copy() - df_noNa = df[~df[price_cols].isna().any(axis=1)] + f_good = ~(df[price_cols].isna().any(axis=1)) + f_good = f_good & (df[price_cols].to_numpy()!=tag).all(axis=1) + df_good = df[f_good] # Group nearby NaN-intervals together to reduce number of Yahoo fetches dts_groups = [[dts_to_repair[0]]] @@ -471,7 +488,6 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): grp_td_threshold = _datetime.timedelta(days=7) else: grp_td_threshold = _datetime.timedelta(days=2) - # grp_td_threshold = _datetime.timedelta(days=7) for i in range(1, len(dts_to_repair)): ind = indices_to_repair[i] dt = dts_to_repair[i] @@ -488,25 +504,35 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): for i in range(len(dts_groups)): g = dts_groups[i] g0 = g[0] - i0 = df_noNa.index.get_loc(g0) + i0 = df_good.index.get_indexer([g0], method="nearest")[0] if i0 > 0: - dts_groups[i].insert(0, df_noNa.index[i0-1]) + i0 -= 1 gl = g[-1] - il = df_noNa.index.get_loc(gl) - if il < len(df_noNa)-1: - dts_groups[i].append(df_noNa.index[il+1]) + il = df_good.index.get_indexer([gl], method="nearest")[0] + if il < len(df_good)-1: + il += 1 + good_dts = df_good.index[i0:il+1] + dts_groups[i] += good_dts.to_list() + dts_groups[i].sort() n_fixed = 0 for g in dts_groups: df_block = df[df.index.isin(g)] + if debug: + print("- df_block:") + print(df_block) start_dt = g[0] start_d = start_dt.date() if sub_interval == "1h" and (_datetime.date.today() - start_d) > _datetime.timedelta(days=729): # Don't bother requesting more price data, Yahoo will reject + if debug: + print(f"- Don't bother requesting {sub_interval} price data, Yahoo will reject") continue elif sub_interval in ["30m", "15m"] and (_datetime.date.today() - start_d) > _datetime.timedelta(days=59): # Don't bother requesting more price data, Yahoo will reject + if debug: + print(f"- Don't bother requesting {sub_interval} price data, Yahoo will reject") continue td_1d = _datetime.timedelta(days=1) @@ -528,7 +554,6 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): df_fine["ctr"] = 0 if interval == "1wk": - # df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-SUN").start_time weekdays = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] week_end_day = weekdays[(df_block.index[0].weekday()+7-1)%7] df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-"+week_end_day).start_time @@ -557,31 +582,36 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): new_index = _np.append([df_fine.index[0]], 
df_fine.index[df_fine["intervalID"].diff()>0]) df_new.index = new_index + if debug: + print("- df_new:") + print(df_new) + # Calibrate! Check whether 'df_fine' has different split-adjustment. # If different, then adjust to match 'df' - df_block_calib = df_block[price_cols] - common_index = df_block_calib.index[df_block_calib.index.isin(df_new.index)] + common_index = _np.intersect1d(df_block.index, df_new.index) if len(common_index) == 0: # Can't calibrate so don't attempt repair + if debug: + print("Can't calibrate so don't attempt repair") continue - df_new_calib = df_new[df_new.index.isin(common_index)][price_cols] - df_block_calib = df_block_calib[df_block_calib.index.isin(common_index)] - calib_filter = (df_block_calib != tag).to_numpy() + df_new_calib = df_new[df_new.index.isin(common_index)][price_cols].to_numpy() + df_block_calib = df_block[df_block.index.isin(common_index)][price_cols].to_numpy() + calib_filter = (df_block_calib != tag) if not calib_filter.any(): # Can't calibrate so don't attempt repair + if debug: + print("Can't calibrate so don't attempt repair") continue - # Avoid divide-by-zero warnings printing: - df_new_calib = df_new_calib.to_numpy() - df_block_calib = df_block_calib.to_numpy() + # Avoid divide-by-zero warnings: for j in range(len(price_cols)): - c = price_cols[j] f = ~calib_filter[:,j] if f.any(): df_block_calib[f,j] = 1 df_new_calib[f,j] = 1 - ratios = (df_block_calib / df_new_calib)[calib_filter] + ratios = df_block_calib[calib_filter] / df_new_calib[calib_filter] ratio = _np.mean(ratios) - # + if debug: + print(f"- price calibration ratio = {ratio}") ratio_rcp = round(1.0 / ratio, 1) ratio = round(ratio, 1) if ratio == 1 and ratio_rcp == 1: @@ -600,13 +630,14 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): df_new["Volume"] *= ratio_rcp # Repair! - bad_dts = df_block.index[(df_block[price_cols]==tag).any(axis=1)] + bad_dts = df_block.index[(df_block[price_cols+["Volume"]]==tag).any(axis=1)] for idx in bad_dts: if not idx in df_new.index: # Yahoo didn't return finer-grain data for this interval, # so probably no trading happened. - # print("no fine data") + if debug: + print(f"Yahoo didn't return finer-grain data for interval {idx}") continue df_new_row = df_new.loc[idx] @@ -635,6 +666,9 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): df_v2.loc[idx, "Volume"] = df_new_row["Volume"] n_fixed += 1 + if debug: + print("df_v2:") ; print(df_v2) + return df_v2 def _fix_unit_mixups(self, df, interval, tz_exchange): @@ -659,7 +693,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): # adding it to dependencies. from scipy import ndimage as _ndimage - data_cols = ["High", "Open", "Low", "Close"] # Order important, separate High from Low + data_cols = ["High", "Open", "Low", "Close", "Adj Close"] # Order important, separate High from Low data_cols = [c for c in data_cols if c in df2.columns] f_zeroes = (df2[data_cols]==0).any(axis=1) if f_zeroes.any(): @@ -707,6 +741,11 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): if fi[j]: df2.loc[idx, c] = df.loc[idx, c] * 0.01 # + c = "Adj Close" + j = data_cols.index(c) + if fi[j]: + df2.loc[idx, c] = df.loc[idx, c] * 0.01 + # c = "High" j = data_cols.index(c) if fi[j]: @@ -749,6 +788,12 @@ def _fix_zeroes(self, df, interval, tz_exchange): if df.shape[0] == 0: return df + debug = False + # debug = True + + intraday = interval[-1] in ("m", 'h') + + df = df.sort_index() # important! 
df2 = df.copy() if df2.index.tz is None: @@ -757,16 +802,29 @@ def _fix_zeroes(self, df, interval, tz_exchange): df2.index = df2.index.tz_convert(tz_exchange) price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df2.columns] - f_zero_or_nan = (df2[price_cols] == 0.0).values | df2[price_cols].isna().values + f_zero_or_nan = (df2[price_cols] == 0.0) | df2[price_cols].isna() + df2_reserve = None + if intraday: + # Ignore days with >50% intervals containing NaNs + df_nans = pd.DataFrame(f_zero_or_nan.any(axis=1), columns=["nan"]) + df_nans["_date"] = df_nans.index.date + grp = df_nans.groupby("_date") + nan_pct = grp.sum() / grp.count() + dts = nan_pct.index[nan_pct["nan"]>0.5] + f_zero_or_nan_ignore = _np.isin(f_zero_or_nan.index.date, dts) + df2_reserve = df2[f_zero_or_nan_ignore] + df2 = df2[~f_zero_or_nan_ignore] + f_zero_or_nan = (df2[price_cols] == 0.0) | df2[price_cols].isna() # Check whether worth attempting repair + f_zero_or_nan = f_zero_or_nan.to_numpy() if f_zero_or_nan.any(axis=1).sum() == 0: + if debug: + print("no bad data to repair") return df if f_zero_or_nan.sum() == len(price_cols)*len(df2): # Need some good data to calibrate - return df - # - avoid repair if many zeroes/NaNs - pct_zero_or_nan = f_zero_or_nan.sum() / (len(price_cols)*len(df2)) - if f_zero_or_nan.any(axis=1).sum()>2 and pct_zero_or_nan > 0.05: + if debug: + print("no good data to calibrate") return df data_cols = price_cols + ["Volume"] @@ -777,15 +835,22 @@ def _fix_zeroes(self, df, interval, tz_exchange): c = price_cols[i] df2.loc[f_zero_or_nan[:,i], c] = tag # If volume=0 or NaN for bad prices, then tag volume for repair - df2.loc[f_zero_or_nan.any(axis=1) & (df2["Volume"]==0), "Volume"] = tag - df2.loc[f_zero_or_nan.any(axis=1) & (df2["Volume"].isna()), "Volume"] = tag + f_vol_zero_or_nan = (df2["Volume"].to_numpy()==0) | (df2["Volume"].isna().to_numpy()) + df2.loc[f_zero_or_nan.any(axis=1) & f_vol_zero_or_nan, "Volume"] = tag + # If volume=0 or NaN but price moved in interval, then tag volume for repair + f_change = df2["High"].to_numpy() != df2["Low"].to_numpy() + df2.loc[f_change & f_vol_zero_or_nan, "Volume"] = tag n_before = (df2[data_cols].to_numpy()==tag).sum() df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag) n_after = (df2[data_cols].to_numpy()==tag).sum() n_fixed = n_before - n_after if n_fixed > 0: - print("{}: fixed {} price=0.0 errors in {} price data".format(self.ticker, n_fixed, interval)) + print(f"{self.ticker}: fixed {n_fixed}/{n_before} value=0 errors in {interval} price data") + + if df2_reserve is not None: + df2 = _pd.concat([df2, df2_reserve]) + df2 = df2.sort_index() # Restore original values where repair failed (i.e. 
remove tag values) f = df2[data_cols].values==tag From 7460dbea17b939acd9b747d0942bcb14e169445d Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sun, 15 Jan 2023 20:58:34 +0000 Subject: [PATCH 02/10] If reconstructing 1d interval with 1h, always request prepost --- yfinance/base.py | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 68353e800..aa71d27d0 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -354,25 +354,11 @@ def history(self, period="1mo", interval="1d", else: df["Capital Gains"] = 0.0 - # # Drop any rows that are too close in time to previous row - # td = utils._interval_to_timedelta(interval) - # steps = _np.full(df.shape[0], td) - # steps[1:] = df.index[1:] - df.index[0:df.shape[0]-1] - # df["step"] = steps ; print(df) ; raise Exception("here") - # if td >= pd.Timedelta("1d"): - # # Allow for DST - # f_drop = steps < (td-pd.Timedelta('1h')) - # else: - # f_drop = steps < td - # if f_drop.any(): - # print(df) - # raise Exception("Dropping too-close rows @", df.index[f_drop]) - # df = df[~f_drop].copy() if repair: # Do this before auto/back adjust - df = self._fix_zeroes(df, interval, tz_exchange) - df = self._fix_unit_mixups(df, interval, tz_exchange) + df = self._fix_zeroes(df, interval, tz_exchange, prepost) + df = self._fix_unit_mixups(df, interval, tz_exchange, prepost) # Auto/back adjust try: @@ -416,7 +402,7 @@ def history(self, period="1mo", interval="1d", # ------------------------ - def _reconstruct_intervals_batch(self, df, interval, tag=-1): + def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): if not isinstance(df, _pd.DataFrame): raise Exception("'df' must be a Pandas DataFrame not", type(df)) @@ -425,6 +411,10 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): debug = False # debug = True + if interval[1:] in ['d', 'wk', 'mo']: + # Interday data always includes pre & post + prepost = True + price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df] data_cols = price_cols + ["Volume"] @@ -546,7 +536,6 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): fetch_start = g[0] fetch_end = g[-1] + td_range - prepost = interval == "1d" df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair=False, keepna=True) if df_fine is None or df_fine.empty: print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval") @@ -671,7 +660,7 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1): return df_v2 - def _fix_unit_mixups(self, df, interval, tz_exchange): + def _fix_unit_mixups(self, df, interval, tz_exchange, prepost): # Sometimes Yahoo returns few prices in cents/pence instead of $/£ # I.e. 100x bigger # Easy to detect and fix, just look for outliers = ~100x local median @@ -718,7 +707,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): df2.loc[fi, c] = tag n_before = (df2[data_cols].to_numpy()==tag).sum() - df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag) + df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag=tag) n_after = (df2[data_cols].to_numpy()==tag).sum() if n_after > 0: @@ -780,7 +769,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange): return df2 - def _fix_zeroes(self, df, interval, tz_exchange): + def _fix_zeroes(self, df, interval, tz_exchange, prepost): # Sometimes Yahoo returns prices=0 or NaN when trades occurred. 
# But most times when prices=0 or NaN returned is because no trades. # Impossible to distinguish, so only attempt repair if few or rare. @@ -842,7 +831,7 @@ def _fix_zeroes(self, df, interval, tz_exchange): df2.loc[f_change & f_vol_zero_or_nan, "Volume"] = tag n_before = (df2[data_cols].to_numpy()==tag).sum() - df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag) + df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag=tag) n_after = (df2[data_cols].to_numpy()==tag).sum() n_fixed = n_before - n_after if n_fixed > 0: From 197d2968e3abf83713628ca7078404fe20eb7d1b Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Thu, 19 Jan 2023 22:19:16 +0000 Subject: [PATCH 03/10] Add 'repair_intervals', rename 'repair'->'repair_prices' --- README.md | 5 +- tests/prices.py | 48 ++++++++++++++++-- yfinance/base.py | 22 ++++++-- yfinance/utils.py | 127 +++++++++++++++++++++++++++++++++++++++------- 4 files changed, 174 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index c3ff752c3..27c10996a 100644 --- a/README.md +++ b/README.md @@ -225,8 +225,9 @@ data = yf.download( # or pdr.get_data_yahoo(... # (optional, default is False) auto_adjust = True, - # attempt repair of missing data or currency mixups e.g. $/cents - repair = False, + # attempt repair of Yahoo data issues + repair_prices = False, + repair_intervals = False, # download pre/post regular market hours data # (optional, default is False) diff --git a/tests/prices.py b/tests/prices.py index e0d722578..9eb983aec 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -261,6 +261,44 @@ def test_dst_fix(self): print("Weekly data not aligned to Monday") raise + def test_correct_early_close(self): + # Stockholm exchange closed early on 2022-12-23 @ 13:02. + # For hourly intervals, Yahoo returns: + # - 13:00 filled with NaNs + # - 13:02 contains data for 13:00 + # Test that 'repair' fixes this without affecting other intervals. 
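For reference, the flag added above is exercised below exactly as this test does; a minimal usage sketch (ticker and dates are the test's own, and note that patch 06 later removes 'repair_intervals' again):

    import yfinance as yf

    dat = yf.Ticker("AEC.ST")
    # Hourly bars across the early close; repair_intervals merges the stray
    # 13:02 row into its enclosing 13:00 interval.
    df = dat.history(start="2022-12-01", end="2023-01-01", interval="1h",
                     keepna=True, repair_intervals=True)
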
+ tkr = "AEC.ST" + d = "2022-12-23" + start = "2022-12-01" + end = "2023-01-01" + data_cols = ["Open","High","Low","Close","Volume","Dividends","Stock Splits"] + + dat = yf.Ticker(tkr, session=self.session) + df_old = dat.history(start=start, end=end, interval="1h", keepna=True) + df_repair = dat.history(start=start, end=end, interval="1h", keepna=True, repair_intervals=True) + + tz = df_old.index.tz + expected_intervals_fixed = [] + expected_intervals_fixed.append(tz.localize(_dt.datetime(2022,12,23,13,0))) + expected_intervals_lost = [] + expected_intervals_lost.append(tz.localize(_dt.datetime(2022,12,23,13,2))) + + # Test no unexpected intervals lost + dts_lost = df_old.index[~df_old.index.isin(df_repair.index)] + self.assertTrue(_np.equal(dts_lost.to_numpy(), expected_intervals_lost)) + + # Test only the expected interval changed + dts_shared = df_old.index[df_old.index.isin(df_repair.index)] + f_changed = (df_old.loc[dts_shared, data_cols].to_numpy() != df_repair.loc[dts_shared, data_cols].to_numpy()).any(axis=1) + self.assertTrue(f_changed.any(), "Expected data to change") + dts_changed = dts_shared[f_changed] + self.assertEqual(len(dts_changed), len(expected_intervals_fixed), "Different number of intervals changed") + self.assertTrue(_np.equal(dts_shared[f_changed], expected_intervals_fixed), "Unexpected intervals were changed") + + # Test the expected interval is valid data + f_na = df_repair.loc[expected_intervals_fixed, data_cols].isna().any(axis=1) + self.assertFalse(f_na.any(), "Repaired interval still contains NaNs") + def test_weekly_2rows_fix(self): tkr = "AMZN" start = _dt.date.today() - _dt.timedelta(days=14) @@ -298,7 +336,7 @@ def test_repair_100x_weekly(self): # Run test - df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange) + df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange, prepost=False) # First test - no errors left for c in data_cols: @@ -353,7 +391,7 @@ def test_repair_100x_weekly_preSplit(self): df.index = df.index.tz_localize(tz_exchange) df_bad.index = df_bad.index.tz_localize(tz_exchange) - df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange) + df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange, prepost=False) # First test - no errors left for c in data_cols: @@ -403,7 +441,7 @@ def test_repair_100x_daily(self): df.index = df.index.tz_localize(tz_exchange) df_bad.index = df_bad.index.tz_localize(tz_exchange) - df_repaired = dat._fix_unit_mixups(df_bad, "1d", tz_exchange) + df_repaired = dat._fix_unit_mixups(df_bad, "1d", tz_exchange, prepost=False) # First test - no errors left for c in data_cols: @@ -438,7 +476,7 @@ def test_repair_zeroes_daily(self): df_bad.index.name = "Date" df_bad.index = df_bad.index.tz_localize(tz_exchange) - repaired_df = dat._fix_zeroes(df_bad, "1d", tz_exchange) + repaired_df = dat._fix_zeroes(df_bad, "1d", tz_exchange, prepost=False) correct_df = df_bad.copy() correct_df.loc["2022-11-01", "Open"] = 102.080002 @@ -467,7 +505,7 @@ def test_repair_zeroes_hourly(self): df_bad.index.name = "Date" df_bad.index = df_bad.index.tz_localize(tz_exchange) - repaired_df = dat._fix_zeroes(df_bad, "1h", tz_exchange) + repaired_df = dat._fix_zeroes(df_bad, "1h", tz_exchange, prepost=False) correct_df = df_bad.copy() idx = _pd.Timestamp(2022,11,25, 12,30).tz_localize(tz_exchange) diff --git a/yfinance/base.py b/yfinance/base.py index aa71d27d0..ca26637c5 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -86,7 +86,10 @@ def stats(self, proxy=None): def history(self, period="1mo", interval="1d", 
                 start=None, end=None, prepost=False, actions=True,
-                auto_adjust=True, back_adjust=False, repair=False, keepna=False,
+                auto_adjust=True, back_adjust=False,
+                repair=None, # deprecated
+                repair_prices=False, repair_intervals=False,
+                keepna=False,
                 proxy=None, rounding=False, timeout=10,
                 debug=True, raise_errors=False) -> pd.DataFrame:
         """
@@ -110,9 +113,12 @@ def history(self, period="1mo", interval="1d",
             Adjust all OHLC automatically? Default is True
         back_adjust: bool
             Back-adjusted data to mimic true historical prices
-        repair: bool
+        repair_prices: bool
             Detect currency unit 100x mixups and attempt repair
             Default is False
+        repair_intervals: bool
+            Detect and repair unaligned intraday intervals, e.g. caused by an early market close
+            Default is False
         keepna: bool
             Keep NaN rows returned by Yahoo?
             Default is False
@@ -133,6 +139,11 @@ def history(self, period="1mo", interval="1d",
             exceptions instead of printing to console.
         """
 
+        # Handle deprecated arguments
+        if repair is not None:
+            print("WARNING: 'repair' is deprecated and will be removed in a future version. Use 'repair_prices' instead")
+            repair_prices = repair
+
         if start or period is None or period.lower() == "max":
             # Check can get TZ. Fail => probably delisted
             tz = self._get_ticker_tz(debug, proxy, timeout)
@@ -291,6 +302,9 @@ def history(self, period="1mo", interval="1d",
             quotes = utils.fix_Yahoo_dst_issue(quotes, params["interval"])
             quotes = utils.fix_Yahoo_returning_live_separate(quotes, params["interval"], tz_exchange)
 
+            if repair_intervals:
+                quotes = utils.fix_Yahoo_including_unaligned_intervals(quotes, params["interval"])
+
             # actions
             dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0])
             if not expect_capital_gains:
@@ -355,7 +369,7 @@ def history(self, period="1mo", interval="1d",
         else:
             df["Capital Gains"] = 0.0
 
-        if repair:
+        if repair_prices:
             # Do this before auto/back adjust
             df = self._fix_zeroes(df, interval, tz_exchange, prepost)
             df = self._fix_unit_mixups(df, interval, tz_exchange, prepost)
@@ -536,7 +550,7 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):
 
             fetch_start = g[0]
             fetch_end = g[-1] + td_range
-            df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair=False, keepna=True)
+            df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair_prices=False, keepna=True)
             if df_fine is None or df_fine.empty:
                 print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval")
                 continue
diff --git a/yfinance/utils.py b/yfinance/utils.py
index 48b043435..7922f19a2 100644
--- a/yfinance/utils.py
+++ b/yfinance/utils.py
@@ -49,6 +49,11 @@
     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
 
 
+def TypeCheckSeries(var, varName):
+    if not isinstance(var, _pd.Series) or isinstance(var, _pd.DataFrame):
+        raise TypeError(f"'{varName}' must be _pd.Series not {type(var)}")
+
+
 def is_isin(string):
     return bool(_re.match("^([A-Z]{2})([A-Z0-9]{9})([0-9]{1})$", string))
 
@@ -307,7 +312,7 @@ def _parse_user_dt(dt, exchange_tz):
 
 def _interval_to_timedelta(interval):
     if interval == "1mo":
-        return _dateutil.relativedelta(months=1)
+        return _dateutil.relativedelta.relativedelta(months=1)
     elif interval == "1wk":
         return _pd.Timedelta(days=7, unit='d')
     else:
@@ -459,26 +464,114 @@ def fix_Yahoo_returning_live_separate(quotes, interval, tz_exchange):
 
     if last_rows_same_interval:
         # Last two rows are within same interval
-        idx1 = quotes.index[n - 1]
-        idx2 = 
quotes.index[n - 2] - if _np.isnan(quotes.loc[idx2, "Open"]): - quotes.loc[idx2, "Open"] = quotes["Open"][n - 1] - # Note: nanmax() & nanmin() ignores NaNs - quotes.loc[idx2, "High"] = _np.nanmax([quotes["High"][n - 1], quotes["High"][n - 2]]) - quotes.loc[idx2, "Low"] = _np.nanmin([quotes["Low"][n - 1], quotes["Low"][n - 2]]) - quotes.loc[idx2, "Close"] = quotes["Close"][n - 1] - if "Adj High" in quotes.columns: - quotes.loc[idx2, "Adj High"] = _np.nanmax([quotes["Adj High"][n - 1], quotes["Adj High"][n - 2]]) - if "Adj Low" in quotes.columns: - quotes.loc[idx2, "Adj Low"] = _np.nanmin([quotes["Adj Low"][n - 1], quotes["Adj Low"][n - 2]]) - if "Adj Close" in quotes.columns: - quotes.loc[idx2, "Adj Close"] = quotes["Adj Close"][n - 1] - quotes.loc[idx2, "Volume"] += quotes["Volume"][n - 1] - quotes = quotes.drop(quotes.index[n - 1]) + ia = quotes.index[n - 2] + ib = quotes.index[n - 1] + quotes.loc[ia] = merge_two_prices_intervals(quotes.loc[ia], quotes.loc[ib]) + quotes = quotes.drop(ib) + + return quotes + + +def fix_Yahoo_including_unaligned_intervals(quotes, interval): + if interval[1] not in ['m', 'h']: + # Only correct intraday + return quotes + # Merge adjacent rows if in same interval + # e.g. 13:02pm with 13:00pm 1h interval + n = quotes.shape[0] + itd = _interval_to_timedelta(interval) + td0 = _pd.Timedelta(0) + iend = quotes.index + itd + if interval[1:] in ["d", "wk", "mo"]: + # # Allow for DST + # iend -= _pd.Timedelta('2h') + return quotes + steps = _np.full(n, td0) + steps[1:] = quotes.index[1:] - iend[0:n-1] + f_overlap = steps < td0 + if f_overlap.any(): + # Process overlaps one-at-time because some may be false positives. + # Recalculate subsequent step after removing an overlap. + overlaps_exist = True + n_merged = 0 + dts_to_drop = [] + while overlaps_exist: + indices = _np.where(f_overlap)[0] + i = indices[0] + dt1 = quotes.index[i-1] + dt2 = quotes.index[i] + dt3 = quotes.index[i+1] + + dropped_dt = dt2 + quotes.loc[dt1] = merge_two_prices_intervals(quotes.iloc[i-1], quotes.iloc[i]) + + # Remove record of i: + dts_to_drop.append(dt2) + f_overlap[i] = False + steps[i] = td0 + # Recalc step of following dt: + steps[i+1] = quotes.index[i+1] - iend[i-1] + + f_overlap[i+1] = steps[i+1] < td0 + overlaps_exist = f_overlap[i+1:].any() + # Useful debug code: + # for d in [str(dt.date()) for dt in dts_to_drop]: + # print(quotes.loc[d]) + print("Dropping unaligned intervals:", dts_to_drop) + quotes = quotes.drop(dts_to_drop) return quotes +def merge_two_prices_intervals(i1, i2): + TypeCheckSeries(i1, "i1") + TypeCheckSeries(i2, "i2") + + price_cols = ["Open", "High", "Low", "Close"] + na1 = i1[price_cols].isna().all() + na2 = i2[price_cols].isna().all() + if na1 and na2: + return i1 + elif na1: + return i2 + elif na2: + return i1 + + # First check if two intervals are almost identical. 
If yes, keep 2nd + ratio = _np.mean(i2[price_cols+["Volume"]] / i1[price_cols+["Volume"]]) + if ratio > 0.99 and ratio < 1.01: + return i2 + + m = i1.copy() + + if _np.isnan(m["Open"]): + m["Open"] = i2["Open"] + if "Adj Open" in m.index: + m["Adj Open"] = i2["Adj Open"] + + # Note: nanmax() & nanmin() ignores NaNs + m["High"] = _np.nanmax([i2["High"], i1["High"]]) + m["Low"] = _np.nanmin([i2["Low"], i1["Low"]]) + if not _np.isnan(i2["Close"]): + m["Close"] = i2["Close"] + + if "Adj High" in m.index: + m["Adj High"] = _np.nanmax([i2["Adj High"], i1["Adj High"]]) + if "Adj Low" in m.index: + m["Adj Low"] = _np.nanmin([i2["Adj Low"], i1["Adj Low"]]) + if "Adj Close" in m.index: + m["Adj Close"] = i2["Adj Close"] + + if _np.isnan(m["Volume"]): + m["Volume"] = i2["Volume"] + elif _np.isnan(i2["Volume"]): + pass + else: + m["Volume"] += i2["Volume"] + + return m + + def safe_merge_dfs(df_main, df_sub, interval): # Carefully merge 'df_sub' onto 'df_main' # If naive merge fails, try again with reindexing df_sub: From 65b97d024b211a40472e14170dbe84a40b2301ee Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Fri, 20 Jan 2023 00:13:02 +0000 Subject: [PATCH 04/10] Improve reporting --- yfinance/base.py | 2 +- yfinance/utils.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index ca26637c5..0dc32722f 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -463,7 +463,7 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): f_recent = _datetime.date.today() - df.index.date < m f_repair_rows = f_repair_rows & f_recent if not f_repair_rows.any(): - print("data too old to fix") + # print("data too old to repair") return df dts_to_repair = df.index[f_repair_rows] diff --git a/yfinance/utils.py b/yfinance/utils.py index 7922f19a2..4b4ab4f58 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -496,6 +496,7 @@ def fix_Yahoo_including_unaligned_intervals(quotes, interval): overlaps_exist = True n_merged = 0 dts_to_drop = [] + dts_merged = [] while overlaps_exist: indices = _np.where(f_overlap)[0] i = indices[0] @@ -505,6 +506,7 @@ def fix_Yahoo_including_unaligned_intervals(quotes, interval): dropped_dt = dt2 quotes.loc[dt1] = merge_two_prices_intervals(quotes.iloc[i-1], quotes.iloc[i]) + dts_merged.append((dt2, dt1)) # Remove record of i: dts_to_drop.append(dt2) @@ -515,10 +517,15 @@ def fix_Yahoo_including_unaligned_intervals(quotes, interval): f_overlap[i+1] = steps[i+1] < td0 overlaps_exist = f_overlap[i+1:].any() + # Useful debug code: # for d in [str(dt.date()) for dt in dts_to_drop]: # print(quotes.loc[d]) - print("Dropping unaligned intervals:", dts_to_drop) + # + # print("Dropped unaligned intervals:", dts_to_drop) + print(f"Removed {len(dts_merged)} unaligned intervals by merging:") + for i in range(len(dts_merged)): + print(f"- {dts_merged[i][0].date()}: {dts_merged[i][0].time()} -> {dts_merged[i][1].time()}") quotes = quotes.drop(dts_to_drop) return quotes From 1636839b67cf49784f0918706388338c9fe381a7 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Fri, 20 Jan 2023 00:13:28 +0000 Subject: [PATCH 05/10] Handle request to reconstruct 1m --- yfinance/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yfinance/base.py b/yfinance/base.py index 0dc32722f..a83cb5aa2 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -419,6 +419,9 @@ def history(self, period="1mo", interval="1d", def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): if not isinstance(df, _pd.DataFrame): raise 
Exception("'df' must be a Pandas DataFrame not", type(df)) + if interval == "1m": + # Can't go smaller than 1m so can't reconstruct + return df # Reconstruct values in df using finer-grained price data. Delimiter marks what to reconstruct From 2b0ae5a6c1993816511ec8da7bd62385f73a2dfb Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Fri, 20 Jan 2023 17:29:01 +0000 Subject: [PATCH 06/10] Remove 'repair_intervals' --- tests/prices.py | 38 ------------------------------- yfinance/base.py | 22 ++++-------------- yfinance/utils.py | 58 ----------------------------------------------- 3 files changed, 4 insertions(+), 114 deletions(-) diff --git a/tests/prices.py b/tests/prices.py index 9eb983aec..52ac94f33 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -261,44 +261,6 @@ def test_dst_fix(self): print("Weekly data not aligned to Monday") raise - def test_correct_early_close(self): - # Stockholm exchange closed early on 2022-12-23 @ 13:02. - # For hourly intervals, Yahoo returns: - # - 13:00 filled with NaNs - # - 13:02 contains data for 13:00 - # Test that 'repair' fixes this without affecting other intervals. - tkr = "AEC.ST" - d = "2022-12-23" - start = "2022-12-01" - end = "2023-01-01" - data_cols = ["Open","High","Low","Close","Volume","Dividends","Stock Splits"] - - dat = yf.Ticker(tkr, session=self.session) - df_old = dat.history(start=start, end=end, interval="1h", keepna=True) - df_repair = dat.history(start=start, end=end, interval="1h", keepna=True, repair_intervals=True) - - tz = df_old.index.tz - expected_intervals_fixed = [] - expected_intervals_fixed.append(tz.localize(_dt.datetime(2022,12,23,13,0))) - expected_intervals_lost = [] - expected_intervals_lost.append(tz.localize(_dt.datetime(2022,12,23,13,2))) - - # Test no unexpected intervals lost - dts_lost = df_old.index[~df_old.index.isin(df_repair.index)] - self.assertTrue(_np.equal(dts_lost.to_numpy(), expected_intervals_lost)) - - # Test only the expected interval changed - dts_shared = df_old.index[df_old.index.isin(df_repair.index)] - f_changed = (df_old.loc[dts_shared, data_cols].to_numpy() != df_repair.loc[dts_shared, data_cols].to_numpy()).any(axis=1) - self.assertTrue(f_changed.any(), "Expected data to change") - dts_changed = dts_shared[f_changed] - self.assertEqual(len(dts_changed), len(expected_intervals_fixed), "Different number of intervals changed") - self.assertTrue(_np.equal(dts_shared[f_changed], expected_intervals_fixed), "Unexpected intervals were changed") - - # Test the expected interval is valid data - f_na = df_repair.loc[expected_intervals_fixed, data_cols].isna().any(axis=1) - self.assertFalse(f_na.any(), "Repaired interval still contains NaNs") - def test_weekly_2rows_fix(self): tkr = "AMZN" start = _dt.date.today() - _dt.timedelta(days=14) diff --git a/yfinance/base.py b/yfinance/base.py index a83cb5aa2..5ab863377 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -86,9 +86,7 @@ def stats(self, proxy=None): def history(self, period="1mo", interval="1d", start=None, end=None, prepost=False, actions=True, - auto_adjust=True, back_adjust=False, - repair=None, # deprecated - repair_prices=False, repair_intervals=False, + auto_adjust=True, back_adjust=False, repair=None, keepna=False, proxy=None, rounding=False, timeout=10, debug=True, raise_errors=False) -> pd.DataFrame: @@ -113,12 +111,9 @@ def history(self, period="1mo", interval="1d", Adjust all OHLC automatically? 
Default is True
         back_adjust: bool
             Back-adjusted data to mimic true historical prices
-        repair_prices: bool
+        repair: bool
             Detect currency unit 100x mixups and attempt repair
             Default is False
-        repair_intervals: bool
-            Detect and repair unaligned intraday intervals, e.g. caused by an early market close
-            Default is False
         keepna: bool
             Keep NaN rows returned by Yahoo?
             Default is False
@@ -139,11 +134,6 @@ def history(self, period="1mo", interval="1d",
             exceptions instead of printing to console.
         """
 
-        # Handle deprecated arguments
-        if repair is not None:
-            print("WARNING: 'repair' is deprecated and will be removed in a future version. Use 'repair_prices' instead")
-            repair_prices = repair
-
         if start or period is None or period.lower() == "max":
             # Check can get TZ. Fail => probably delisted
             tz = self._get_ticker_tz(debug, proxy, timeout)
@@ -302,9 +292,6 @@ def history(self, period="1mo", interval="1d",
             quotes = utils.fix_Yahoo_dst_issue(quotes, params["interval"])
             quotes = utils.fix_Yahoo_returning_live_separate(quotes, params["interval"], tz_exchange)
 
-            if repair_intervals:
-                quotes = utils.fix_Yahoo_including_unaligned_intervals(quotes, params["interval"])
-
             # actions
             dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0])
             if not expect_capital_gains:
@@ -368,8 +355,7 @@ def history(self, period="1mo", interval="1d",
         else:
             df["Capital Gains"] = 0.0
 
-
-        if repair_prices:
+        if repair:
             # Do this before auto/back adjust
             df = self._fix_zeroes(df, interval, tz_exchange, prepost)
             df = self._fix_unit_mixups(df, interval, tz_exchange, prepost)
@@ -553,7 +539,7 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):
 
             fetch_start = g[0]
             fetch_end = g[-1] + td_range
-            df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair_prices=False, keepna=True)
+            df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair=False, keepna=True)
             if df_fine is None or df_fine.empty:
                 print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval")
                 continue
diff --git a/yfinance/utils.py b/yfinance/utils.py
index 4b4ab4f58..1de147a1a 100644
--- a/yfinance/utils.py
+++ b/yfinance/utils.py
@@ -472,64 +472,6 @@ def fix_Yahoo_returning_live_separate(quotes, interval, tz_exchange):
     return quotes
 
 
-def fix_Yahoo_including_unaligned_intervals(quotes, interval):
-    if interval[1] not in ['m', 'h']:
-        # Only correct intraday
-        return quotes
-
-    # Merge adjacent rows if in same interval
-    # e.g. 13:02pm with 13:00pm 1h interval
-    n = quotes.shape[0]
-    itd = _interval_to_timedelta(interval)
-    td0 = _pd.Timedelta(0)
-    iend = quotes.index + itd
-    if interval[1:] in ["d", "wk", "mo"]:
-        # # Allow for DST
-        # iend -= _pd.Timedelta('2h')
-        return quotes
-    steps = _np.full(n, td0)
-    steps[1:] = quotes.index[1:] - iend[0:n-1]
-    f_overlap = steps < td0
-    if f_overlap.any():
-        # Process overlaps one-at-time because some may be false positives.
-        # Recalculate subsequent step after removing an overlap. 
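Before the helper disappears below, its overlap test is worth noting; a standalone sketch of the same step arithmetic (timestamps are hypothetical):

    import numpy as np
    import pandas as pd

    idx = pd.DatetimeIndex(["2022-12-23 11:00", "2022-12-23 12:00",
                            "2022-12-23 13:00", "2022-12-23 13:02"])
    itd = pd.Timedelta("1h")
    steps = np.full(len(idx), pd.Timedelta(0))
    steps[1:] = idx[1:] - (idx[:-1] + itd)   # gap to the previous interval's end
    f_overlap = steps < pd.Timedelta(0)      # True only for 13:02, inside the 13:00 bar
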
- overlaps_exist = True - n_merged = 0 - dts_to_drop = [] - dts_merged = [] - while overlaps_exist: - indices = _np.where(f_overlap)[0] - i = indices[0] - dt1 = quotes.index[i-1] - dt2 = quotes.index[i] - dt3 = quotes.index[i+1] - - dropped_dt = dt2 - quotes.loc[dt1] = merge_two_prices_intervals(quotes.iloc[i-1], quotes.iloc[i]) - dts_merged.append((dt2, dt1)) - - # Remove record of i: - dts_to_drop.append(dt2) - f_overlap[i] = False - steps[i] = td0 - # Recalc step of following dt: - steps[i+1] = quotes.index[i+1] - iend[i-1] - - f_overlap[i+1] = steps[i+1] < td0 - overlaps_exist = f_overlap[i+1:].any() - - # Useful debug code: - # for d in [str(dt.date()) for dt in dts_to_drop]: - # print(quotes.loc[d]) - # - # print("Dropped unaligned intervals:", dts_to_drop) - print(f"Removed {len(dts_merged)} unaligned intervals by merging:") - for i in range(len(dts_merged)): - print(f"- {dts_merged[i][0].date()}: {dts_merged[i][0].time()} -> {dts_merged[i][1].time()}") - quotes = quotes.drop(dts_to_drop) - return quotes - - def merge_two_prices_intervals(i1, i2): TypeCheckSeries(i1, "i1") TypeCheckSeries(i2, "i2") From eb6d830e2a9e3badad3a26aac084a6b933cfb185 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sat, 21 Jan 2023 23:00:30 +0000 Subject: [PATCH 07/10] Fix repair volume=0 ; Tidy code --- README.md | 3 +-- yfinance/base.py | 27 ++++++++++++++++----------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 27c10996a..1e5d9a4ca 100644 --- a/README.md +++ b/README.md @@ -226,8 +226,7 @@ data = yf.download( # or pdr.get_data_yahoo(... auto_adjust = True, # attempt repair of Yahoo data issues - repair_prices = False, - repair_intervals = False, + repair = False, # download pre/post regular market hours data # (optional, default is False) diff --git a/yfinance/base.py b/yfinance/base.py index 5ab863377..086fbf33a 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -86,8 +86,7 @@ def stats(self, proxy=None): def history(self, period="1mo", interval="1d", start=None, end=None, prepost=False, actions=True, - auto_adjust=True, back_adjust=False, repair=None, - keepna=False, + auto_adjust=True, back_adjust=False, repair=False, keepna=False, proxy=None, rounding=False, timeout=10, debug=True, raise_errors=False) -> pd.DataFrame: """ @@ -794,26 +793,31 @@ def _fix_zeroes(self, df, interval, tz_exchange, prepost): df2.index = df2.index.tz_convert(tz_exchange) price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df2.columns] - f_zero_or_nan = (df2[price_cols] == 0.0) | df2[price_cols].isna() + f_prices_bad = (df2[price_cols] == 0.0) | df2[price_cols].isna() df2_reserve = None if intraday: # Ignore days with >50% intervals containing NaNs - df_nans = pd.DataFrame(f_zero_or_nan.any(axis=1), columns=["nan"]) + df_nans = pd.DataFrame(f_prices_bad.any(axis=1), columns=["nan"]) df_nans["_date"] = df_nans.index.date grp = df_nans.groupby("_date") nan_pct = grp.sum() / grp.count() dts = nan_pct.index[nan_pct["nan"]>0.5] - f_zero_or_nan_ignore = _np.isin(f_zero_or_nan.index.date, dts) + f_zero_or_nan_ignore = _np.isin(f_prices_bad.index.date, dts) df2_reserve = df2[f_zero_or_nan_ignore] df2 = df2[~f_zero_or_nan_ignore] - f_zero_or_nan = (df2[price_cols] == 0.0) | df2[price_cols].isna() + f_prices_bad = (df2[price_cols] == 0.0) | df2[price_cols].isna() + + f_high_low_good = (~df2["High"].isna()) & (~df2["Low"].isna()) + f_vol_bad = (df2["Volume"]==0).to_numpy() & f_high_low_good & (df2["High"]!=df2["Low"]).to_numpy() + # Check whether worth 
attempting repair - f_zero_or_nan = f_zero_or_nan.to_numpy() - if f_zero_or_nan.any(axis=1).sum() == 0: + f_prices_bad = f_prices_bad.to_numpy() + f_bad_rows = f_prices_bad.any(axis=1) | f_vol_bad + if not f_bad_rows.any(): if debug: print("no bad data to repair") return df - if f_zero_or_nan.sum() == len(price_cols)*len(df2): + if f_prices_bad.sum() == len(price_cols)*len(df2): # Need some good data to calibrate if debug: print("no good data to calibrate") @@ -825,10 +829,11 @@ def _fix_zeroes(self, df, interval, tz_exchange, prepost): tag = -1.0 for i in range(len(price_cols)): c = price_cols[i] - df2.loc[f_zero_or_nan[:,i], c] = tag + df2.loc[f_prices_bad[:,i], c] = tag + df2.loc[f_vol_bad, "Volume"] = tag # If volume=0 or NaN for bad prices, then tag volume for repair f_vol_zero_or_nan = (df2["Volume"].to_numpy()==0) | (df2["Volume"].isna().to_numpy()) - df2.loc[f_zero_or_nan.any(axis=1) & f_vol_zero_or_nan, "Volume"] = tag + df2.loc[f_prices_bad.any(axis=1) & f_vol_zero_or_nan, "Volume"] = tag # If volume=0 or NaN but price moved in interval, then tag volume for repair f_change = df2["High"].to_numpy() != df2["Low"].to_numpy() df2.loc[f_change & f_vol_zero_or_nan, "Volume"] = tag From 39c1ecc7a29717c6b0bf43e0705c14d68ddcbdf3 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Wed, 25 Jan 2023 14:37:43 +0000 Subject: [PATCH 08/10] Improve price repair - reduce spam, improve data reliability Extend 'reconstruct groups' to reduce Yahoo spam ; Extend fetch range to avoid first/last day irregularities ; Improve handling of 'max fetch days' Yahoo limit --- yfinance/base.py | 83 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 086fbf33a..4795b8ff1 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -23,6 +23,7 @@ import time as _time import datetime as _datetime +import dateutil as _dateutil from typing import Optional import pandas as _pd @@ -416,6 +417,9 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): if interval[1:] in ['d', 'wk', 'mo']: # Interday data always includes pre & post prepost = True + intraday = False + else: + intraday = True price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df] data_cols = price_cols + ["Volume"] @@ -447,8 +451,14 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): # Ignore old intervals for which Yahoo won't return finer data: m = min_lookbacks[sub_interval] - if m is not None: - f_recent = _datetime.date.today() - df.index.date < m + if m is None: + min_dt = None + else: + min_dt = _pd.Timestamp.utcnow() - m + if debug: + print(f"- min_dt={min_dt} interval={interval} sub_interval={sub_interval}") + if min_dt is not None: + f_recent = df.index >= min_dt f_repair_rows = f_repair_rows & f_recent if not f_repair_rows.any(): # print("data too old to repair") @@ -470,39 +480,50 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): last_dt = dts_to_repair[0] last_ind = indices_to_repair[0] td = utils._interval_to_timedelta(interval) - if interval == "1mo": - grp_td_threshold = _datetime.timedelta(days=28) - elif interval == "1wk": - grp_td_threshold = _datetime.timedelta(days=28) - elif interval == "1d": - grp_td_threshold = _datetime.timedelta(days=14) - elif interval == "1h": - grp_td_threshold = _datetime.timedelta(days=7) + # Note on setting max size: have to allow space for adding good data + if sub_interval == "1mo": + grp_max_size = 
_dateutil.relativedelta.relativedelta(years=2) + elif sub_interval == "1wk": + grp_max_size = _dateutil.relativedelta.relativedelta(years=2) + elif sub_interval == "1d": + grp_max_size = _dateutil.relativedelta.relativedelta(years=2) + elif sub_interval == "1h": + grp_max_size = _dateutil.relativedelta.relativedelta(years=1) + elif sub_interval == "1m": + grp_max_size = _datetime.timedelta(days=5) # allow 2 days for buffer below else: - grp_td_threshold = _datetime.timedelta(days=2) + grp_max_size = _datetime.timedelta(days=30) + if debug: + print("- grp_max_size =", grp_max_size) for i in range(1, len(dts_to_repair)): ind = indices_to_repair[i] dt = dts_to_repair[i] - if (dt-dts_groups[-1][-1]) < grp_td_threshold: - dts_groups[-1].append(dt) - elif ind - last_ind <= 3: + if dt.date() < dts_groups[-1][0].date()+grp_max_size: dts_groups[-1].append(dt) else: dts_groups.append([dt]) last_dt = dt last_ind = ind + if debug: + print("Repair groups:") + for g in dts_groups: + print(f"- {g[0]} -> {g[-1]}") + # Add some good data to each group, so can calibrate later: for i in range(len(dts_groups)): g = dts_groups[i] g0 = g[0] i0 = df_good.index.get_indexer([g0], method="nearest")[0] if i0 > 0: - i0 -= 1 + if (min_dt is None or df_good.index[i0-1] >= min_dt) and \ + ((not intraday) or df_good.index[i0-1].date()==g0.date()): + i0 -= 1 gl = g[-1] il = df_good.index.get_indexer([gl], method="nearest")[0] if il < len(df_good)-1: - il += 1 + if (not intraday) or df_good.index[il+1].date()==gl.date(): + il += 1 good_dts = df_good.index[i0:il+1] dts_groups[i] += good_dts.to_list() dts_groups[i].sort() @@ -538,7 +559,13 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): fetch_start = g[0] fetch_end = g[-1] + td_range - df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair=False, keepna=True) + # The first and last day returned by Yahoo can be slightly wrong, so add buffer: + fetch_start -= td_1d + fetch_end += td_1d + if intraday: + df_fine = self.history(start=fetch_start.date(), end=fetch_end.date()+td_1d, interval=sub_interval, auto_adjust=False, actions=False, prepost=prepost, repair=False, keepna=True) + else: + df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, actions=False, prepost=prepost, repair=False, keepna=True) if df_fine is None or df_fine.empty: print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval") continue @@ -602,7 +629,7 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): ratios = df_block_calib[calib_filter] / df_new_calib[calib_filter] ratio = _np.mean(ratios) if debug: - print(f"- price calibration ratio = {ratio}") + print(f"- price calibration ratio (raw) = {ratio}") ratio_rcp = round(1.0 / ratio, 1) ratio = round(ratio, 1) if ratio == 1 and ratio_rcp == 1: @@ -623,12 +650,20 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): # Repair! bad_dts = df_block.index[(df_block[price_cols+["Volume"]]==tag).any(axis=1)] + if debug: + no_fine_data_dts = [] + for idx in bad_dts: + if not idx in df_new.index: + # Yahoo didn't return finer-grain data for this interval, + # so probably no trading happened. 
+ no_fine_data_dts.append(idx) + if len(no_fine_data_dts) > 0: + print(f"Yahoo didn't return finer-grain data for these intervals:") + print(no_fine_data_dts) for idx in bad_dts: if not idx in df_new.index: # Yahoo didn't return finer-grain data for this interval, # so probably no trading happened. - if debug: - print(f"Yahoo didn't return finer-grain data for interval {idx}") continue df_new_row = df_new.loc[idx] @@ -839,11 +874,17 @@ def _fix_zeroes(self, df, interval, tz_exchange, prepost): df2.loc[f_change & f_vol_zero_or_nan, "Volume"] = tag n_before = (df2[data_cols].to_numpy()==tag).sum() + dts_tagged = df2.index[(df2[data_cols].to_numpy()==tag).any(axis=1)] df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag=tag) n_after = (df2[data_cols].to_numpy()==tag).sum() + dts_not_repaired = df2.index[(df2[data_cols].to_numpy()==tag).any(axis=1)] n_fixed = n_before - n_after if n_fixed > 0: - print(f"{self.ticker}: fixed {n_fixed}/{n_before} value=0 errors in {interval} price data") + msg = f"{self.ticker}: fixed {n_fixed}/{n_before} value=0 errors in {interval} price data" + if n_fixed < 4: + dts_repaired = sorted(list(set(dts_tagged).difference(dts_not_repaired))) + msg += f": {dts_repaired}" + print(msg) if df2_reserve is not None: df2 = _pd.concat([df2, df2_reserve]) From aad46baf286c9556301fb0644273281decc33dfa Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sat, 28 Jan 2023 23:14:28 +0000 Subject: [PATCH 09/10] price repair: Fix 'min_dt', add 'silent' mode --- yfinance/base.py | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/yfinance/base.py b/yfinance/base.py index 4795b8ff1..da6de912a 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -111,8 +111,9 @@ def history(self, period="1mo", interval="1d", Adjust all OHLC automatically? Default is True back_adjust: bool Back-adjusted data to mimic true historical prices - repair: bool - Detect currency unit 100x mixups and attempt repair + repair: bool or "silent" + Detect currency unit 100x mixups and attempt repair. + If True, fix & print summary. If "silent", just fix. Default is False keepna: bool Keep NaN rows returned by Yahoo? 
@@ -355,10 +356,10 @@ def history(self, period="1mo", interval="1d", else: df["Capital Gains"] = 0.0 - if repair: + if repair==True or repair=="silent": # Do this before auto/back adjust - df = self._fix_zeroes(df, interval, tz_exchange, prepost) - df = self._fix_unit_mixups(df, interval, tz_exchange, prepost) + df = self._fix_zeroes(df, interval, tz_exchange, prepost, silent=(repair=="silent")) + df = self._fix_unit_mixups(df, interval, tz_exchange, prepost, silent=(repair=="silent")) # Auto/back adjust try: @@ -402,7 +403,7 @@ def history(self, period="1mo", interval="1d", # ------------------------ - def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): + def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1, silent=False): if not isinstance(df, _pd.DataFrame): raise Exception("'df' must be a Pandas DataFrame not", type(df)) if interval == "1m": @@ -433,9 +434,6 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): for i in ["30m", "15m", "5m", "2m"]: min_lookbacks[i] = _datetime.timedelta(days=60) min_lookbacks["1m"] = _datetime.timedelta(days=30) - # Hopefully never have to use max_lengths, because complicates fetch logic - # max_lengths = {i:None for i in intervals} - # max_lengths["1m"] = _datetime.timedelta(days=7) if interval in nexts: sub_interval = nexts[interval] td_range = itds[interval] @@ -454,7 +452,9 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): if m is None: min_dt = None else: + m -= _datetime.timedelta(days=1) # allow space for 1-day padding min_dt = _pd.Timestamp.utcnow() - m + min_dt = min_dt.tz_convert(df.index.tz).ceil("D") if debug: print(f"- min_dt={min_dt} interval={interval} sub_interval={sub_interval}") if min_dt is not None: @@ -563,11 +563,15 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): fetch_start -= td_1d fetch_end += td_1d if intraday: - df_fine = self.history(start=fetch_start.date(), end=fetch_end.date()+td_1d, interval=sub_interval, auto_adjust=False, actions=False, prepost=prepost, repair=False, keepna=True) - else: - df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, actions=False, prepost=prepost, repair=False, keepna=True) + fetch_start = fetch_start.date() + fetch_end = fetch_end.date()+td_1d + if debug: + print(f"- fetching {sub_interval} prepost={prepost} {fetch_start}->{fetch_end}") + r = "silent" if silent else True + df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, actions=False, prepost=prepost, repair=r, keepna=True) if df_fine is None or df_fine.empty: - print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval") + if not silent: + print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval") continue df_fine["ctr"] = 0 @@ -697,7 +701,7 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1): return df_v2 - def _fix_unit_mixups(self, df, interval, tz_exchange, prepost): + def _fix_unit_mixups(self, df, interval, tz_exchange, prepost, silent=False): # Sometimes Yahoo returns few prices in cents/pence instead of $/£ # I.e. 
100x bigger # Easy to detect and fix, just look for outliers = ~100x local median @@ -744,7 +748,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange, prepost): df2.loc[fi, c] = tag n_before = (df2[data_cols].to_numpy()==tag).sum() - df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag=tag) + df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag, silent) n_after = (df2[data_cols].to_numpy()==tag).sum() if n_after > 0: @@ -786,7 +790,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange, prepost): n_fixed = n_before - n_after_crude n_fixed_crudely = n_after - n_after_crude - if n_fixed > 0: + if not silent and n_fixed > 0: report_msg = f"{self.ticker}: fixed {n_fixed}/{n_before} currency unit mixups " if n_fixed_crudely > 0: report_msg += f"({n_fixed_crudely} crudely) " @@ -806,7 +810,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange, prepost): return df2 - def _fix_zeroes(self, df, interval, tz_exchange, prepost): + def _fix_zeroes(self, df, interval, tz_exchange, prepost, silent=False): # Sometimes Yahoo returns prices=0 or NaN when trades occurred. # But most times when prices=0 or NaN returned is because no trades. # Impossible to distinguish, so only attempt repair if few or rare. @@ -875,11 +879,11 @@ def _fix_zeroes(self, df, interval, tz_exchange, prepost): n_before = (df2[data_cols].to_numpy()==tag).sum() dts_tagged = df2.index[(df2[data_cols].to_numpy()==tag).any(axis=1)] - df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag=tag) + df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag, silent) n_after = (df2[data_cols].to_numpy()==tag).sum() dts_not_repaired = df2.index[(df2[data_cols].to_numpy()==tag).any(axis=1)] n_fixed = n_before - n_after - if n_fixed > 0: + if not silent and n_fixed > 0: msg = f"{self.ticker}: fixed {n_fixed}/{n_before} value=0 errors in {interval} price data" if n_fixed < 4: dts_repaired = sorted(list(set(dts_tagged).difference(dts_not_repaired))) From a4f11b0243c393bfe578b61e0a428508dd00c277 Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Sun, 29 Jan 2023 12:45:30 +0000 Subject: [PATCH 10/10] Fix price repair tests, remove unrelated changes --- tests/prices.py | 67 ++++++++++++++++++++++++++++-------------- yfinance/base.py | 5 +++- yfinance/utils.py | 74 ++++++++++------------------------------------- 3 files changed, 65 insertions(+), 81 deletions(-) diff --git a/tests/prices.py b/tests/prices.py index 52ac94f33..ccbd425ac 100644 --- a/tests/prices.py +++ b/tests/prices.py @@ -270,6 +270,38 @@ def test_weekly_2rows_fix(self): df = dat.history(start=start, interval="1wk") self.assertTrue((df.index.weekday == 0).all()) +class TestPriceRepair(unittest.TestCase): + session = None + + @classmethod + def setUpClass(cls): + cls.session = requests_cache.CachedSession(backend='memory') + + @classmethod + def tearDownClass(cls): + if cls.session is not None: + cls.session.close() + + def test_reconstruct_2m(self): + # 2m repair requires 1m data. + # Yahoo restricts 1m fetches to 7 days max within last 30 days. + # Need to test that '_reconstruct_intervals_batch()' can handle this. 
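The 1m limits this test exercises can be made concrete; a sketch under the limits stated in the comment above (`yahoo_1m_windows` is our name, not part of the patch):

    import datetime as dt

    def yahoo_1m_windows(start, end, max_span=dt.timedelta(days=7)):
        # Split [start, end) into spans of at most 7 days, the most Yahoo
        # serves per 1m request; 'end' must lie within the last 30 days.
        windows = []
        while start < end:
            stop = min(start + max_span, end)
            windows.append((start, stop))
            start = stop
        return windows
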
+ + tkrs = ["BHP.AX", "IMP.JO", "BP.L", "PNL.L", "INTC"] + + dt_now = _pd.Timestamp.utcnow() + td_7d = _dt.timedelta(days=7) + td_60d = _dt.timedelta(days=60) + + # Round time for 'requests_cache' reuse + dt_now = dt_now.ceil("1h") + + for tkr in tkrs: + dat = yf.Ticker(tkr, session=self.session) + end_dt = dt_now + start_dt = end_dt - td_60d + df = dat.history(start=start_dt, end=end_dt, interval="2m", repair=True) + def test_repair_100x_weekly(self): # Setup: tkr = "PNL.L" @@ -452,38 +484,29 @@ def test_repair_zeroes_hourly(self): dat = yf.Ticker(tkr, session=self.session) tz_exchange = dat.info["exchangeTimezoneName"] - df_bad = _pd.DataFrame(data={"Open": [29.68, 29.49, 29.545, _np.nan, 29.485], - "High": [29.68, 29.625, 29.58, _np.nan, 29.49], - "Low": [29.46, 29.4, 29.45, _np.nan, 29.31], - "Close": [29.485, 29.545, 29.485, _np.nan, 29.325], - "Adj Close": [29.485, 29.545, 29.485, _np.nan, 29.325], - "Volume": [3258528, 2140195, 1621010, 0, 0]}, - index=_pd.to_datetime([_dt.datetime(2022,11,25, 9,30), - _dt.datetime(2022,11,25, 10,30), - _dt.datetime(2022,11,25, 11,30), - _dt.datetime(2022,11,25, 12,30), - _dt.datetime(2022,11,25, 13,00)])) - df_bad = df_bad.sort_index() - df_bad.index.name = "Date" - df_bad.index = df_bad.index.tz_localize(tz_exchange) + correct_df = dat.history(period="1wk", interval="1h", auto_adjust=False, repair=True) + + df_bad = correct_df.copy() + bad_idx = correct_df.index[10] + df_bad.loc[bad_idx, "Open"] = _np.nan + df_bad.loc[bad_idx, "High"] = _np.nan + df_bad.loc[bad_idx, "Low"] = _np.nan + df_bad.loc[bad_idx, "Close"] = _np.nan + df_bad.loc[bad_idx, "Adj Close"] = _np.nan + df_bad.loc[bad_idx, "Volume"] = 0 repaired_df = dat._fix_zeroes(df_bad, "1h", tz_exchange, prepost=False) - correct_df = df_bad.copy() - idx = _pd.Timestamp(2022,11,25, 12,30).tz_localize(tz_exchange) - correct_df.loc[idx, "Open"] = 29.485001 - correct_df.loc[idx, "High"] = 29.49 - correct_df.loc[idx, "Low"] = 29.43 - correct_df.loc[idx, "Close"] = 29.455 - correct_df.loc[idx, "Adj Close"] = 29.455 - correct_df.loc[idx, "Volume"] = 609164 for c in ["Open", "Low", "High", "Close"]: try: self.assertTrue(_np.isclose(repaired_df[c], correct_df[c], rtol=1e-7).all()) except: print("COLUMN", c) + print("- repaired_df") print(repaired_df) + print("- correct_df[c]:") print(correct_df[c]) + print("- diff:") print(repaired_df[c] - correct_df[c]) raise diff --git a/yfinance/base.py b/yfinance/base.py index db403f67a..afcc0faa7 100644 --- a/yfinance/base.py +++ b/yfinance/base.py @@ -852,13 +852,16 @@ def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1, silent=Fal f_recent = df.index >= min_dt f_repair_rows = f_repair_rows & f_recent if not f_repair_rows.any(): - # print("data too old to repair") + if debug: + print("data too old to repair") return df dts_to_repair = df.index[f_repair_rows] indices_to_repair = _np.where(f_repair_rows)[0] if len(dts_to_repair) == 0: + if debug: + print("dts_to_repair[] is empty") return df df_v2 = df.copy() diff --git a/yfinance/utils.py b/yfinance/utils.py index 7c99bdedd..927609d85 100644 --- a/yfinance/utils.py +++ b/yfinance/utils.py @@ -49,11 +49,6 @@ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} -def TypeCheckSeries(var, varName): - if not isinstance(var, _pd.Series) or isinstance(var, _pd.DataFrame): - raise TypeError(f"'{varName}' must be _pd.Series not {type(var)}") - - # From https://stackoverflow.com/a/59128615 from types import FunctionType 
from inspect import getmembers @@ -485,63 +480,26 @@ def fix_Yahoo_returning_live_separate(quotes, interval, tz_exchange): if last_rows_same_interval: # Last two rows are within same interval - ia = quotes.index[n - 2] - ib = quotes.index[n - 1] - quotes.loc[ia] = merge_two_prices_intervals(quotes.loc[ia], quotes.loc[ib]) - quotes = quotes.drop(ib) + idx1 = quotes.index[n - 1] + idx2 = quotes.index[n - 2] + if _np.isnan(quotes.loc[idx2, "Open"]): + quotes.loc[idx2, "Open"] = quotes["Open"][n - 1] + # Note: nanmax() & nanmin() ignores NaNs + quotes.loc[idx2, "High"] = _np.nanmax([quotes["High"][n - 1], quotes["High"][n - 2]]) + quotes.loc[idx2, "Low"] = _np.nanmin([quotes["Low"][n - 1], quotes["Low"][n - 2]]) + quotes.loc[idx2, "Close"] = quotes["Close"][n - 1] + if "Adj High" in quotes.columns: + quotes.loc[idx2, "Adj High"] = _np.nanmax([quotes["Adj High"][n - 1], quotes["Adj High"][n - 2]]) + if "Adj Low" in quotes.columns: + quotes.loc[idx2, "Adj Low"] = _np.nanmin([quotes["Adj Low"][n - 1], quotes["Adj Low"][n - 2]]) + if "Adj Close" in quotes.columns: + quotes.loc[idx2, "Adj Close"] = quotes["Adj Close"][n - 1] + quotes.loc[idx2, "Volume"] += quotes["Volume"][n - 1] + quotes = quotes.drop(quotes.index[n - 1]) return quotes -def merge_two_prices_intervals(i1, i2): - TypeCheckSeries(i1, "i1") - TypeCheckSeries(i2, "i2") - - price_cols = ["Open", "High", "Low", "Close"] - na1 = i1[price_cols].isna().all() - na2 = i2[price_cols].isna().all() - if na1 and na2: - return i1 - elif na1: - return i2 - elif na2: - return i1 - - # First check if two intervals are almost identical. If yes, keep 2nd - ratio = _np.mean(i2[price_cols+["Volume"]] / i1[price_cols+["Volume"]]) - if ratio > 0.99 and ratio < 1.01: - return i2 - - m = i1.copy() - - if _np.isnan(m["Open"]): - m["Open"] = i2["Open"] - if "Adj Open" in m.index: - m["Adj Open"] = i2["Adj Open"] - - # Note: nanmax() & nanmin() ignores NaNs - m["High"] = _np.nanmax([i2["High"], i1["High"]]) - m["Low"] = _np.nanmin([i2["Low"], i1["Low"]]) - if not _np.isnan(i2["Close"]): - m["Close"] = i2["Close"] - - if "Adj High" in m.index: - m["Adj High"] = _np.nanmax([i2["Adj High"], i1["Adj High"]]) - if "Adj Low" in m.index: - m["Adj Low"] = _np.nanmin([i2["Adj Low"], i1["Adj Low"]]) - if "Adj Close" in m.index: - m["Adj Close"] = i2["Adj Close"] - - if _np.isnan(m["Volume"]): - m["Volume"] = i2["Volume"] - elif _np.isnan(i2["Volume"]): - pass - else: - m["Volume"] += i2["Volume"] - - return m - - def safe_merge_dfs(df_main, df_sub, interval): # Carefully merge 'df_sub' onto 'df_main' # If naive merge fails, try again with reindexing df_sub: