diff --git a/README.md b/README.md
index 0ab3a3876..985245e31 100644
--- a/README.md
+++ b/README.md
@@ -216,7 +216,7 @@ data = yf.download( # or pdr.get_data_yahoo(...
         # (optional, default is False)
         auto_adjust = True,
 
-        # attempt repair of missing data or currency mixups e.g. $/cents
+        # attempt repair of Yahoo data issues
        repair = False,
 
         # download pre/post regular market hours data
diff --git a/tests/prices.py b/tests/prices.py
index e0d722578..ccbd425ac 100644
--- a/tests/prices.py
+++ b/tests/prices.py
@@ -270,6 +270,38 @@ def test_weekly_2rows_fix(self):
         df = dat.history(start=start, interval="1wk")
         self.assertTrue((df.index.weekday == 0).all())
 
+class TestPriceRepair(unittest.TestCase):
+    session = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.session = requests_cache.CachedSession(backend='memory')
+
+    @classmethod
+    def tearDownClass(cls):
+        if cls.session is not None:
+            cls.session.close()
+
+    def test_reconstruct_2m(self):
+        # 2m repair requires 1m data.
+        # Yahoo restricts 1m fetches to 7 days max within last 30 days.
+        # Need to test that '_reconstruct_intervals_batch()' can handle this.
+
+        tkrs = ["BHP.AX", "IMP.JO", "BP.L", "PNL.L", "INTC"]
+
+        dt_now = _pd.Timestamp.utcnow()
+        td_7d = _dt.timedelta(days=7)
+        td_60d = _dt.timedelta(days=60)
+
+        # Round time for 'requests_cache' reuse
+        dt_now = dt_now.ceil("1h")
+
+        for tkr in tkrs:
+            dat = yf.Ticker(tkr, session=self.session)
+            end_dt = dt_now
+            start_dt = end_dt - td_60d
+            df = dat.history(start=start_dt, end=end_dt, interval="2m", repair=True)
+
     def test_repair_100x_weekly(self):
         # Setup:
         tkr = "PNL.L"
@@ -298,7 +330,7 @@ def test_repair_100x_weekly(self):
 
         # Run test
-        df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange)
+        df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange, prepost=False)
 
         # First test - no errors left
         for c in data_cols:
@@ -353,7 +385,7 @@ def test_repair_100x_weekly_preSplit(self):
         df.index = df.index.tz_localize(tz_exchange)
         df_bad.index = df_bad.index.tz_localize(tz_exchange)
 
-        df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange)
+        df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange, prepost=False)
 
         # First test - no errors left
         for c in data_cols:
@@ -403,7 +435,7 @@ def test_repair_100x_daily(self):
         df.index = df.index.tz_localize(tz_exchange)
         df_bad.index = df_bad.index.tz_localize(tz_exchange)
 
-        df_repaired = dat._fix_unit_mixups(df_bad, "1d", tz_exchange)
+        df_repaired = dat._fix_unit_mixups(df_bad, "1d", tz_exchange, prepost=False)
 
         # First test - no errors left
         for c in data_cols:
@@ -438,7 +470,7 @@ def test_repair_zeroes_daily(self):
         df_bad.index.name = "Date"
         df_bad.index = df_bad.index.tz_localize(tz_exchange)
 
-        repaired_df = dat._fix_zeroes(df_bad, "1d", tz_exchange)
+        repaired_df = dat._fix_zeroes(df_bad, "1d", tz_exchange, prepost=False)
 
         correct_df = df_bad.copy()
         correct_df.loc["2022-11-01", "Open"] = 102.080002
@@ -452,38 +484,29 @@ def test_repair_zeroes_hourly(self):
         dat = yf.Ticker(tkr, session=self.session)
         tz_exchange = dat.info["exchangeTimezoneName"]
 
-        df_bad = _pd.DataFrame(data={"Open": [29.68, 29.49, 29.545, _np.nan, 29.485],
-                                     "High": [29.68, 29.625, 29.58, _np.nan, 29.49],
-                                     "Low": [29.46, 29.4, 29.45, _np.nan, 29.31],
-                                     "Close": [29.485, 29.545, 29.485, _np.nan, 29.325],
-                                     "Adj Close": [29.485, 29.545, 29.485, _np.nan, 29.325],
-                                     "Volume": [3258528, 2140195, 1621010, 0, 0]},
-                               index=_pd.to_datetime([_dt.datetime(2022,11,25, 9,30),
-                                                      _dt.datetime(2022,11,25, 10,30),
-                                                      _dt.datetime(2022,11,25, 11,30),
-                                                      _dt.datetime(2022,11,25, 12,30),
-                                                      _dt.datetime(2022,11,25, 13,00)]))
-        df_bad = df_bad.sort_index()
-        df_bad.index.name = "Date"
-        df_bad.index = df_bad.index.tz_localize(tz_exchange)
+        correct_df = dat.history(period="1wk", interval="1h", auto_adjust=False, repair=True)
 
-        repaired_df = dat._fix_zeroes(df_bad, "1h", tz_exchange)
+        df_bad = correct_df.copy()
+        bad_idx = correct_df.index[10]
+        df_bad.loc[bad_idx, "Open"] = _np.nan
+        df_bad.loc[bad_idx, "High"] = _np.nan
+        df_bad.loc[bad_idx, "Low"] = _np.nan
+        df_bad.loc[bad_idx, "Close"] = _np.nan
+        df_bad.loc[bad_idx, "Adj Close"] = _np.nan
+        df_bad.loc[bad_idx, "Volume"] = 0
+
+        repaired_df = dat._fix_zeroes(df_bad, "1h", tz_exchange, prepost=False)
 
-        correct_df = df_bad.copy()
-        idx = _pd.Timestamp(2022,11,25, 12,30).tz_localize(tz_exchange)
-        correct_df.loc[idx, "Open"] = 29.485001
-        correct_df.loc[idx, "High"] = 29.49
-        correct_df.loc[idx, "Low"] = 29.43
-        correct_df.loc[idx, "Close"] = 29.455
-        correct_df.loc[idx, "Adj Close"] = 29.455
-        correct_df.loc[idx, "Volume"] = 609164
         for c in ["Open", "Low", "High", "Close"]:
             try:
                 self.assertTrue(_np.isclose(repaired_df[c], correct_df[c], rtol=1e-7).all())
             except:
                 print("COLUMN", c)
+                print("- repaired_df")
                 print(repaired_df)
+                print("- correct_df[c]:")
                 print(correct_df[c])
+                print("- diff:")
                 print(repaired_df[c] - correct_df[c])
                 raise
diff --git a/yfinance/base.py b/yfinance/base.py
index a74c36f9d..afcc0faa7 100644
--- a/yfinance/base.py
+++ b/yfinance/base.py
@@ -23,6 +23,7 @@
 
 import time as _time
 import datetime as _datetime
+import dateutil as _dateutil
 from typing import Optional
 
 import pandas as _pd
@@ -501,8 +502,9 @@ def history(self, period="1mo", interval="1d",
                 Adjust all OHLC automatically? Default is True
             back_adjust: bool
                 Back-adjusted data to mimic true historical prices
-            repair: bool
-                Detect currency unit 100x mixups and attempt repair
+            repair: bool or "silent"
+                Detect currency unit 100x mixups and attempt repair.
+                If True, fix & print summary. If "silent", just fix.
                 Default is False
             keepna: bool
                 Keep NaN rows returned by Yahoo?
@@ -745,10 +747,10 @@ def history(self, period="1mo", interval="1d",
             else:
                 df["Capital Gains"] = 0.0
 
-        if repair:
+        if repair==True or repair=="silent":
             # Do this before auto/back adjust
-            df = self._fix_zeroes(df, interval, tz_exchange)
-            df = self._fix_unit_mixups(df, interval, tz_exchange)
+            df = self._fix_zeroes(df, interval, tz_exchange, prepost, silent=(repair=="silent"))
+            df = self._fix_unit_mixups(df, interval, tz_exchange, prepost, silent=(repair=="silent"))
 
         # Auto/back adjust
         try:
@@ -792,31 +794,40 @@ def history(self, period="1mo", interval="1d",
 
     # ------------------------
 
-    def _reconstruct_intervals_batch(self, df, interval, tag=-1):
+    def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1, silent=False):
         if not isinstance(df, _pd.DataFrame):
             raise Exception("'df' must be a Pandas DataFrame not", type(df))
+        if interval == "1m":
+            # Can't go smaller than 1m so can't reconstruct
+            return df
 
         # Reconstruct values in df using finer-grained price data. Delimiter marks what to reconstruct
 
+        debug = False
+        # debug = True
+
+        if interval[1:] in ['d', 'wk', 'mo']:
+            # Interday data always includes pre & post
+            prepost = True
+            intraday = False
+        else:
+            intraday = True
+
         price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df]
         data_cols = price_cols + ["Volume"]
 
         # If interval is weekly then can construct with daily. But if smaller intervals then
         # restricted to recent times:
-        # - daily = hourly restricted to last 730 days
-        sub_interval = None
-        td_range = None
-        if interval == "1wk":
-            # Correct by fetching week of daily data
-            sub_interval = "1d"
-            td_range = _datetime.timedelta(days=7)
-        elif interval == "1d":
-            # Correct by fetching day of hourly data
-            sub_interval = "1h"
-            td_range = _datetime.timedelta(days=1)
-        elif interval == "1h":
-            sub_interval = "30m"
-            td_range = _datetime.timedelta(hours=1)
+        intervals = ["1wk", "1d", "1h", "30m", "15m", "5m", "2m", "1m"]
+        itds = {i: utils._interval_to_timedelta(i) for i in intervals}
+        nexts = {intervals[i]: intervals[i+1] for i in range(len(intervals)-1)}
+        min_lookbacks = {"1wk": None, "1d": None, "1h": _datetime.timedelta(days=730)}
+        for i in ["30m", "15m", "5m", "2m"]:
+            min_lookbacks[i] = _datetime.timedelta(days=60)
+        min_lookbacks["1m"] = _datetime.timedelta(days=30)
+        if interval in nexts:
+            sub_interval = nexts[interval]
+            td_range = itds[interval]
         else:
             print("WARNING: Have not implemented repair for '{}' interval. Contact developers".format(interval))
             raise Exception("why here")
@@ -828,76 +839,107 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1):
         f_repair_rows = f_repair.any(axis=1)
 
         # Ignore old intervals for which Yahoo won't return finer data:
-        if sub_interval == "1h":
-            f_recent = _datetime.date.today() - df.index.date < _datetime.timedelta(days=730)
-            f_repair_rows = f_repair_rows & f_recent
-        elif sub_interval in ["30m", "15m"]:
-            f_recent = _datetime.date.today() - df.index.date < _datetime.timedelta(days=60)
+        m = min_lookbacks[sub_interval]
+        if m is None:
+            min_dt = None
+        else:
+            m -= _datetime.timedelta(days=1)  # allow space for 1-day padding
+            min_dt = _pd.Timestamp.utcnow() - m
+            min_dt = min_dt.tz_convert(df.index.tz).ceil("D")
+        if debug:
+            print(f"- min_dt={min_dt} interval={interval} sub_interval={sub_interval}")
+        if min_dt is not None:
+            f_recent = df.index >= min_dt
             f_repair_rows = f_repair_rows & f_recent
-        if not f_repair_rows.any():
-            print("data too old to fix")
-            return df
+            if not f_repair_rows.any():
+                if debug:
+                    print("data too old to repair")
+                return df
 
         dts_to_repair = df.index[f_repair_rows]
         indices_to_repair = _np.where(f_repair_rows)[0]
 
         if len(dts_to_repair) == 0:
+            if debug:
+                print("dts_to_repair[] is empty")
             return df
 
         df_v2 = df.copy()
-        df_noNa = df[~df[price_cols].isna().any(axis=1)]
+        f_good = ~(df[price_cols].isna().any(axis=1))
+        f_good = f_good & (df[price_cols].to_numpy()!=tag).all(axis=1)
+        df_good = df[f_good]
 
         # Group nearby NaN-intervals together to reduce number of Yahoo fetches
         dts_groups = [[dts_to_repair[0]]]
         last_dt = dts_to_repair[0]
         last_ind = indices_to_repair[0]
         td = utils._interval_to_timedelta(interval)
-        if interval == "1mo":
-            grp_td_threshold = _datetime.timedelta(days=28)
-        elif interval == "1wk":
-            grp_td_threshold = _datetime.timedelta(days=28)
-        elif interval == "1d":
-            grp_td_threshold = _datetime.timedelta(days=14)
-        elif interval == "1h":
-            grp_td_threshold = _datetime.timedelta(days=7)
+        # Note on setting max size: have to allow space for adding good data
+        if sub_interval == "1mo":
+            grp_max_size = _dateutil.relativedelta.relativedelta(years=2)
+        elif sub_interval == "1wk":
+            grp_max_size = _dateutil.relativedelta.relativedelta(years=2)
+        elif sub_interval == "1d":
+            grp_max_size = _dateutil.relativedelta.relativedelta(years=2)
+        elif sub_interval == "1h":
+            grp_max_size = _dateutil.relativedelta.relativedelta(years=1)
+        elif sub_interval == "1m":
+            grp_max_size = _datetime.timedelta(days=5)  # allow 2 days for buffer below
         else:
-            grp_td_threshold = _datetime.timedelta(days=2)
-            # grp_td_threshold = _datetime.timedelta(days=7)
+            grp_max_size = _datetime.timedelta(days=30)
+        if debug:
+            print("- grp_max_size =", grp_max_size)
 
         for i in range(1, len(dts_to_repair)):
             ind = indices_to_repair[i]
             dt = dts_to_repair[i]
-            if (dt-dts_groups[-1][-1]) < grp_td_threshold:
-                dts_groups[-1].append(dt)
-            elif ind - last_ind <= 3:
+            if dt.date() < dts_groups[-1][0].date()+grp_max_size:
                 dts_groups[-1].append(dt)
             else:
                 dts_groups.append([dt])
             last_dt = dt
             last_ind = ind
 
+        if debug:
+            print("Repair groups:")
+            for g in dts_groups:
+                print(f"- {g[0]} -> {g[-1]}")
+
         # Add some good data to each group, so can calibrate later:
         for i in range(len(dts_groups)):
             g = dts_groups[i]
             g0 = g[0]
-            i0 = df_noNa.index.get_loc(g0)
+            i0 = df_good.index.get_indexer([g0], method="nearest")[0]
             if i0 > 0:
-                dts_groups[i].insert(0, df_noNa.index[i0-1])
+                if (min_dt is None or df_good.index[i0-1] >= min_dt) and \
+                   ((not intraday) or df_good.index[i0-1].date()==g0.date()):
+                    i0 -= 1
             gl = g[-1]
-            il = df_noNa.index.get_loc(gl)
-            if il < len(df_noNa)-1:
-                dts_groups[i].append(df_noNa.index[il+1])
+            il = df_good.index.get_indexer([gl], method="nearest")[0]
+            if il < len(df_good)-1:
+                if (not intraday) or df_good.index[il+1].date()==gl.date():
+                    il += 1
+            good_dts = df_good.index[i0:il+1]
+            dts_groups[i] += good_dts.to_list()
+            dts_groups[i].sort()
 
         n_fixed = 0
         for g in dts_groups:
             df_block = df[df.index.isin(g)]
+            if debug:
+                print("- df_block:")
+                print(df_block)
 
             start_dt = g[0]
             start_d = start_dt.date()
             if sub_interval == "1h" and (_datetime.date.today() - start_d) > _datetime.timedelta(days=729):
                 # Don't bother requesting more price data, Yahoo will reject
+                if debug:
+                    print(f"- Don't bother requesting {sub_interval} price data, Yahoo will reject")
                 continue
             elif sub_interval in ["30m", "15m"] and (_datetime.date.today() - start_d) > _datetime.timedelta(days=59):
                 # Don't bother requesting more price data, Yahoo will reject
+                if debug:
+                    print(f"- Don't bother requesting {sub_interval} price data, Yahoo will reject")
                 continue
 
             td_1d = _datetime.timedelta(days=1)
@@ -911,15 +953,23 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1):
             fetch_start = g[0]
             fetch_end = g[-1] + td_range
 
-            prepost = interval == "1d"
-            df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, prepost=prepost, repair=False, keepna=True)
+            # The first and last day returned by Yahoo can be slightly wrong, so add buffer:
+            fetch_start -= td_1d
+            fetch_end += td_1d
+            if intraday:
+                fetch_start = fetch_start.date()
+                fetch_end = fetch_end.date()+td_1d
+            if debug:
+                print(f"- fetching {sub_interval} prepost={prepost} {fetch_start}->{fetch_end}")
+            r = "silent" if silent else True
+            df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, actions=False, prepost=prepost, repair=r, keepna=True)
             if df_fine is None or df_fine.empty:
-                print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval")
+                if not silent:
+                    print("YF: WARNING: Cannot reconstruct because Yahoo not returning data in interval")
                 continue
 
             df_fine["ctr"] = 0
             if interval == "1wk":
-                # df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-SUN").start_time
                 weekdays = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]
                 week_end_day = weekdays[(df_block.index[0].weekday()+7-1)%7]
                 df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-"+week_end_day).start_time
@@ -948,31 +998,36 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1):
             new_index = _np.append([df_fine.index[0]], df_fine.index[df_fine["intervalID"].diff()>0])
             df_new.index = new_index
+            if debug:
+                print("- df_new:")
+                print(df_new)
+
             # Calibrate! Check whether 'df_fine' has different split-adjustment.
             # If different, then adjust to match 'df'
-            df_block_calib = df_block[price_cols]
-            common_index = df_block_calib.index[df_block_calib.index.isin(df_new.index)]
+            common_index = _np.intersect1d(df_block.index, df_new.index)
             if len(common_index) == 0:
                 # Can't calibrate so don't attempt repair
+                if debug:
+                    print("Can't calibrate so don't attempt repair")
                 continue
-            df_new_calib = df_new[df_new.index.isin(common_index)][price_cols]
-            df_block_calib = df_block_calib[df_block_calib.index.isin(common_index)]
-            calib_filter = (df_block_calib != tag).to_numpy()
+            df_new_calib = df_new[df_new.index.isin(common_index)][price_cols].to_numpy()
+            df_block_calib = df_block[df_block.index.isin(common_index)][price_cols].to_numpy()
+            calib_filter = (df_block_calib != tag)
             if not calib_filter.any():
                 # Can't calibrate so don't attempt repair
+                if debug:
+                    print("Can't calibrate so don't attempt repair")
                 continue
 
-            # Avoid divide-by-zero warnings printing:
-            df_new_calib = df_new_calib.to_numpy()
-            df_block_calib = df_block_calib.to_numpy()
+            # Avoid divide-by-zero warnings:
             for j in range(len(price_cols)):
-                c = price_cols[j]
                 f = ~calib_filter[:,j]
                 if f.any():
                     df_block_calib[f,j] = 1
                     df_new_calib[f,j] = 1
-            ratios = (df_block_calib / df_new_calib)[calib_filter]
+            ratios = df_block_calib[calib_filter] / df_new_calib[calib_filter]
             ratio = _np.mean(ratios)
-            #
+            if debug:
+                print(f"- price calibration ratio (raw) = {ratio}")
             ratio_rcp = round(1.0 / ratio, 1)
             ratio = round(ratio, 1)
             if ratio == 1 and ratio_rcp == 1:
@@ -991,13 +1046,22 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1):
                 df_new["Volume"] *= ratio_rcp
 
             # Repair!
-            bad_dts = df_block.index[(df_block[price_cols]==tag).any(axis=1)]
+            bad_dts = df_block.index[(df_block[price_cols+["Volume"]]==tag).any(axis=1)]
+            if debug:
+                no_fine_data_dts = []
+                for idx in bad_dts:
+                    if not idx in df_new.index:
+                        # Yahoo didn't return finer-grain data for this interval,
+                        # so probably no trading happened.
+                        no_fine_data_dts.append(idx)
+                if len(no_fine_data_dts) > 0:
+                    print("Yahoo didn't return finer-grain data for these intervals:")
+                    print(no_fine_data_dts)
             for idx in bad_dts:
                 if not idx in df_new.index:
                     # Yahoo didn't return finer-grain data for this interval,
                     # so probably no trading happened.
-                    # print("no fine data")
                    continue
 
                 df_new_row = df_new.loc[idx]
@@ -1026,9 +1090,12 @@ def _reconstruct_intervals_batch(self, df, interval, tag=-1):
                     df_v2.loc[idx, "Volume"] = df_new_row["Volume"]
                 n_fixed += 1
 
+        if debug:
+            print("df_v2:") ; print(df_v2)
+
         return df_v2
 
-    def _fix_unit_mixups(self, df, interval, tz_exchange):
+    def _fix_unit_mixups(self, df, interval, tz_exchange, prepost, silent=False):
         # Sometimes Yahoo returns few prices in cents/pence instead of $/£
         # I.e. 100x bigger
         # Easy to detect and fix, just look for outliers = ~100x local median
@@ -1050,7 +1117,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange):
         # adding it to dependencies.
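+        # Clarifying note: each price is compared against a local median
+        # computed with scipy.ndimage below; values ~100x that median are
+        # treated as probable $/cents (or £/pence) mixups and tagged.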
         from scipy import ndimage as _ndimage
 
-        data_cols = ["High", "Open", "Low", "Close"]  # Order important, separate High from Low
+        data_cols = ["High", "Open", "Low", "Close", "Adj Close"]  # Order important, separate High from Low
         data_cols = [c for c in data_cols if c in df2.columns]
         f_zeroes = (df2[data_cols]==0).any(axis=1)
         if f_zeroes.any():
@@ -1075,7 +1142,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange):
             df2.loc[fi, c] = tag
 
         n_before = (df2[data_cols].to_numpy()==tag).sum()
-        df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag)
+        df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag, silent)
         n_after = (df2[data_cols].to_numpy()==tag).sum()
 
         if n_after > 0:
@@ -1098,6 +1165,11 @@ def _fix_unit_mixups(self, df, interval, tz_exchange):
                 if fi[j]:
                     df2.loc[idx, c] = df.loc[idx, c] * 0.01
                 #
+                c = "Adj Close"
+                j = data_cols.index(c)
+                if fi[j]:
+                    df2.loc[idx, c] = df.loc[idx, c] * 0.01
+                #
                 c = "High"
                 j = data_cols.index(c)
                 if fi[j]:
@@ -1112,7 +1184,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange):
 
         n_fixed = n_before - n_after_crude
         n_fixed_crudely = n_after - n_after_crude
-        if n_fixed > 0:
+        if not silent and n_fixed > 0:
             report_msg = f"{self.ticker}: fixed {n_fixed}/{n_before} currency unit mixups "
             if n_fixed_crudely > 0:
                 report_msg += f"({n_fixed_crudely} crudely) "
@@ -1132,7 +1204,7 @@ def _fix_unit_mixups(self, df, interval, tz_exchange):
 
         return df2
 
-    def _fix_zeroes(self, df, interval, tz_exchange):
+    def _fix_zeroes(self, df, interval, tz_exchange, prepost, silent=False):
         # Sometimes Yahoo returns prices=0 or NaN when trades occurred.
         # But most times when prices=0 or NaN returned is because no trades.
         # Impossible to distinguish, so only attempt repair if few or rare.
@@ -1140,6 +1212,12 @@ def _fix_zeroes(self, df, interval, tz_exchange):
 
         if df.shape[0] == 0:
             return df
 
+        debug = False
+        # debug = True
+
+        intraday = interval[-1] in ("m", 'h')
+
+        df = df.sort_index()  # important!
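+        # Clarifying note: ascending index order is assumed below, e.g. when
+        # grouping bad intervals by calendar day and when pulling neighbouring
+        # good rows into each repair group for calibration.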
         df2 = df.copy()
         if df2.index.tz is None:
@@ -1148,16 +1226,34 @@ def _fix_zeroes(self, df, interval, tz_exchange):
             df2.index = df2.index.tz_convert(tz_exchange)
 
         price_cols = [c for c in ["Open", "High", "Low", "Close", "Adj Close"] if c in df2.columns]
-        f_zero_or_nan = (df2[price_cols] == 0.0).values | df2[price_cols].isna().values
+        f_prices_bad = (df2[price_cols] == 0.0) | df2[price_cols].isna()
+        df2_reserve = None
+        if intraday:
+            # Ignore days with >50% intervals containing NaNs
+            df_nans = _pd.DataFrame(f_prices_bad.any(axis=1), columns=["nan"])
+            df_nans["_date"] = df_nans.index.date
+            grp = df_nans.groupby("_date")
+            nan_pct = grp.sum() / grp.count()
+            dts = nan_pct.index[nan_pct["nan"]>0.5]
+            f_zero_or_nan_ignore = _np.isin(f_prices_bad.index.date, dts)
+            df2_reserve = df2[f_zero_or_nan_ignore]
+            df2 = df2[~f_zero_or_nan_ignore]
+            f_prices_bad = (df2[price_cols] == 0.0) | df2[price_cols].isna()
+
+        f_high_low_good = (~df2["High"].isna()) & (~df2["Low"].isna())
+        f_vol_bad = (df2["Volume"]==0).to_numpy() & f_high_low_good & (df2["High"]!=df2["Low"]).to_numpy()
+
         # Check whether worth attempting repair
-        if f_zero_or_nan.any(axis=1).sum() == 0:
+        f_prices_bad = f_prices_bad.to_numpy()
+        f_bad_rows = f_prices_bad.any(axis=1) | f_vol_bad
+        if not f_bad_rows.any():
+            if debug:
+                print("no bad data to repair")
            return df
-        if f_zero_or_nan.sum() == len(price_cols)*len(df2):
+        if f_prices_bad.sum() == len(price_cols)*len(df2):
             # Need some good data to calibrate
-            return df
-        # - avoid repair if many zeroes/NaNs
-        pct_zero_or_nan = f_zero_or_nan.sum() / (len(price_cols)*len(df2))
-        if f_zero_or_nan.any(axis=1).sum()>2 and pct_zero_or_nan > 0.05:
+            if debug:
+                print("no good data to calibrate")
             return df
 
         data_cols = price_cols + ["Volume"]
@@ -1166,17 +1262,31 @@ def _fix_zeroes(self, df, interval, tz_exchange):
         tag = -1.0
         for i in range(len(price_cols)):
             c = price_cols[i]
-            df2.loc[f_zero_or_nan[:,i], c] = tag
+            df2.loc[f_prices_bad[:,i], c] = tag
+        df2.loc[f_vol_bad, "Volume"] = tag
         # If volume=0 or NaN for bad prices, then tag volume for repair
-        df2.loc[f_zero_or_nan.any(axis=1) & (df2["Volume"]==0), "Volume"] = tag
-        df2.loc[f_zero_or_nan.any(axis=1) & (df2["Volume"].isna()), "Volume"] = tag
+        f_vol_zero_or_nan = (df2["Volume"].to_numpy()==0) | (df2["Volume"].isna().to_numpy())
+        df2.loc[f_prices_bad.any(axis=1) & f_vol_zero_or_nan, "Volume"] = tag
+        # If volume=0 or NaN but price moved in interval, then tag volume for repair
+        f_change = df2["High"].to_numpy() != df2["Low"].to_numpy()
+        df2.loc[f_change & f_vol_zero_or_nan, "Volume"] = tag
 
         n_before = (df2[data_cols].to_numpy()==tag).sum()
-        df2 = self._reconstruct_intervals_batch(df2, interval, tag=tag)
+        dts_tagged = df2.index[(df2[data_cols].to_numpy()==tag).any(axis=1)]
+        df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag, silent)
         n_after = (df2[data_cols].to_numpy()==tag).sum()
+        dts_not_repaired = df2.index[(df2[data_cols].to_numpy()==tag).any(axis=1)]
         n_fixed = n_before - n_after
-        if n_fixed > 0:
-            print("{}: fixed {} price=0.0 errors in {} price data".format(self.ticker, n_fixed, interval))
+        if not silent and n_fixed > 0:
+            msg = f"{self.ticker}: fixed {n_fixed}/{n_before} value=0 errors in {interval} price data"
+            if n_fixed < 4:
+                dts_repaired = sorted(list(set(dts_tagged).difference(dts_not_repaired)))
+                msg += f": {dts_repaired}"
+            print(msg)
+
+        if df2_reserve is not None:
+            df2 = _pd.concat([df2, df2_reserve])
+            df2 = df2.sort_index()
 
         # Restore original values where repair failed (i.e. remove tag values)
         f = df2[data_cols].values==tag
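
For reference, a minimal sketch of how the reworked 'repair' argument above is meant to be called (ticker, period and interval here are arbitrary examples):

    import yfinance as yf

    dat = yf.Ticker("PNL.L")

    # repair=True: fix zero/NaN prices, bad volumes and 100x unit mixups,
    # and print a one-line summary of what was repaired
    df = dat.history(period="1mo", interval="1d", repair=True)

    # repair="silent": apply the same repairs without printing
    df_quiet = dat.history(period="1mo", interval="1d", repair="silent")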