Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add decrypt key extraction from JS + GitHub backup #1353

Merged
merged 2 commits into from
Jan 26, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 107 additions & 56 deletions yfinance/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import requests as requests
import re
from bs4 import BeautifulSoup

from frozendict import frozendict

Expand Down Expand Up @@ -46,67 +47,38 @@ def wrapped(*args, **kwargs):
return wrapped


def decrypt_cryptojs_aes_stores(data):
def _extract_extra_keys_from_stores(data):
new_keys = [k for k in data.keys() if k not in ["context", "plugins"]]
new_keys_values = set([data[k] for k in new_keys])

# Maybe multiple keys have same value - keep one of each
new_keys_uniq = []
new_keys_uniq_values = set()
for k in new_keys:
v = data[k]
if not v in new_keys_uniq_values:
new_keys_uniq.append(k)
new_keys_uniq_values.add(v)

return new_keys_uniq


def decrypt_cryptojs_aes_stores(data, keys=None):
encrypted_stores = data['context']['dispatcher']['stores']

password = None
candidate_passwords = []
if keys is not None:
if not isinstance(keys, list):
raise TypeError("'keys' must be list")
candidate_passwords = keys
else:
candidate_passwords = []

if "_cs" in data and "_cr" in data:
_cs = data["_cs"]
_cr = data["_cr"]
_cr = b"".join(int.to_bytes(i, length=4, byteorder="big", signed=True) for i in json.loads(_cr)["words"])
password = hashlib.pbkdf2_hmac("sha1", _cs.encode("utf8"), _cr, 1, dklen=32).hex()
else:
# Currently assume one extra key in dict, which is password. Print error if
# more extra keys detected.
new_keys = [k for k in data.keys() if k not in ["context", "plugins"]]
new_keys_values = set([data[k] for k in new_keys])

# Maybe multiple keys have same value - keep one of each
new_keys2 = []
new_keys2_values = set()
for k in new_keys:
v = data[k]
if not v in new_keys2_values:
new_keys2.append(k)
new_keys2_values.add(v)

l = len(new_keys)
if l == 0:
return None
elif l == 1 and isinstance(data[new_keys[0]], str):
password_key = new_keys[0]
# else:
# msg = "Yahoo has again changed data format, yfinance now unsure which key(s) is for decryption:"
# new_keys_pretty = {}
# l = min(10, len(new_keys))
# for i in range(0, l):
# k = new_keys[i]
# k_str = k if len(k) < 32 else k[:32-3]+"..."
# v = data[k]
# v_type = type(v)
# v_str = str(v)
# if len(v_str) > 256:
# v_str = v_str[:256]+"..."
# new_keys_pretty[k_str] = f"{v_str}' ({v_type})"
# for k in new_keys_pretty:
# msg += '\n' + f"'{k}' -> '{new_keys_pretty[k]}'"
# if len(new_keys) > l:
# d = len(new_keys) - l
# msg += '\n' + "..."
# msg += '\n' + f"{d} more options!"
# raise Exception(msg)
# password_key = new_keys[0]
# password = data[password_key]

# The above attempt to smartly pick out decryption key has stopped working.
# Fortunately the keys Yahoo use are currently hardcoded in their JSON:
candidate_passwords += ["ad4d90b3c9f2e1d156ef98eadfa0ff93e4042f6960e54aa2a13f06f528e6b50ba4265a26a1fd5b9cd3db0d268a9c34e1d080592424309429a58bce4adc893c87", \
"e9a8ab8e5620b712ebc2fb4f33d5c8b9c80c0d07e8c371911c785cf674789f1747d76a909510158a7b7419e86857f2d7abbd777813ff64840e4cbc514d12bcae",
"6ae2523aeafa283dad746556540145bf603f44edbf37ad404d3766a8420bb5eb1d3738f52a227b88283cca9cae44060d5f0bba84b6a495082589f5fe7acbdc9e",
"3365117c2a368ffa5df7313a4a84988f73926a86358e8eea9497c5ff799ce27d104b68e5f2fbffa6f8f92c1fef41765a7066fa6bcf050810a9c4c7872fd3ebf0"]

# candidate_passwords += [data[k] for k in new_keys] # don't do these, none work

encrypted_stores = b64decode(encrypted_stores)
assert encrypted_stores[0:8] == b"Salted__"
Expand Down Expand Up @@ -187,7 +159,7 @@ def _decrypt(encrypted_stores, password, key, iv):
except:
pass
if not success:
raise Exception("yfinance failed to decrypt Yahoo data response with hardcoded keys, contact developers")
raise Exception("yfinance failed to decrypt Yahoo data response")

decoded_stores = json.loads(plaintext)
return decoded_stores
Expand Down Expand Up @@ -230,6 +202,66 @@ def _get_proxy(self, proxy):
proxy = {"https": proxy}
return proxy

def _get_decryption_keys_from_yahoo_js(self, soup):
result = None

key_count = 4
re_script = soup.find("script", string=re.compile("root.App.main")).text
re_data = json.loads(re.search("root.App.main\s+=\s+(\{.*\})", re_script).group(1))
re_data.pop("context", None)
key_list = list(re_data.keys())
if re_data.get("plugins"): # 1) attempt to get last 4 keys after plugins
ind = key_list.index("plugins")
if len(key_list) > ind+1:
sub_keys = key_list[ind+1:]
if len(sub_keys) == key_count:
re_obj = {}
missing_val = False
for k in sub_keys:
if not re_data.get(k):
missing_val = True
break
re_obj.update({k: re_data.get(k)})
if not missing_val:
result = re_obj

if not result is None:
return [''.join(result.values())]

re_keys = [] # 2) attempt scan main.js file approach to get keys
prefix = "https://s.yimg.com/uc/finance/dd-site/js/main."
tags = [tag['src'] for tag in soup.find_all('script') if prefix in tag.get('src', '')]
for t in tags:
response_js = self.cache_get(t)
#
if response_js.status_code != 200:
time.sleep(random.randrange(10, 20))
response_js.close()
else:
r_data = response_js.content.decode("utf8")
re_list = [
x.group() for x in re.finditer(r"context.dispatcher.stores=JSON.parse((?:.*?\r?\n?)*)toString", r_data)
]
for rl in re_list:
re_sublist = [x.group() for x in re.finditer(r"t\[\"((?:.*?\r?\n?)*)\"\]", rl)]
if len(re_sublist) == key_count:
re_keys = [sl.replace('t["', '').replace('"]', '') for sl in re_sublist]
break
response_js.close()
if len(re_keys) == key_count:
break
re_obj = {}
missing_val = False
for k in re_keys:
if not re_data.get(k):
missing_val = True
break
re_obj.update({k: re_data.get(k)})
if not missing_val:
return [''.join(re_obj.values())]

return []

@lru_cache_freezeargs
@lru_cache(maxsize=cache_maxsize)
def get_json_data_stores(self, sub_page: str = None, proxy=None) -> dict:
Expand All @@ -241,7 +273,8 @@ def get_json_data_stores(self, sub_page: str = None, proxy=None) -> dict:
else:
ticker_url = "{}/{}".format(_SCRAPE_URL_, self.ticker)

html = self.get(url=ticker_url, proxy=proxy).text
response = self.get(url=ticker_url, proxy=proxy)
html = response.text

# The actual json-data for stores is in a javascript assignment in the webpage
try:
Expand All @@ -253,7 +286,25 @@ def get_json_data_stores(self, sub_page: str = None, proxy=None) -> dict:

data = json.loads(json_str)

stores = decrypt_cryptojs_aes_stores(data)
# Gather decryption keys:
soup = BeautifulSoup(response.content, "html.parser")
keys = self._get_decryption_keys_from_yahoo_js(soup)
if len(keys) == 0:
msg = "No decryption keys could be extracted from JS file."
if "requests_cache" in str(type(response)):
msg += " Try flushing your 'requests_cache', probably parsing old JS."
print("WARNING: " + msg + " Falling back to backup decrypt methods.")
if len(keys) == 0:
keys_url = "https://github.com/ranaroussi/yfinance/raw/main/yfinance/scrapers/yahoo-keys.txt"
response_gh = self.cache_get(keys_url)
keys = response_gh.text.splitlines()
extra_keys = _extract_extra_keys_from_stores(data)
if len(extra_keys) < 10:
# Only brute-force with these extra keys if few
keys += extra_keys

# Decrypt!
stores = decrypt_cryptojs_aes_stores(data, keys)
if stores is None:
# Maybe Yahoo returned old format, not encrypted
if "context" in data and "dispatcher" in data["context"]:
Expand Down