Skip to content

Commit

Permalink
fix running in jupyter notebook, switch back to selenium
Browse files Browse the repository at this point in the history
  • Loading branch information
gdifiore committed Jul 31, 2024
1 parent b1b9131 commit 0b7284e
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 30 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,5 @@ venv.bak/
# Custom
NOTES
test.*
*.lock
*.lock
.pyball_cache/
18 changes: 16 additions & 2 deletions pyball/playerid_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
# Description: File containing functions to obtain player (id) information
# on various statistic sites from a lookup table.

from functools import wraps
import io
import re
import diskcache
import zipfile
from functools import lru_cache
import unicodedata
import logging
import pandas as pd
Expand All @@ -18,6 +19,19 @@
logger = logging.getLogger(__name__)


cache = diskcache.Cache('./.pyball_cache')

# Sentinel distinguishing "no cache entry" from a cached None result.
_CACHE_MISS = object()

def disk_cache(func):
    """Decorator that memoizes results of *func* in the module-level disk cache.

    The cache key is built from the function name and the repr of its
    positional and keyword arguments, so arguments must have stable reprs
    for lookups to hit.

    Note: kwargs order affects the key string, so calls differing only in
    keyword-argument order are cached separately.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        key = f"{func.__name__}:{args}:{kwargs}"
        # Use a sentinel default: the original `is None` miss-check meant a
        # function that legitimately returned None was never cached and was
        # re-executed on every call.
        result = cache.get(key, default=_CACHE_MISS)
        if result is _CACHE_MISS:
            result = func(*args, **kwargs)
            cache.set(key, result)
        return result
    return wrapper

class PlayerLookup:
"""
A class for looking up player information in the registry.
Expand Down Expand Up @@ -54,7 +68,7 @@ def _compile_player_data(zip_archive: zipfile.ZipFile) -> pd.DataFrame:
return pd.concat(dataframes, axis=0)

@staticmethod
@lru_cache(maxsize=1)
@disk_cache
def fetch_chadwick_data() -> pd.DataFrame:
"""
Fetches and processes player data from the Chadwick Register.
Expand Down
58 changes: 33 additions & 25 deletions pyball/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,21 @@
import hashlib
import diskcache
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

cache = diskcache.Cache('./.pyball_cache')



def fetch_url_content(url, cache_time=86400):
"""
Function to read a url and return the BeautifulSoup object, using disk cache when available
Function to read a URL and return the BeautifulSoup object, using disk cache when available
"""
# Create a unique key for this URL
url_hash = hashlib.md5(url.encode()).hexdigest()
Expand All @@ -31,44 +37,46 @@ def fetch_url_content(url, cache_time=86400):

# If no valid cache, fetch the content
print("Fetching from URL")
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
html = None
try:
page.goto(url, wait_until="networkidle", timeout=60000)

# Specific handling for different sites
if "baseball-reference.com" in url:
page.wait_for_selector('div#inner_nav', timeout=30000)
elif "baseballsavant" in url:
page.wait_for_selector("div.pitchingBreakdown table#detailedPitches", timeout=30000)

html = page.content()
except PlaywrightTimeoutError:
html = page.content()
finally:
browser.close()
options = Options()
options.add_argument("--headless")
service = Service()
driver = webdriver.Chrome(service=service, options=options)

try:
driver.get(url)

# Specific handling for different sites
if "baseball-reference.com" in url:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#inner_nav')))
elif "baseballsavant" in url:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pitchingBreakdown table#detailedPitches")))
else:
# Default wait for network idle
time.sleep(10) # Simple wait as Selenium doesn't have a built-in "networkidle" equivalent

html = driver.page_source
except TimeoutException:
html = driver.page_source
finally:
driver.quit()

if html:
# Cache the new content
cache.set(url_hash, (time.time(), html))
cache[url_hash] = (time.time(), html)
return BeautifulSoup(html, "html.parser")
else:
return None


def read_url(url):
    """
    Function to read a URL, using cache when available.

    Returns a BeautifulSoup object on success; on any failure the error is
    printed and None is returned (errors are never propagated to callers).
    """
    try:
        soup = fetch_url_content(url)
    except Exception as exc:
        # Best-effort wrapper: report and swallow, mirroring the file's
        # print-based error reporting.
        print(f"Error fetching URL: {exc}")
        return None
    return soup


def make_bbref_player_url(bbref_key):
"""
Function to generate baseball-reference url from bbref_key
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ python = "^3.10.12"
pandas = "^1.3.4"
numpy = "1.26.4"
bs4 = "^0.0.1"
requests = "^2.26.0"
playwright = "^1.45.0"
lxml = "^5.2.2"
diskcache = "^5.6.3"
requests = "^2.32.3"
selenium = "^4.23.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.1"
Expand Down

0 comments on commit 0b7284e

Please sign in to comment.