Skip to content

Commit

Permalink
fix running in jupyter notebook, switch back to selenium
Browse files Browse the repository at this point in the history
  • Loading branch information
gdifiore committed Jul 31, 2024
1 parent b1b9131 commit 0b7284e
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 30 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,5 @@ venv.bak/
# Custom
NOTES
test.*
*.lock
*.lock
.pyball_cache/
18 changes: 16 additions & 2 deletions pyball/playerid_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
# Description: File containing functions to obtain player (id) information
# on various statistic sites from a lookup table.

from functools import wraps
import io
import re
import diskcache
import zipfile
from functools import lru_cache
import unicodedata
import logging
import pandas as pd
Expand All @@ -18,6 +19,19 @@
logger = logging.getLogger(__name__)


cache = diskcache.Cache('./.pyball_cache')

# Sentinel distinguishing "no cache entry" from a cached None result.
_CACHE_MISS = object()

def disk_cache(func):
    """Decorator that memoizes results of *func* in the module-level disk cache.

    The cache key is built from the function name and the repr of its
    positional and keyword arguments, so arguments must have stable reprs
    for lookups to hit.

    Note: kwargs order affects the key string, so calls differing only in
    keyword-argument order are cached separately.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        key = f"{func.__name__}:{args}:{kwargs}"
        # Use a sentinel default: the original `is None` miss-check meant a
        # function that legitimately returned None was never cached and was
        # re-executed on every call.
        result = cache.get(key, default=_CACHE_MISS)
        if result is _CACHE_MISS:
            result = func(*args, **kwargs)
            cache.set(key, result)
        return result
    return wrapper

class PlayerLookup:
"""
A class for looking up player information in the registry.
Expand Down Expand Up @@ -54,7 +68,7 @@ def _compile_player_data(zip_archive: zipfile.ZipFile) -> pd.DataFrame:
return pd.concat(dataframes, axis=0)

@staticmethod
@lru_cache(maxsize=1)
@disk_cache
def fetch_chadwick_data() -> pd.DataFrame:
"""
Fetches and processes player data from the Chadwick Register.
Expand Down
58 changes: 33 additions & 25 deletions pyball/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,21 @@
import hashlib
import diskcache
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

cache = diskcache.Cache('./.pyball_cache')



def fetch_url_content(url, cache_time=86400):
"""
Function to read a url and return the BeautifulSoup object, using disk cache when available
Function to read a URL and return the BeautifulSoup object, using disk cache when available
"""
# Create a unique key for this URL
url_hash = hashlib.md5(url.encode()).hexdigest()
Expand All @@ -31,44 +37,46 @@ def fetch_url_content(url, cache_time=86400):

# If no valid cache, fetch the content
print("Fetching from URL")
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
html = None
try:
page.goto(url, wait_until="networkidle", timeout=60000)

# Specific handling for different sites
if "baseball-reference.com" in url:
page.wait_for_selector('div#inner_nav', timeout=30000)
elif "baseballsavant" in url:
page.wait_for_selector("div.pitchingBreakdown table#detailedPitches", timeout=30000)

html = page.content()
except PlaywrightTimeoutError:
html = page.content()
finally:
browser.close()
options = Options()
options.add_argument("--headless")
service = Service()
driver = webdriver.Chrome(service=service, options=options)

try:
driver.get(url)

# Specific handling for different sites
if "baseball-reference.com" in url:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#inner_nav')))
elif "baseballsavant" in url:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pitchingBreakdown table#detailedPitches")))
else:
# Default wait for network idle
time.sleep(10) # Simple wait as Selenium doesn't have a built-in "networkidle" equivalent

html = driver.page_source
except TimeoutException:
html = driver.page_source
finally:
driver.quit()

if html:
# Cache the new content
cache.set(url_hash, (time.time(), html))
cache[url_hash] = (time.time(), html)
return BeautifulSoup(html, "html.parser")
else:
return None


def read_url(url):
    """
    Function to read a URL, using cache when available.

    Returns a BeautifulSoup object on success; on any failure the error is
    printed and None is returned (errors are never propagated to callers).
    """
    try:
        soup = fetch_url_content(url)
    except Exception as exc:
        # Best-effort wrapper: report and swallow, mirroring the file's
        # print-based error reporting.
        print(f"Error fetching URL: {exc}")
        return None
    return soup


def make_bbref_player_url(bbref_key):
"""
Function to generate baseball-reference url from bbref_key
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ python = "^3.10.12"
pandas = "^1.3.4"
numpy = "1.26.4"
bs4 = "^0.0.1"
requests = "^2.26.0"
playwright = "^1.45.0"
lxml = "^5.2.2"
diskcache = "^5.6.3"
requests = "^2.32.3"
selenium = "^4.23.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.1"
Expand Down

0 comments on commit 0b7284e

Please sign in to comment.