Add 3.10 and 3.11 and remove 3.6 and 3.7 from test versions #296

Merged (2 commits) on Jan 31, 2023
8 changes: 4 additions & 4 deletions .github/workflows/pytest.yml
@@ -12,10 +12,10 @@ jobs:
     strategy:
       matrix:
         python-version:
-          - 3.6
-          - 3.7
-          - 3.8
-          - 3.9
+          - "3.8"
+          - "3.9"
+          - "3.10"
+          - "3.11"
     name: Install dependencies and test
     runs-on: ubuntu-latest
     steps:
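A note on the quoting in the new matrix entries: a bare YAML scalar with a trailing zero, like 3.10, parses as the float 3.1, so an unquoted entry would ask setup-python for "3.1" rather than "3.10". A quick sketch (assuming PyYAML is available) shows the truncation the quotes prevent:

    import yaml  # PyYAML

    # The unquoted entry collapses to the float 3.1; the quoted entry
    # survives as the string "3.10" that actions/setup-python expects.
    print(yaml.safe_load('python-version: [3.10, "3.10"]'))
    # {'python-version': [3.1, '3.10']}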
2 changes: 1 addition & 1 deletion Makefile
@@ -3,7 +3,7 @@ LOG_LEVEL := INFO
 MYPY_RUN_AGAINST_DEFAULT := *.py pybaseball tests
 MYPY_RUN_AGAINST := $(MYPY_RUN_AGAINST_DEFAULT)
 ONLY_MODIFIED := 1
-TEST_RUN_AGAINST := tests
+TEST_RUN_AGAINST := tests/pybaseball
 TEST_FLAGS := -n auto
2 changes: 1 addition & 1 deletion mypy.ini
@@ -1,5 +1,5 @@
 [mypy]
-python_version = 3.6
+python_version = 3.8
 warn_return_any = True
 warn_unused_configs = True
 disallow_untyped_defs = True
6 changes: 4 additions & 2 deletions pybaseball/amateur_draft.py
@@ -1,15 +1,17 @@
 import pandas as pd
-import requests
 
 from . import cache
+from .datasources.bref import BRefSession
+
+session = BRefSession()
 
 # pylint: disable=line-too-long
 _URL = "https://www.baseball-reference.com/draft/?year_ID={year}&draft_round={draft_round}&draft_type=junreg&query_type=year_round&"
 
 
 def get_draft_results(year: int, draft_round: int) -> pd.DataFrame:
     url = _URL.format(year=year, draft_round=draft_round)
-    res = requests.get(url, timeout=None).content
+    res = session.get(url, timeout=None).content
     draft_results = pd.read_html(res)
     return draft_results
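Both draft modules now route their requests through the shared session, so back-to-back calls are throttled automatically. A hedged usage sketch (the year and round are arbitrary examples; per this diff, the function returns whatever pd.read_html parses from the page):

    from pybaseball.amateur_draft import get_draft_results

    # First round of the 2019 draft; the BRefSession singleton spaces this
    # request out from any other Baseball-Reference call in the process.
    round_one = get_draft_results(2019, 1)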
6 changes: 4 additions & 2 deletions pybaseball/amateur_draft_by_team.py
@@ -1,15 +1,17 @@
 import pandas as pd
-import requests
 
 from . import cache
+from .datasources.bref import BRefSession
+
+session = BRefSession()
 
 # pylint: disable=line-too-long
 _URL = "https://www.baseball-reference.com/draft/?team_ID={team}&year_ID={year}&draft_type=junreg&query_type=franch_year"
 
 
 def get_draft_results(team: str, year: int) -> pd.DataFrame:
     url = _URL.format(team=team, year=year)
-    res = requests.get(url, timeout=None).content
+    res = session.get(url, timeout=None).content
     draft_results = pd.read_html(res)
     return pd.concat(draft_results)
30 changes: 30 additions & 0 deletions pybaseball/data/fangraphs_teams.csv
@@ -2904,3 +2904,33 @@
 2902,2020,NL,SFN,SFG,30,SFG,SFN
 2903,2020,NL,SLN,STL,28,STL,SLN
 2904,2020,NL,WAS,WSN,24,WSN,WAS
+2905,2021,AL,BAL,BAL,2,BAL,BAL
+2906,2021,AL,BOS,BOS,3,BOS,BOS
+2907,2021,AL,CHA,CHW,4,CHW,CHA
+2908,2021,AL,CLE,CLE,5,CLE,CLE
+2909,2021,AL,DET,DET,6,DET,DET
+2910,2021,AL,HOU,HOU,21,HOU,HOU
+2911,2021,AL,KCA,KCR,7,KCR,KCA
+2912,2021,AL,LAA,ANA,1,LAA,ANA
+2913,2021,AL,MIN,MIN,8,MIN,MIN
+2914,2021,AL,NYA,NYY,9,NYY,NYA
+2915,2021,AL,OAK,OAK,10,OAK,OAK
+2916,2021,AL,SEA,SEA,11,SEA,SEA
+2917,2021,AL,TBA,TBD,12,TBR,TBA
+2918,2021,AL,TEX,TEX,13,TEX,TEX
+2919,2021,AL,TOR,TOR,14,TOR,TOR
+2920,2021,NL,ARI,ARI,15,ARI,ARI
+2921,2021,NL,ATL,ATL,16,ATL,ATL
+2922,2021,NL,CHN,CHC,17,CHC,CHN
+2923,2021,NL,CIN,CIN,18,CIN,CIN
+2924,2021,NL,COL,COL,19,COL,COL
+2925,2021,NL,LAN,LAD,22,LAD,LAN
+2926,2021,NL,MIA,FLA,20,MIA,MIA
+2927,2021,NL,MIL,MIL,23,MIL,MIL
+2928,2021,NL,NYN,NYM,25,NYM,NYN
+2929,2021,NL,PHI,PHI,26,PHI,PHI
+2930,2021,NL,PIT,PIT,27,PIT,PIT
+2931,2021,NL,SDN,SDP,29,SDP,SDN
+2932,2021,NL,SFN,SFG,30,SFG,SFN
+2933,2021,NL,SLN,STL,28,STL,SLN
+2934,2021,NL,WAS,WSN,24,WSN,WAS
35 changes: 35 additions & 0 deletions pybaseball/datasources/bref.py
@@ -0,0 +1,35 @@
+import datetime
+from time import sleep
+from typing import Any, Optional
+
+import requests
+
+from ..datahelpers import singleton
+
+
+class BRefSession(singleton.Singleton):
+    """
+    This is needed because Baseball Reference has rules against bots.
+
+    Current policy says no more than 20 requests per minute, but in testing
+    anything more than 10 requests per minute gets you blocked for one hour.
+
+    So this global session will prevent a user from getting themselves blocked.
+    """
+
+    def __init__(self, max_requests_per_minute: int = 10) -> None:
+        self.max_requests_per_minute = max_requests_per_minute
+        self.last_request: Optional[datetime.datetime] = None
+        self.session = requests.Session()
+
+    def get(self, url: str, **kwargs: Any) -> requests.Response:
+        if self.last_request:
+            delta = datetime.datetime.now() - self.last_request
+            sleep_length = (60 / self.max_requests_per_minute) - delta.total_seconds()
+            if sleep_length > 0:
+                sleep(sleep_length)
+
+        self.last_request = datetime.datetime.now()
+
+        return self.session.get(url, **kwargs)
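A minimal sketch of the throttle's arithmetic, assuming the singleton.Singleton base class makes every BRefSession() construction return the same instance (which is what lets each module safely create its own module-level "session"):

    from pybaseball.datasources.bref import BRefSession

    session = BRefSession()  # default cap: 10 requests per minute
    first = session.get("https://www.baseball-reference.com/data/war_daily_bat.txt")
    # The second call sleeps until 60 / 10 = 6 seconds have passed since the
    # first, keeping the whole process under Baseball-Reference's bot limits.
    second = session.get("https://www.baseball-reference.com/data/war_daily_pitch.txt")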

10 changes: 6 additions & 4 deletions pybaseball/league_batting_stats.py
@@ -3,11 +3,13 @@
 from typing import Optional
 
 import pandas as pd
-import requests
 from bs4 import BeautifulSoup
 
 from . import cache
 from .utils import most_recent_season, sanitize_date_range
+from .datasources.bref import BRefSession
+
+session = BRefSession()
 
 
 def get_soup(start_dt: date, end_dt: date) -> BeautifulSoup:
@@ -16,7 +18,7 @@ def get_soup(start_dt: date, end_dt: date) -> BeautifulSoup:
     # print('Error: a date range needs to be specified')
     # return None
     url = "http://www.baseball-reference.com/leagues/daily.cgi?user_team=&bust_cache=&type=b&lastndays=7&dates=fromandto&fromandto={}.{}&level=mlb&franch=&stat=&stat_value=0".format(start_dt, end_dt)
-    s = requests.get(url).content
+    s = session.get(url).content
     # a workaround to avoid beautiful soup applying the wrong encoding
     s = str(s).encode()
     return BeautifulSoup(s, features="lxml")
@@ -67,7 +69,7 @@ def batting_stats_range(start_dt: Optional[str] = None, end_dt: Optional[str] =
         #table[column] = table[column].astype('float')
         table[column] = pd.to_numeric(table[column])
         #table['column'] = table['column'].convert_objects(convert_numeric=True)
-    table = table.drop('', 1)
+    table = table.drop('', axis=1)
     return table
@@ -91,7 +93,7 @@ def bwar_bat(return_all: bool = False) -> pd.DataFrame:
     To get all fields from this table, supply argument return_all=True.
     """
     url = "http://www.baseball-reference.com/data/war_daily_bat.txt"
-    s = requests.get(url).content
+    s = session.get(url).content
     c=pd.read_csv(io.StringIO(s.decode('utf-8')))
     if return_all:
         return c
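The drop() change here is not just style: pandas deprecated passing axis positionally during the 1.x line and pandas 2.0 removed it, so the keyword form is what keeps this module working on the newer pandas pulled in by the 3.10/3.11 jobs. A small standalone sketch of the same call:

    import pandas as pd

    table = pd.DataFrame({"": ["x"], "Name": ["Mike Trout"], "HR": [45]})
    # Drops the unnamed column; table.drop("", 1) warns or fails on modern pandas.
    table = table.drop("", axis=1)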
38 changes: 21 additions & 17 deletions pybaseball/league_pitching_stats.py
@@ -1,32 +1,36 @@
 from datetime import date
 import io
 from typing import Optional, Union
 
 import numpy as np
 import pandas as pd
-import requests
 from bs4 import BeautifulSoup
 
 from . import cache
 from .utils import most_recent_season, sanitize_date_range
+from .datasources.bref import BRefSession
 
+session = BRefSession()
 
-def get_soup(start_dt, end_dt):
+
+def get_soup(start_dt: Optional[Union[date, str]], end_dt: Optional[Union[date, str]]) -> BeautifulSoup:
     # get most recent standings if date not specified
     if((start_dt is None) or (end_dt is None)):
         print('Error: a date range needs to be specified')
         return None
     url = "http://www.baseball-reference.com/leagues/daily.cgi?user_team=&bust_cache=&type=p&lastndays=7&dates=fromandto&fromandto={}.{}&level=mlb&franch=&stat=&stat_value=0".format(start_dt, end_dt)
-    s = requests.get(url).content
+    s = session.get(url).content
     # a workaround to avoid beautiful soup applying the wrong encoding
     s = str(s).encode()
     return BeautifulSoup(s, features="lxml")
 
 
-def get_table(soup):
+def get_table(soup: BeautifulSoup) -> pd.DataFrame:
     table = soup.find_all('table')[0]
-    data = []
+    raw_data = []
     headings = [th.get_text() for th in table.find("tr").find_all("th")][1:]
     headings.append("mlbID")
-    data.append(headings)
+    raw_data.append(headings)
     table_body = table.find('tbody')
     rows = table_body.find_all('tr')
     for row in rows:
@@ -35,15 +39,15 @@ def get_table(soup):
         mlbid = row_anchor["href"].split("mlb_ID=")[-1] if row_anchor else pd.NA # ID str or nan
         cols = [ele.text.strip() for ele in cols]
         cols.append(mlbid)
-        data.append([ele for ele in cols])
-    data = pd.DataFrame(data)
+        raw_data.append([ele for ele in cols])
+    data = pd.DataFrame(raw_data)
     data = data.rename(columns=data.iloc[0])
     data = data.reindex(data.index.drop(0))
     return data
 
 
 @cache.df_cache()
-def pitching_stats_range(start_dt=None, end_dt=None):
+def pitching_stats_range(start_dt: Optional[str]=None, end_dt: Optional[str]=None) -> pd.DataFrame:
     """
     Get all pitching stats for a set time range. This can be the past week, the
     month of August, anything. Just supply the start and end date in YYYY-MM-DD
@@ -71,30 +75,30 @@ def pitching_stats_range(start_dt=None, end_dt=None):
     for column in ['Str', 'StL', 'StS', 'GB/FB', 'LD', 'PU']:
         table[column] = table[column].replace('%','',regex=True).astype('float')/100
 
-    table = table.drop('',1)
+    table = table.drop('', axis=1)
     return table
 
-def pitching_stats_bref(season=None):
+def pitching_stats_bref(season: Optional[int]=None) -> pd.DataFrame:
     """
     Get all pitching stats for a set season. If no argument is supplied, gives stats for
     current season to date.
     """
     if season is None:
         season = most_recent_season()
-    season = str(season)
-    start_dt = season + '-03-01' #opening day is always late march or early april
-    end_dt = season + '-11-01' #season is definitely over by November
+    str_season = str(season)
+    start_dt = str_season + '-03-01' #opening day is always late march or early april
+    end_dt = str_season + '-11-01' #season is definitely over by November
     return(pitching_stats_range(start_dt, end_dt))
 
 
-def bwar_pitch(return_all=False):
+def bwar_pitch(return_all: bool=False) -> pd.DataFrame:
     """
     Get data from war_daily_pitch table. Returns WAR, its components, and a few other useful stats.
     To get all fields from this table, supply argument return_all=True.
     """
     url = "http://www.baseball-reference.com/data/war_daily_pitch.txt"
-    s = requests.get(url).content
-    c=pd.read_csv(io.StringIO(s.decode('utf-8')))
+    s = session.get(url).content
+    c = pd.read_csv(io.StringIO(s.decode('utf-8')))
     if return_all:
         return c
     else:
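The str_season rename is what lets the new annotations pass mypy: rebinding season, an Optional[int], to a str changes the variable's type mid-function, which strict checking rejects. A minimal reproduction of the pattern being avoided (the function name is illustrative only):

    from typing import Optional

    def season_start(season: Optional[int] = None) -> str:
        if season is None:
            season = 2022
        # season = str(season)  # mypy: incompatible types in assignment
        str_season = str(season)  # a fresh name keeps each variable one type
        return str_season + '-03-01'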
48 changes: 17 additions & 31 deletions pybaseball/split_stats.py
@@ -1,36 +1,22 @@
-import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import bs4 as bs
 import pandas as pd
 import re
 
+from .datasources.bref import BRefSession
 
-def download_url(url: str) -> bytes:
-    """
-    Gets the content from the url specified
-    """
-    session = requests.Session()
-    retry = Retry(connect=3, backoff_factor=0.5)
-    adapter = HTTPAdapter(max_retries=retry)
-    session.mount('http://', adapter)
-    session.mount('https://', adapter)
-    resp = session.get(url)
-    return resp.content
+session = BRefSession()
 
 
 def get_split_soup(playerid: str, year: Optional[int] = None, pitching_splits: bool = False) -> bs.BeautifulSoup:
     """
     gets soup for the player splits.
     """
     pitch_or_bat = 'p' if pitching_splits else 'b'
-    if year is None: # provides scores from yesterday if date is not provided
-        url = f"https://www.baseball-reference.com/players/split.fcgi?id={playerid}&year=Career&t={pitch_or_bat}"
-    else:
-        year = str(year)
-        url = f"https://www.baseball-reference.com/players/split.fcgi?id={playerid}&year={year}&t={pitch_or_bat}"
-    html = download_url(url)
+    str_year = 'Career' if year is None else str(year)
+    url = f"https://www.baseball-reference.com/players/split.fcgi?id={playerid}&year={str_year}&t={pitch_or_bat}"
+    html = session.get(url).content
     soup = bs.BeautifulSoup(html, 'lxml')
     return soup
@@ -43,8 +29,8 @@ def get_player_info(playerid: str, soup: bs.BeautifulSoup = None) -> Dict:
     if not soup:
         soup = get_split_soup(playerid)
     about_info = soup.find_all(
-        "div", {"itemtype": "https://schema.org/Person"})
-    info = [ele for ele in about_info]
+        "div", {"class": "players"})
+    info: List[bs.BeautifulSoup] = [ele for ele in about_info]
     fv = []
     # This for loop goes through the player bio section at the top of the splits page to find all of the <p> tags
     for i in range(len(info)):
@@ -79,8 +65,8 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
     soup = get_split_soup(playerid, year, pitching_splits)
     # the splits tables on the bbref site are all within an embedded comment. This finds all the comments
     comment = soup.find_all(text=lambda text: isinstance(text, bs.Comment))
-    data = []
-    level_data = []
+    raw_data = []
+    raw_level_data = []
     for i in range(len(comment)):
         commentsoup = bs.BeautifulSoup(comment[i], 'lxml')
         split_tables = commentsoup.find_all(
@@ -102,7 +88,7 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
                 level_headings.append('Player ID')
                 # singles data isn't included in the tables so this appends the column header
                 level_headings.append('1B')
-                level_data.append(level_headings)
+                raw_level_data.append(level_headings)
                 rows = splits[j].find_all('tr')
                 for row in rows:
                     if year == None: # The bbref tables for career splits have one extra preceding th column labeled 'I' that is not used and is not in the single season records
@@ -113,7 +99,7 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
                     if split_type != "By Inning": # bbref added three empty columns to the by inning tables that don't match the rest of the tables. Not including this split table in results
                         level_cols.append(split_type)
                         level_cols.append(playerid)
-                        level_data.append([ele for ele in level_cols])
+                        raw_level_data.append([ele for ele in level_cols])
             else:
                 if year == None: # The bbref tables for career splits have one extra preceding th column labeled 'I' that is not used and is not in the single season records
                     headings = [th.get_text()
@@ -125,7 +111,7 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
                 headings.append('Player ID')
                 # singles data isn't included in the tables so this appends the column header
                 headings.append('1B')
-                data.append(headings)
+                raw_data.append(headings)
                 rows = splits[j].find_all('tr')
                 for row in rows:
                     if year == None: # The bbref tables for career splits have one extra preceding th column labeled 'I' that is not used and is not in the single season records
@@ -136,9 +122,9 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
                     if split_type != "By Inning": # bbref added three empty columns to the by inning tables that don't match the rest of the tables. Not including this split table in results
                         cols.append(split_type)
                         cols.append(playerid)
-                        data.append([ele for ele in cols])
+                        raw_data.append([ele for ele in cols])
 
-    data = pd.DataFrame(data)
+    data = pd.DataFrame(raw_data)
     data = data.rename(columns=data.iloc[0])
     data = data.reindex(data.index.drop(0))
     data = data.set_index(['Player ID', 'Split Type', 'Split'])
@@ -148,7 +134,7 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
     data['1B'] = data['H']-data['2B']-data['3B']-data['HR']
     data = data.loc[playerid]
     if pitching_splits is True: # Returns Game Level tables as a second dataframe for pitching splits
-        level_data = pd.DataFrame(level_data)
+        level_data = pd.DataFrame(raw_level_data)
         level_data = level_data.rename(columns=level_data.iloc[0])
         level_data = level_data.reindex(level_data.index.drop(0))
        level_data = level_data.set_index(['Player ID', 'Split Type', 'Split'])
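With the bespoke download_url() helper gone, splits lookups also flow through the shared rate-limited session. A hedged usage sketch (the player IDs are illustrative; per the comment in this diff, pitching splits return the game-level tables as a second DataFrame):

    from pybaseball.split_stats import get_splits

    batting_splits = get_splits('troutmi01')  # career batting splits
    pitching_splits, game_level = get_splits('kershcl01', year=2021, pitching_splits=True)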